| author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
| --- | --- | --- |
| committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
| commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch) | |
| tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Transforms/Vectorize | |
| parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff) | |
| download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz | |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Transforms/Vectorize')
15 files changed, 4969 insertions, 4969 deletions
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 6ec5590d76..12f3203bd8 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -666,10 +666,10 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { cast<IntrinsicInst>(&I)->getIntrinsicID() == Intrinsic::sideeffect) { // Ignore llvm.sideeffect calls. - } else if (isa<IntrinsicInst>(&I) && - cast<IntrinsicInst>(&I)->getIntrinsicID() == - Intrinsic::pseudoprobe) { - // Ignore llvm.pseudoprobe calls. + } else if (isa<IntrinsicInst>(&I) && + cast<IntrinsicInst>(&I)->getIntrinsicID() == + Intrinsic::pseudoprobe) { + // Ignore llvm.pseudoprobe calls. } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'); @@ -766,8 +766,8 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { return Chain.slice(0, ChainIdx); } -static ChainID getChainID(const Value *Ptr) { - const Value *ObjPtr = getUnderlyingObject(Ptr); +static ChainID getChainID(const Value *Ptr) { + const Value *ObjPtr = getUnderlyingObject(Ptr); if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { // The select's themselves are distinct instructions even if they share the // same condition and evaluate to consecutive pointers for true and false @@ -834,7 +834,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Save the load locations. - const ChainID ID = getChainID(Ptr); + const ChainID ID = getChainID(Ptr); LoadRefs[ID].push_back(LI); } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (!SI->isSimple()) @@ -880,7 +880,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Save store location. - const ChainID ID = getChainID(Ptr); + const ChainID ID = getChainID(Ptr); StoreRefs[ID].push_back(SI); } } @@ -1031,8 +1031,8 @@ bool Vectorizer::vectorizeStoreChain( unsigned EltSzInBytes = Sz / 8; unsigned SzInBytes = EltSzInBytes * ChainSize; - FixedVectorType *VecTy; - auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy); + FixedVectorType *VecTy; + auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy); if (VecStoreTy) VecTy = FixedVectorType::get(StoreTy->getScalarType(), Chain.size() * VecStoreTy->getNumElements()); @@ -1184,7 +1184,7 @@ bool Vectorizer::vectorizeLoadChain( unsigned EltSzInBytes = Sz / 8; unsigned SzInBytes = EltSzInBytes * ChainSize; VectorType *VecTy; - auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy); + auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy); if (VecLoadTy) VecTy = FixedVectorType::get(LoadTy->getScalarType(), Chain.size() * VecLoadTy->getNumElements()); diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index b8c21a0e1c..e40cd652e5 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -13,16 +13,16 @@ // pass. It should be easy to create an analysis pass around it if there // is a need (but D45420 needs to happen first). 
// - + #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Transforms/Utils/SizeOpts.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" using namespace llvm; @@ -66,7 +66,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { return (Val <= 1); case HK_ISVECTORIZED: case HK_PREDICATE: - case HK_SCALABLE: + case HK_SCALABLE: return (Val == 0 || Val == 1); } return false; @@ -79,8 +79,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), - Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), - Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), + Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), + Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -93,8 +93,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, // If the vectorization width and interleaving count are both 1 then // consider the loop to have been already vectorized because there's // nothing more that we can do. - IsVectorized.Value = - getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1; + IsVectorized.Value = + getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1; LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); } @@ -167,7 +167,7 @@ void LoopVectorizeHints::emitRemarkWithHints() const { if (Force.Value == LoopVectorizeHints::FK_Enabled) { R << " (Force=" << NV("Force", true); if (Width.Value != 0) - R << ", Vector Width=" << NV("VectorWidth", getWidth()); + R << ", Vector Width=" << NV("VectorWidth", getWidth()); if (Interleave.Value != 0) R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); R << ")"; @@ -178,11 +178,11 @@ void LoopVectorizeHints::emitRemarkWithHints() const { } const char *LoopVectorizeHints::vectorizeAnalysisPassName() const { - if (getWidth() == ElementCount::getFixed(1)) + if (getWidth() == ElementCount::getFixed(1)) return LV_NAME; if (getForce() == LoopVectorizeHints::FK_Disabled) return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero()) + if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero()) return LV_NAME; return OptimizationRemarkAnalysis::AlwaysPrint; } @@ -233,8 +233,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, - &IsVectorized, &Predicate, &Scalable}; + Hint *Hints[] = {&Width, &Interleave, &Force, + &IsVectorized, &Predicate, &Scalable}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -419,11 +419,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : ValueToValueMap(); - Function *F = TheLoop->getHeader()->getParent(); - bool OptForSize = F->hasOptSize() || - llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI, - PGSOQueryType::IRPass); - bool CanAddPredicate = !OptForSize; + Function *F = TheLoop->getHeader()->getParent(); + bool OptForSize = F->hasOptSize() || + llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + bool CanAddPredicate = !OptForSize; int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false); if (Stride == 1 || Stride == -1) return Stride; @@ -435,7 +435,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) { } bool LoopVectorizationLegality::canVectorizeOuterLoop() { - assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop."); + assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop."); // Store the result and return it at the end instead of exiting early, in case // allowExtraAnalysis is used to report multiple reasons for not vectorizing. bool Result = true; @@ -779,7 +779,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // supported on the target. if (ST->getMetadata(LLVMContext::MD_nontemporal)) { // Arbitrarily try a vector of 2 elements. - auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); + auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); assert(VecTy && "did not find vectorized version of stored type"); if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { reportVectorizationFailure( @@ -794,7 +794,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (LD->getMetadata(LLVMContext::MD_nontemporal)) { // For nontemporal loads, check that a nontemporal vector version is // supported on the target (arbitrarily try a vector of 2 elements). - auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); + auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); assert(VecTy && "did not find vectorized version of load type"); if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { reportVectorizationFailure( @@ -923,9 +923,9 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, - SmallPtrSetImpl<const Instruction *> &MaskedOp, - SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const { + BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, + SmallPtrSetImpl<const Instruction *> &MaskedOp, + SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const { for (Instruction &I : *BB) { // Check that we don't have a constant expression that can trap as operand. for (Value *Operand : I.operands()) { @@ -941,19 +941,19 @@ bool LoopVectorizationLegality::blockCanBePredicated( continue; } - // Do not let llvm.experimental.noalias.scope.decl block the vectorization. - // TODO: there might be cases that it should block the vectorization. Let's - // ignore those for now. - if (isa<NoAliasScopeDeclInst>(&I)) - continue; - + // Do not let llvm.experimental.noalias.scope.decl block the vectorization. + // TODO: there might be cases that it should block the vectorization. Let's + // ignore those for now. + if (isa<NoAliasScopeDeclInst>(&I)) + continue; + // We might be able to hoist the load. 
if (I.mayReadFromMemory()) { auto *LI = dyn_cast<LoadInst>(&I); if (!LI) return false; if (!SafePtrs.count(LI->getPointerOperand())) { - MaskedOp.insert(LI); + MaskedOp.insert(LI); continue; } } @@ -1012,7 +1012,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { ScalarEvolution &SE = *PSE.getSE(); for (Instruction &I : *BB) { LoadInst *LI = dyn_cast<LoadInst>(&I); - if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) && + if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) && isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT)) SafePointers.insert(LI->getPointerOperand()); } @@ -1032,8 +1032,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointers, MaskedOp, - ConditionalAssumes)) { + if (!blockCanBePredicated(BB, SafePointers, MaskedOp, + ConditionalAssumes)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", @@ -1058,7 +1058,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // Helper function to canVectorizeLoopNestCFG. bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, bool UseVPlanNativePath) { - assert((UseVPlanNativePath || Lp->isInnermost()) && + assert((UseVPlanNativePath || Lp->isInnermost()) && "VPlan-native path is not enabled."); // TODO: ORE should be improved to show more accurate information when an @@ -1094,14 +1094,14 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, return false; } - // We currently must have a single "exit block" after the loop. Note that - // multiple "exiting blocks" inside the loop are allowed, provided they all - // reach the single exit block. - // TODO: This restriction can be relaxed in the near future, it's here solely - // to allow separation of changes for review. We need to generalize the phi - // update logic in a number of places. - if (!Lp->getUniqueExitBlock()) { - reportVectorizationFailure("The loop must have a unique exit block", + // We currently must have a single "exit block" after the loop. Note that + // multiple "exiting blocks" inside the loop are allowed, provided they all + // reach the single exit block. + // TODO: This restriction can be relaxed in the near future, it's here solely + // to allow separation of changes for review. We need to generalize the phi + // update logic in a number of places. + if (!Lp->getUniqueExitBlock()) { + reportVectorizationFailure("The loop must have a unique exit block", "loop control flow is not understood by vectorizer", "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) @@ -1159,7 +1159,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { // Specific checks for outer loops. We skip the remaining legal checks at this // point because they don't support outer loops. - if (!TheLoop->isInnermost()) { + if (!TheLoop->isInnermost()) { assert(UseVPlanNativePath && "VPlan-native path is not enabled."); if (!canVectorizeOuterLoop()) { @@ -1176,7 +1176,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return Result; } - assert(TheLoop->isInnermost() && "Inner loop expected."); + assert(TheLoop->isInnermost() && "Inner loop expected."); // Check if we can if-convert non-single-bb loops. 
unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { @@ -1251,10 +1251,10 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { Instruction *UI = cast<Instruction>(U); if (TheLoop->contains(UI)) continue; - LLVM_DEBUG( - dbgs() - << "LV: Cannot fold tail by masking, loop has an outside user for " - << *UI << "\n"); + LLVM_DEBUG( + dbgs() + << "LV: Cannot fold tail by masking, loop has an outside user for " + << *UI << "\n"); return false; } } @@ -1262,25 +1262,25 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { // The list of pointers that we can safely read and write to remains empty. SmallPtrSet<Value *, 8> SafePointers; - SmallPtrSet<const Instruction *, 8> TmpMaskedOp; - SmallPtrSet<Instruction *, 8> TmpConditionalAssumes; - + SmallPtrSet<const Instruction *, 8> TmpMaskedOp; + SmallPtrSet<Instruction *, 8> TmpConditionalAssumes; + // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes)) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); + if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, + TmpConditionalAssumes)) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } } LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); - - MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); - ConditionalAssumes.insert(TmpConditionalAssumes.begin(), - TmpConditionalAssumes.end()); - + + MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); + ConditionalAssumes.insert(TmpConditionalAssumes.begin(), + TmpConditionalAssumes.end()); + return true; } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 19797e6f78..25e4a37d63 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -34,7 +34,7 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; class PredicatedScalarEvolution; -class VPRecipeBuilder; +class VPRecipeBuilder; /// VPlan-based builder utility analogous to IRBuilder. class VPBuilder { @@ -142,10 +142,10 @@ public: return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); } - VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); - } - + VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); + } + //===--------------------------------------------------------------------===// // RAII helpers. //===--------------------------------------------------------------------===// @@ -176,22 +176,22 @@ public: /// Information about vectorization costs struct VectorizationFactor { // Vector width with best cost - ElementCount Width; + ElementCount Width; // Cost of the loop with that width unsigned Cost; // Width 1 means no vectorization, cost 0 means uncomputed cost. 
- static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0}; - } + static VectorizationFactor Disabled() { + return {ElementCount::getFixed(1), 0}; + } bool operator==(const VectorizationFactor &rhs) const { return Width == rhs.Width && Cost == rhs.Cost; } - - bool operator!=(const VectorizationFactor &rhs) const { - return !(*this == rhs); - } + + bool operator!=(const VectorizationFactor &rhs) const { + return !(*this == rhs); + } }; /// Planner drives the vectorization process after having passed @@ -237,10 +237,10 @@ class LoopVectorizationPlanner { /// A builder used to construct the current plan. VPBuilder Builder; - /// The best number of elements of the vector types used in the - /// transformed loop. BestVF = None means that vectorization is - /// disabled. - Optional<ElementCount> BestVF = None; + /// The best number of elements of the vector types used in the + /// transformed loop. BestVF = None means that vectorization is + /// disabled. + Optional<ElementCount> BestVF = None; unsigned BestUF = 0; public: @@ -255,14 +255,14 @@ public: /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC); + Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(ElementCount UserVF); + VectorizationFactor planInVPlanNativePath(ElementCount UserVF); /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(ElementCount VF, unsigned UF); + void setBestPlan(ElementCount VF, unsigned UF); /// Generate the IR code for the body of the vectorized loop according to the /// best selected VPlan. @@ -273,21 +273,21 @@ public: O << *Plan; } - /// Look through the existing plans and return true if we have one with all - /// the vectorization factors in question. - bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const { - return any_of(VPlans, [&](const VPlanPtr &Plan) { - return all_of(VFs, [&](const ElementCount &VF) { - return Plan->hasVF(VF); - }); - }); - } - + /// Look through the existing plans and return true if we have one with all + /// the vectorization factors in question. + bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const { + return any_of(VPlans, [&](const VPlanPtr &Plan) { + return all_of(VFs, [&](const ElementCount &VF) { + return Plan->hasVF(VF); + }); + }); + } + /// Test a \p Predicate on a \p Range of VF's. Return the value of applying /// \p Predicate on Range.Start, possibly decreasing Range.End such that the /// returned value holds for the entire \p Range. static bool - getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate, + getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate, VFRange &Range); protected: @@ -299,7 +299,7 @@ protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(ElementCount MinVF, ElementCount MaxVF); + void buildVPlans(ElementCount MinVF, ElementCount MaxVF); private: /// Build a VPlan according to the information gathered by Legal. \return a @@ -310,20 +310,20 @@ private: /// Build a VPlan using VPRecipes according to the information gather by /// Legal. 
This method is only used for the legacy inner loop vectorizer. VPlanPtr buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const DenseMap<Instruction *, Instruction *> &SinkAfter); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); - - /// Adjust the recipes for any inloop reductions. The chain of instructions - /// leading from the loop exit instr to the phi need to be converted to - /// reductions, with one operand being vector and the other being the scalar - /// reduction chain. - void adjustRecipesForInLoopReductions(VPlanPtr &Plan, - VPRecipeBuilder &RecipeBuilder); + void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + + /// Adjust the recipes for any inloop reductions. The chain of instructions + /// leading from the loop exit instr to the phi need to be converted to + /// reductions, with one operand being vector and the other being the scalar + /// reduction chain. + void adjustRecipesForInLoopReductions(VPlanPtr &Plan, + VPRecipeBuilder &RecipeBuilder); }; } // namespace llvm diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp index b456a97aa4..decb6ce1d7 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -130,7 +130,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/InstructionCost.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -158,38 +158,38 @@ using namespace llvm; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME -#ifndef NDEBUG -const char VerboseDebug[] = DEBUG_TYPE "-verbose"; -#endif - +#ifndef NDEBUG +const char VerboseDebug[] = DEBUG_TYPE "-verbose"; +#endif + /// @{ /// Metadata attribute names -const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; -const char LLVMLoopVectorizeFollowupVectorized[] = +const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; +const char LLVMLoopVectorizeFollowupVectorized[] = "llvm.loop.vectorize.followup_vectorized"; -const char LLVMLoopVectorizeFollowupEpilogue[] = +const char LLVMLoopVectorizeFollowupEpilogue[] = "llvm.loop.vectorize.followup_epilogue"; /// @} STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); -STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); - -static cl::opt<bool> EnableEpilogueVectorization( - "enable-epilogue-vectorization", cl::init(true), cl::Hidden, - cl::desc("Enable vectorization of epilogue loops.")); - -static cl::opt<unsigned> EpilogueVectorizationForceVF( - "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, - cl::desc("When epilogue vectorization is enabled, and a value greater than " - "1 is specified, forces the given VF for all applicable epilogue " - "loops.")); - -static cl::opt<unsigned> EpilogueVectorizationMinVF( - "epilogue-vectorization-minimum-VF", 
cl::init(16), cl::Hidden, - cl::desc("Only loops with vectorization factor equal to or larger than " - "the specified value are considered for epilogue vectorization.")); - +STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); + +static cl::opt<bool> EnableEpilogueVectorization( + "enable-epilogue-vectorization", cl::init(true), cl::Hidden, + cl::desc("Enable vectorization of epilogue loops.")); + +static cl::opt<unsigned> EpilogueVectorizationForceVF( + "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, + cl::desc("When epilogue vectorization is enabled, and a value greater than " + "1 is specified, forces the given VF for all applicable epilogue " + "loops.")); + +static cl::opt<unsigned> EpilogueVectorizationMinVF( + "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, + cl::desc("Only loops with vectorization factor equal to or larger than " + "the specified value are considered for epilogue vectorization.")); + /// Loops with a known constant trip count below this number are vectorized only /// if no scalar iteration overheads are incurred. static cl::opt<unsigned> TinyTripCountVectorThreshold( @@ -198,37 +198,37 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); -// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, -// that predication is preferred, and this lists all options. I.e., the -// vectorizer will try to fold the tail-loop (epilogue) into the vector body -// and predicate the instructions accordingly. If tail-folding fails, there are -// different fallback strategies depending on these values: -namespace PreferPredicateTy { - enum Option { - ScalarEpilogue = 0, - PredicateElseScalarEpilogue, - PredicateOrDontVectorize - }; -} // namespace PreferPredicateTy - -static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( - "prefer-predicate-over-epilogue", - cl::init(PreferPredicateTy::ScalarEpilogue), - cl::Hidden, - cl::desc("Tail-folding and predication preferences over creating a scalar " - "epilogue loop."), - cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, - "scalar-epilogue", - "Don't tail-predicate loops, create scalar epilogue"), - clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, - "predicate-else-scalar-epilogue", - "prefer tail-folding, create scalar epilogue if tail " - "folding fails."), - clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, - "predicate-dont-vectorize", - "prefers tail-folding, don't attempt vectorization if " - "tail-folding fails."))); - +// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, +// that predication is preferred, and this lists all options. I.e., the +// vectorizer will try to fold the tail-loop (epilogue) into the vector body +// and predicate the instructions accordingly. 
If tail-folding fails, there are +// different fallback strategies depending on these values: +namespace PreferPredicateTy { + enum Option { + ScalarEpilogue = 0, + PredicateElseScalarEpilogue, + PredicateOrDontVectorize + }; +} // namespace PreferPredicateTy + +static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( + "prefer-predicate-over-epilogue", + cl::init(PreferPredicateTy::ScalarEpilogue), + cl::Hidden, + cl::desc("Tail-folding and predication preferences over creating a scalar " + "epilogue loop."), + cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, + "scalar-epilogue", + "Don't tail-predicate loops, create scalar epilogue"), + clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, + "predicate-else-scalar-epilogue", + "prefer tail-folding, create scalar epilogue if tail " + "folding fails."), + clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."))); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -239,7 +239,7 @@ static cl::opt<bool> EnableInterleavedMemAccesses( cl::desc("Enable vectorization on interleaved memory accesses in a loop")); /// An interleave-group may need masking if it resides in a block that needs -/// predication, or in order to mask away gaps. +/// predication, or in order to mask away gaps. static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); @@ -273,12 +273,12 @@ static cl::opt<unsigned> ForceTargetInstructionCost( "an instruction to a single constant value. Mostly " "useful for getting consistent testing.")); -static cl::opt<bool> ForceTargetSupportsScalableVectors( - "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, - cl::desc( - "Pretend that scalable vectors are supported, even if the target does " - "not support them. This flag should only be used for testing.")); - +static cl::opt<bool> ForceTargetSupportsScalableVectors( + "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, + cl::desc( + "Pretend that scalable vectors are supported, even if the target does " + "not support them. This flag should only be used for testing.")); + static cl::opt<unsigned> SmallLoopCost( "small-loop-cost", cl::init(20), cl::Hidden, cl::desc( @@ -296,12 +296,12 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave( cl::desc( "Enable runtime interleaving until load/store ports are saturated")); -/// Interleave small loops with scalar reductions. -static cl::opt<bool> InterleaveSmallLoopScalarReduction( - "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, - cl::desc("Enable interleaving for loops with small iteration counts that " - "contain scalar reductions to expose ILP.")); - +/// Interleave small loops with scalar reductions. +static cl::opt<bool> InterleaveSmallLoopScalarReduction( + "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, + cl::desc("Enable interleaving for loops with small iteration counts that " + "contain scalar reductions to expose ILP.")); + /// The number of stores in a loop that are allowed to need predication. 
static cl::opt<unsigned> NumberOfStoresToPredicate( "vectorize-num-stores-pred", cl::init(1), cl::Hidden, @@ -320,17 +320,17 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); -static cl::opt<bool> - PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), - cl::Hidden, - cl::desc("Prefer in-loop vector reductions, " - "overriding the targets preference.")); - -static cl::opt<bool> PreferPredicatedReductionSelect( - "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, - cl::desc( - "Prefer predicating a reduction operation over an after loop select.")); - +static cl::opt<bool> + PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), + cl::Hidden, + cl::desc("Prefer in-loop vector reductions, " + "overriding the targets preference.")); + +static cl::opt<bool> PreferPredicatedReductionSelect( + "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, + cl::desc( + "Prefer predicating a reduction operation over an after loop select.")); + cl::opt<bool> EnableVPlanNativePath( "enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " @@ -372,11 +372,11 @@ static Type *getMemInstValueType(Value *I) { /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an -/// element of the corresponding vector type. -static bool hasIrregularType(Type *Ty, const DataLayout &DL) { - // Determine if an array of N elements of type Ty is "bitcast compatible" - // with a <N x Ty> vector. - // This is only true if there is no padding between the array elements. +/// element of the corresponding vector type. +static bool hasIrregularType(Type *Ty, const DataLayout &DL) { + // Determine if an array of N elements of type Ty is "bitcast compatible" + // with a <N x Ty> vector. + // This is only true if there is no padding between the array elements. return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); } @@ -453,42 +453,42 @@ public: LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, ElementCount VecWidth, + OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), - VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), - BFI(BFI), PSI(PSI) { - // Query this against the original loop and save it here because the profile - // of the original loop header may change as the transformation happens. - OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( - OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - } - + VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), + BFI(BFI), PSI(PSI) { + // Query this against the original loop and save it here because the profile + // of the original loop header may change as the transformation happens. 
+ OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( + OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); + } + virtual ~InnerLoopVectorizer() = default; - /// Create a new empty loop that will contain vectorized instructions later - /// on, while the old loop will be used as the scalar remainder. Control flow - /// is generated around the vectorized (and scalar epilogue) loops consisting - /// of various checks and bypasses. Return the pre-header block of the new - /// loop. - /// In the case of epilogue vectorization, this function is overriden to - /// handle the more complex control flow around the loops. - virtual BasicBlock *createVectorizedLoopSkeleton(); + /// Create a new empty loop that will contain vectorized instructions later + /// on, while the old loop will be used as the scalar remainder. Control flow + /// is generated around the vectorized (and scalar epilogue) loops consisting + /// of various checks and bypasses. Return the pre-header block of the new + /// loop. + /// In the case of epilogue vectorization, this function is overriden to + /// handle the more complex control flow around the loops. + virtual BasicBlock *createVectorizedLoopSkeleton(); /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, + void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, VPTransformState &State); /// Widen a single call instruction within the innermost loop. - void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, + void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); /// Widen a single select instruction within the innermost loop. - void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, + void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. @@ -504,15 +504,15 @@ public: /// Vectorize a single GetElementPtrInst based on information gathered and /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, - unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, + void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, + unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, - Value *StartV, unsigned UF, ElementCount VF); + void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, + Value *StartV, unsigned UF, ElementCount VF); /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane @@ -526,8 +526,8 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. 
- void widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc = nullptr); + void widenIntOrFpInduction(PHINode *IV, Value *Start, + TruncInst *Trunc = nullptr); /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a /// vector or scalar value on-demand if one is not yet available. When @@ -552,10 +552,10 @@ public: /// value into a vector. Value *getOrCreateVectorValue(Value *V, unsigned Part); - void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { - VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); - } - + void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { + VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); + } + /// Return a value in the new loop corresponding to \p V from the original /// loop at unroll and vector indices \p Instance. If the value has been /// vectorized but not scalarized, the necessary extractelement instruction @@ -570,9 +570,9 @@ public: /// BlockInMask is non-null. Use \p State to translate given VPValues to IR /// values in the vectorized loop. void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, - ArrayRef<VPValue *> VPDefs, + ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, - ArrayRef<VPValue *> StoredValues, + ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); /// Vectorize Load and Store instructions with the base address given in \p @@ -580,8 +580,8 @@ public: /// non-null. Use \p State to translate given VPValues to IR values in the /// vectorized loop. void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask); + VPValue *Def, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -625,11 +625,11 @@ protected: /// Clear NSW/NUW flags from reduction instructions if necessary. void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); - /// Fixup the LCSSA phi nodes in the unique exit block. This simply - /// means we need to add the appropriate incoming value from the middle - /// block as exiting edges from the scalar epilogue loop (if present) are - /// already in place, and we exit the vector loop exclusively to the middle - /// block. + /// Fixup the LCSSA phi nodes in the unique exit block. This simply + /// means we need to add the appropriate incoming value from the middle + /// block as exiting edges from the scalar epilogue loop (if present) are + /// already in place, and we exit the vector loop exclusively to the middle + /// block. void fixLCSSAPHIs(); /// Iteratively sink the scalarized operands of a predicated instruction into @@ -668,8 +668,8 @@ protected: /// truncate instruction, instead of widening the original IV, we widen a /// version of the IV truncated to \p EntryVal's type. void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, - Value *Step, Value *Start, - Instruction *EntryVal); + Value *Step, Value *Start, + Instruction *EntryVal); /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. @@ -737,28 +737,28 @@ protected: const DataLayout &DL, const InductionDescriptor &ID) const; - /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, - /// vector loop preheader, middle block and scalar preheader. Also - /// allocate a loop object for the new vector loop and return it. 
- Loop *createVectorLoopSkeleton(StringRef Prefix); - - /// Create new phi nodes for the induction variables to resume iteration count - /// in the scalar epilogue, from where the vectorized loop left off (given by - /// \p VectorTripCount). - /// In cases where the loop skeleton is more complicated (eg. epilogue - /// vectorization) and the resume values can come from an additional bypass - /// block, the \p AdditionalBypass pair provides information about the bypass - /// block and the end value on the edge from bypass to this loop. - void createInductionResumeValues( - Loop *L, Value *VectorTripCount, - std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); - - /// Complete the loop skeleton by adding debug MDs, creating appropriate - /// conditional branches in the middle block, preparing the builder and - /// running the verifier. Take in the vector loop \p L as argument, and return - /// the preheader of the completed vector loop. - BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); - + /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, + /// vector loop preheader, middle block and scalar preheader. Also + /// allocate a loop object for the new vector loop and return it. + Loop *createVectorLoopSkeleton(StringRef Prefix); + + /// Create new phi nodes for the induction variables to resume iteration count + /// in the scalar epilogue, from where the vectorized loop left off (given by + /// \p VectorTripCount). + /// In cases where the loop skeleton is more complicated (eg. epilogue + /// vectorization) and the resume values can come from an additional bypass + /// block, the \p AdditionalBypass pair provides information about the bypass + /// block and the end value on the edge from bypass to this loop. + void createInductionResumeValues( + Loop *L, Value *VectorTripCount, + std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); + + /// Complete the loop skeleton by adding debug MDs, creating appropriate + /// conditional branches in the middle block, preparing the builder and + /// running the verifier. Take in the vector loop \p L as argument, and return + /// the preheader of the completed vector loop. + BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); + /// Add additional metadata to \p To that was not present on \p Orig. /// /// Currently this is used to add the noalias annotations based on the @@ -777,11 +777,11 @@ protected: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); - /// Allow subclasses to override and print debug traces before/after vplan - /// execution, when trace information is requested. - virtual void printDebugTracesAtStart(){}; - virtual void printDebugTracesAtEnd(){}; - + /// Allow subclasses to override and print debug traces before/after vplan + /// execution, when trace information is requested. + virtual void printDebugTracesAtStart(){}; + virtual void printDebugTracesAtEnd(){}; + /// The original loop. Loop *OrigLoop; @@ -820,7 +820,7 @@ protected: /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. - ElementCount VF; + ElementCount VF; /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. @@ -840,8 +840,8 @@ protected: /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; - /// The (unique) ExitBlock of the scalar loop. Note that - /// there can be multiple exiting edges reaching this block. 
+ /// The (unique) ExitBlock of the scalar loop. Note that + /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; /// The vector loop body. @@ -890,14 +890,14 @@ protected: // Vector of original scalar PHIs whose corresponding widened PHIs need to be // fixed up at the end of vector code generation. SmallVector<PHINode *, 8> OrigPHIsToFix; - - /// BFI and PSI are used to check for profile guided size optimizations. - BlockFrequencyInfo *BFI; - ProfileSummaryInfo *PSI; - - // Whether this loop should be optimized for size based on profile guided size - // optimizatios. - bool OptForSizeBasedOnProfile; + + /// BFI and PSI are used to check for profile guided size optimizations. + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; + + // Whether this loop should be optimized for size based on profile guided size + // optimizatios. + bool OptForSizeBasedOnProfile; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -908,11 +908,11 @@ public: const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - ElementCount::getFixed(1), UnrollFactor, LVL, CM, - BFI, PSI) {} + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), UnrollFactor, LVL, CM, + BFI, PSI) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -922,128 +922,128 @@ private: Value *reverseVector(Value *Vec) override; }; -/// Encapsulate information regarding vectorization of a loop and its epilogue. -/// This information is meant to be updated and used across two stages of -/// epilogue vectorization. -struct EpilogueLoopVectorizationInfo { - ElementCount MainLoopVF = ElementCount::getFixed(0); - unsigned MainLoopUF = 0; - ElementCount EpilogueVF = ElementCount::getFixed(0); - unsigned EpilogueUF = 0; - BasicBlock *MainLoopIterationCountCheck = nullptr; - BasicBlock *EpilogueIterationCountCheck = nullptr; - BasicBlock *SCEVSafetyCheck = nullptr; - BasicBlock *MemSafetyCheck = nullptr; - Value *TripCount = nullptr; - Value *VectorTripCount = nullptr; - - EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, - unsigned EUF) - : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), - EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { - assert(EUF == 1 && - "A high UF for the epilogue loop is likely not beneficial."); - } -}; - -/// An extension of the inner loop vectorizer that creates a skeleton for a -/// vectorized loop that has its epilogue (residual) also vectorized. -/// The idea is to run the vplan on a given loop twice, firstly to setup the -/// skeleton and vectorize the main loop, and secondly to complete the skeleton -/// from the first step and vectorize the epilogue. This is achieved by -/// deriving two concrete strategy classes from this base class and invoking -/// them in succession from the loop vectorizer planner. 
-class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { -public: - InnerLoopAndEpilogueVectorizer( - Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), - EPI(EPI) {} - - // Override this function to handle the more complex control flow around the - // three loops. - BasicBlock *createVectorizedLoopSkeleton() final override { - return createEpilogueVectorizedLoopSkeleton(); - } - - /// The interface for creating a vectorized skeleton using one of two - /// different strategies, each corresponding to one execution of the vplan - /// as described above. - virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; - - /// Holds and updates state information required to vectorize the main loop - /// and its epilogue in two separate passes. This setup helps us avoid - /// regenerating and recomputing runtime safety checks. It also helps us to - /// shorten the iteration-count-check path length for the cases where the - /// iteration count of the loop is so small that the main vector loop is - /// completely skipped. - EpilogueLoopVectorizationInfo &EPI; -}; - -/// A specialized derived class of inner loop vectorizer that performs -/// vectorization of *main* loops in the process of vectorizing loops and their -/// epilogues. -class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { -public: - EpilogueVectorizerMainLoop( - Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} - /// Implements the interface for creating a vectorized skeleton using the - /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; - -protected: - /// Emits an iteration count bypass check once for the main loop (when \p - /// ForEpilogue is false) and once for the epilogue loop (when \p - /// ForEpilogue is true). - BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, - bool ForEpilogue); - void printDebugTracesAtStart() override; - void printDebugTracesAtEnd() override; -}; - -// A specialized derived class of inner loop vectorizer that performs -// vectorization of *epilogue* loops in the process of vectorizing loops and -// their epilogues. 
-class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { -public: - EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, - LoopInfo *LI, DominatorTree *DT, - const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, - EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, - llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} - /// Implements the interface for creating a vectorized skeleton using the - /// *epilogue loop* strategy (ie the second pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; - -protected: - /// Emits an iteration count bypass check after the main vector loop has - /// finished to see if there are any iterations left to execute by either - /// the vector epilogue or the scalar epilogue. - BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, - BasicBlock *Bypass, - BasicBlock *Insert); - void printDebugTracesAtStart() override; - void printDebugTracesAtEnd() override; -}; +/// Encapsulate information regarding vectorization of a loop and its epilogue. +/// This information is meant to be updated and used across two stages of +/// epilogue vectorization. +struct EpilogueLoopVectorizationInfo { + ElementCount MainLoopVF = ElementCount::getFixed(0); + unsigned MainLoopUF = 0; + ElementCount EpilogueVF = ElementCount::getFixed(0); + unsigned EpilogueUF = 0; + BasicBlock *MainLoopIterationCountCheck = nullptr; + BasicBlock *EpilogueIterationCountCheck = nullptr; + BasicBlock *SCEVSafetyCheck = nullptr; + BasicBlock *MemSafetyCheck = nullptr; + Value *TripCount = nullptr; + Value *VectorTripCount = nullptr; + + EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, + unsigned EUF) + : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), + EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { + assert(EUF == 1 && + "A high UF for the epilogue loop is likely not beneficial."); + } +}; + +/// An extension of the inner loop vectorizer that creates a skeleton for a +/// vectorized loop that has its epilogue (residual) also vectorized. +/// The idea is to run the vplan on a given loop twice, firstly to setup the +/// skeleton and vectorize the main loop, and secondly to complete the skeleton +/// from the first step and vectorize the epilogue. This is achieved by +/// deriving two concrete strategy classes from this base class and invoking +/// them in succession from the loop vectorizer planner. +class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { +public: + InnerLoopAndEpilogueVectorizer( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), + EPI(EPI) {} + + // Override this function to handle the more complex control flow around the + // three loops. 
+ BasicBlock *createVectorizedLoopSkeleton() final override { + return createEpilogueVectorizedLoopSkeleton(); + } + + /// The interface for creating a vectorized skeleton using one of two + /// different strategies, each corresponding to one execution of the vplan + /// as described above. + virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + + /// Holds and updates state information required to vectorize the main loop + /// and its epilogue in two separate passes. This setup helps us avoid + /// regenerating and recomputing runtime safety checks. It also helps us to + /// shorten the iteration-count-check path length for the cases where the + /// iteration count of the loop is so small that the main vector loop is + /// completely skipped. + EpilogueLoopVectorizationInfo &EPI; +}; + +/// A specialized derived class of inner loop vectorizer that performs +/// vectorization of *main* loops in the process of vectorizing loops and their +/// epilogues. +class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerMainLoop( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *main loop* strategy (ie the first pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check once for the main loop (when \p + /// ForEpilogue is false) and once for the epilogue loop (when \p + /// ForEpilogue is true). + BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, + bool ForEpilogue); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; + +// A specialized derived class of inner loop vectorizer that performs +// vectorization of *epilogue* loops in the process of vectorizing loops and +// their epilogues. +class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, + EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, + llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *epilogue loop* strategy (ie the second pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check after the main vector loop has + /// finished to see if there are any iterations left to execute by either + /// the vector epilogue or the scalar epilogue. 
+ BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, + BasicBlock *Bypass, + BasicBlock *Insert); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; } // end namespace llvm /// Look for a meaningful debug location on the instruction or it's @@ -1070,9 +1070,9 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa<DbgInfoIntrinsic>(Inst)) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); - auto NewDIL = - DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -1126,15 +1126,15 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, return R; } -/// Return a value for Step multiplied by VF. -static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { - assert(isa<ConstantInt>(Step) && "Expected an integer step"); - Constant *StepVal = ConstantInt::get( - Step->getType(), - cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); - return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; -} - +/// Return a value for Step multiplied by VF. +static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { + assert(isa<ConstantInt>(Step) && "Expected an integer step"); + Constant *StepVal = ConstantInt::get( + Step->getType(), + cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); + return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; +} + namespace llvm { void reportVectorizationFailure(const StringRef DebugMsg, @@ -1206,10 +1206,10 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueNotAllowedLowTripLoop, // Loop hint predicate indicating an epilogue is undesired. - CM_ScalarEpilogueNotNeededUsePredicate, - - // Directive indicating we must either tail fold or not vectorize - CM_ScalarEpilogueNotAllowedUsePredicate + CM_ScalarEpilogueNotNeededUsePredicate, + + // Directive indicating we must either tail fold or not vectorize + CM_ScalarEpilogueNotAllowedUsePredicate }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1236,7 +1236,7 @@ public: /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); + Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -1246,13 +1246,13 @@ public: /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, - const LoopVectorizationPlanner &LVP); + VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); + VectorizationFactor + selectEpilogueVectorizationFactor(const ElementCount MaxVF, + const LoopVectorizationPlanner &LVP); /// Setup cost-based decisions for user vectorization factor. 
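  // A note on createStepForVF() above (illustrative): for Step = i64 2 and a
  // fixed VF of 4 it folds to the constant i64 8, while for a scalable VF of
  // 4 it returns the runtime value produced by B.CreateVScale(i64 8), i.e.
  // 8 * vscale, so callers can treat fixed and scalable steps uniformly.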
- void selectUserVectorizationFactor(ElementCount UserVF) { + void selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); } @@ -1266,7 +1266,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); + unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1275,7 +1275,7 @@ public: /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(ElementCount VF); + void setCostBasedWideningDecision(ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1290,16 +1290,16 @@ public: /// \return Returns information about the register usages of the loop for the /// given vectorization factors. - SmallVector<RegisterUsage, 8> - calculateRegisterUsage(ArrayRef<ElementCount> VFs); + SmallVector<RegisterUsage, 8> + calculateRegisterUsage(ArrayRef<ElementCount> VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); - /// Split reductions into those that happen in the loop, and those that happen - /// outside. In loop reductions are collected into InLoopReductionChains. - void collectInLoopReductions(); - + /// Split reductions into those that happen in the loop, and those that happen + /// outside. In loop reductions are collected into InLoopReductionChains. + void collectInLoopReductions(); + /// \returns The smallest bitwidth each instruction can be represented with. /// The vector equivalents of these instructions should be truncated to this /// type. @@ -1309,9 +1309,9 @@ public: /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { - assert(VF.isVector() && - "Profitable to scalarize relevant only for VF > 1."); + bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + assert(VF.isVector() && + "Profitable to scalarize relevant only for VF > 1."); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. @@ -1325,8 +1325,8 @@ public: } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { - if (VF.isScalar()) + bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1341,8 +1341,8 @@ public: } /// Returns true if \p I is known to be scalar after vectorization. - bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { - if (VF.isScalar()) + bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1358,8 +1358,8 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. 
- bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { - return VF.isVector() && MinBWs.find(I) != MinBWs.end() && + bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + return VF.isVector() && MinBWs.find(I) != MinBWs.end() && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1376,18 +1376,18 @@ public: /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, - InstructionCost Cost) { - assert(VF.isVector() && "Expected VF >=2"); + void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, + InstructionCost Cost) { + assert(VF.isVector() && "Expected VF >=2"); WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup<Instruction> *Grp, - ElementCount VF, InstWidening W, - InstructionCost Cost) { - assert(VF.isVector() && "Expected VF >=2"); + void setWideningDecision(const InterleaveGroup<Instruction> *Grp, + ElementCount VF, InstWidening W, + InstructionCost Cost) { + assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { @@ -1403,14 +1403,14 @@ public: /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, ElementCount VF) { - assert(VF.isVector() && "Expected VF to be a vector VF"); + InstWidening getWideningDecision(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); + std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1419,9 +1419,9 @@ public: /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - InstructionCost getWideningCost(Instruction *I, ElementCount VF) { - assert(VF.isVector() && "Expected VF >=2"); - std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); + InstructionCost getWideningCost(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF >=2"); + std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1430,7 +1430,7 @@ public: /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. - bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { + bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { // If the instruction is not a truncate, return false. 
auto *Trunc = dyn_cast<TruncInst>(I); if (!Trunc) @@ -1455,14 +1455,14 @@ public: /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(ElementCount VF); + void collectInstsToScalarize(ElementCount VF); /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(ElementCount VF) { + void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. - if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1513,8 +1513,8 @@ public: /// instructions that may divide by zero. /// If a non-zero VF has been calculated, we check if I will be scalarized /// predication for that VF. - bool isScalarWithPredication(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool isScalarWithPredication(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. @@ -1531,16 +1531,16 @@ public: /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool - memoryInstructionCanBeWidened(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool + memoryInstructionCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool - interleavedAccessCanBeWidened(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool + interleavedAccessCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1553,16 +1553,16 @@ public: return InterleaveInfo.getInterleaveGroup(Instr); } - /// Returns true if we're required to use a scalar epilogue for at least - /// the final iteration of the original loop. + /// Returns true if we're required to use a scalar epilogue for at least + /// the final iteration of the original loop. bool requiresScalarEpilogue() const { - if (!isScalarEpilogueAllowed()) - return false; - // If we might exit from anywhere but the latch, must run the exiting - // iteration in scalar form. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) - return true; - return InterleaveInfo.requiresScalarEpilogue(); + if (!isScalarEpilogueAllowed()) + return false; + // If we might exit from anywhere but the latch, must run the exiting + // iteration in scalar form. + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) + return true; + return InterleaveInfo.requiresScalarEpilogue(); } /// Returns true if a scalar epilogue is not allowed due to optsize or a @@ -1578,34 +1578,34 @@ public: return foldTailByMasking() || Legal->blockNeedsPredication(BB); } - /// A SmallMapVector to store the InLoop reduction op chains, mapping phi - /// nodes to the chain of instructions representing the reductions. Uses a - /// MapVector to ensure deterministic iteration order. 
- using ReductionChainMap = - SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; - - /// Return the chain of instructions representing an inloop reduction. - const ReductionChainMap &getInLoopReductionChains() const { - return InLoopReductionChains; - } - - /// Returns true if the Phi is part of an inloop reduction. - bool isInLoopReduction(PHINode *Phi) const { - return InLoopReductionChains.count(Phi); - } - + /// A SmallMapVector to store the InLoop reduction op chains, mapping phi + /// nodes to the chain of instructions representing the reductions. Uses a + /// MapVector to ensure deterministic iteration order. + using ReductionChainMap = + SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; + + /// Return the chain of instructions representing an inloop reduction. + const ReductionChainMap &getInLoopReductionChains() const { + return InLoopReductionChains; + } + + /// Returns true if the Phi is part of an inloop reduction. + bool isInLoopReduction(PHINode *Phi) const { + return InLoopReductionChains.count(Phi); + } + /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. - InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); + InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize); + InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize); /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { @@ -1620,8 +1620,8 @@ private: /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. - ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1630,54 +1630,54 @@ private: /// is /// false, then all operations will be scalarized (i.e. no vectorization has /// actually taken place). - using VectorizationCostTy = std::pair<InstructionCost, bool>; + using VectorizationCostTy = std::pair<InstructionCost, bool>; /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - VectorizationCostTy expectedCost(ElementCount VF); + VectorizationCostTy expectedCost(ElementCount VF); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. 
- InstructionCost getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy); - - /// Return the cost of instructions in an inloop reduction pattern, if I is - /// part of that pattern. - InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, - Type *VectorTy, - TTI::TargetCostKind CostKind); - + InstructionCost getInstructionCost(Instruction *I, ElementCount VF, + Type *&VectorTy); + + /// Return the cost of instructions in an inloop reduction pattern, if I is + /// part of that pattern. + InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, + Type *VectorTy, + TTI::TargetCostKind CostKind); + /// Calculate vectorization cost of memory instruction \p I. - InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); + InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); /// The cost computation for scalarized memory instruction. - InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); + InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); /// The cost computation for interleaving group of memory instructions. - InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); + InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); /// The cost computation for Gather/Scatter instruction. - InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); + InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. - InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); + InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. /// Store: scalar store + (loop invariant value stored? 0 : extract of last /// element) - InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); + InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF); + InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -1695,7 +1695,7 @@ private: /// A type representing the costs for instructions if they were to be /// scalarized rather than vectorized. The entries are Instruction-Cost /// pairs. - using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; + using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. @@ -1717,38 +1717,38 @@ private: /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; + DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. 
- DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. - DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; - - /// PHINodes of the reductions that should be expanded in-loop along with - /// their associated chains of reduction operations, in program order from top - /// (PHI) to bottom - ReductionChainMap InLoopReductionChains; - - /// A Map of inloop reduction operations and their immediate chain operand. - /// FIXME: This can be removed once reductions can be costed correctly in - /// vplan. This was added to allow quick lookup to the inloop operations, - /// without having to loop through InLoopReductionChains. - DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; - + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; + + /// PHINodes of the reductions that should be expanded in-loop along with + /// their associated chains of reduction operations, in program order from top + /// (PHI) to bottom + ReductionChainMap InLoopReductionChains; + + /// A Map of inloop reduction operations and their immediate chain operand. + /// FIXME: This can be removed once reductions can be costed correctly in + /// vplan. This was added to allow quick lookup to the inloop operations, + /// without having to loop through InLoopReductionChains. + DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, - ElementCount VF); + ElementCount VF); /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in @@ -1759,28 +1759,28 @@ private: /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(ElementCount VF); + void collectLoopUniforms(ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized /// during vectorization. Non-uniform scalarized instructions will be /// represented by VF values in the vectorized loop, each corresponding to an /// iteration of the original scalar loop. - void collectLoopScalars(ElementCount VF); + void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. 
- using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, - std::pair<InstWidening, InstructionCost>>; + using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, + std::pair<InstWidening, InstructionCost>>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, ElementCount VF) const { + bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast<Instruction>(V); - if (VF.isScalar() || !I || !TheLoop->contains(I) || - TheLoop->isLoopInvariant(I)) + if (VF.isScalar() || !I || !TheLoop->contains(I) || + TheLoop->isLoopInvariant(I)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1795,21 +1795,21 @@ private: /// Returns a range containing only operands needing to be extracted. SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, - ElementCount VF) { + ElementCount VF) { return SmallVector<Value *, 4>(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize loop \p L and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const Loop &L, - const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -1852,9 +1852,9 @@ public: /// Values to ignore in the cost model when VF > 1. SmallPtrSet<const Value *, 16> VecValuesToIgnore; - - /// Profitable vector factors. - SmallVector<VectorizationFactor, 8> ProfitableVFs; + + /// Profitable vector factors. + SmallVector<VectorizationFactor, 8> ProfitableVFs; }; } // end namespace llvm @@ -1875,7 +1875,7 @@ public: // representation for pragma 'omp simd' is introduced. static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE) { - assert(!OuterLp->isInnermost() && "This is not an outer loop"); + assert(!OuterLp->isInnermost() && "This is not an outer loop"); LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); // Only outer loops with an explicit vectorization hint are supported. @@ -1908,7 +1908,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, // now, only collect outer loops that have explicit vectorization hints. If we // are stress testing the VPlan H-CFG construction, we collect the outermost // loop of every loop nest. 
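  // For example, with -enable-vplan-native-path an outer loop annotated with
  //   #pragma clang loop vectorize(enable)
  // (which forces LoopVectorizeHints::FK_Enabled) is collected here, while
  // innermost loops are collected unconditionally. The pragma spelling is an
  // illustration; any mechanism that sets the forced-vectorization hint works.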
- if (L.isInnermost() || VPlanBuildStressTest || + if (L.isInnermost() || VPlanBuildStressTest || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { LoopBlocksRPO RPOT(&L); RPOT.perform(LI); @@ -2022,8 +2022,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { } void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( - const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal) { + const InductionDescriptor &II, Value *Step, Value *Start, + Instruction *EntryVal) { assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2055,8 +2055,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = - getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); + Value *ConstVF = + getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -2064,10 +2064,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - assert(!VF.isScalable() && "scalable vectors not yet supported."); - Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -2143,8 +2143,8 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc) { +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, + TruncInst *Trunc) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -2202,10 +2202,10 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *EntryPart = - getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, - ID.getInductionOpcode()); + getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, + ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -2215,7 +2215,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // Now do the actual transformations, and start with creating the step value. 
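  // Illustrative shape of the result, assuming a fixed VF = 4, UF = 2 and an
  // i32 induction %iv with step 1:
  //   part 0: <%iv, %iv + 1, %iv + 2, %iv + 3>
  //   part 1: part 0 + <4, 4, 4, 4>
  // and the vector phi advances by VF * UF * Step = 8 per vector iteration.
  // When scalar copies are also needed, buildScalarSteps() materializes lane
  // values as %iv + (Part * VF + Lane) * Step; e.g. part 1, lane 2 with
  // step 3 becomes %iv + 18.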
Value *Step = CreateStepValue(ID.getStep()); - if (VF.isZero() || VF.isScalar()) { + if (VF.isZero() || VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2226,7 +2226,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // least one user in the loop that is not widened. auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); return; } @@ -2234,7 +2234,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase @@ -2256,7 +2256,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps BinOp) { // Create and check the types. - auto *ValVTy = cast<FixedVectorType>(Val->getType()); + auto *ValVTy = cast<FixedVectorType>(Val->getType()); int VLen = ValVTy->getNumElements(); Type *STy = Val->getType()->getScalarType(); @@ -2313,7 +2313,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF.isVector() && "VF should be greater than one"); + assert(VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2335,27 +2335,27 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) - ? 1 - : VF.getKnownMinValue(); - assert((!VF.isScalable() || Lanes == 1) && - "Should never scalarize a scalable vector"); + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) + ? 1 + : VF.getKnownMinValue(); + assert((!VF.isScalable() || Lanes == 1) && + "Should never scalarize a scalable vector"); // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), - ScalarIVTy->getScalarSizeInBits()); - Value *StartIdx = - createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); - if (ScalarIVTy->isFloatingPointTy()) - StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); - StartIdx = addFastMathFlag(Builder.CreateBinOp( - AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); - // The step returned by `createStepForVF` is a runtime-evaluated value - // when VF is scalable. Otherwise, it should be folded into a Constant. 
- assert((VF.isScalable() || isa<Constant>(StartIdx)) && - "Expected StartIdx to be folded to a constant when VF is not " - "scalable"); + auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), + ScalarIVTy->getScalarSizeInBits()); + Value *StartIdx = + createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); + if (ScalarIVTy->isFloatingPointTy()) + StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); + StartIdx = addFastMathFlag(Builder.CreateBinOp( + AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); + // The step returned by `createStepForVF` is a runtime-evaluated value + // when VF is scalable. Otherwise, it should be folded into a Constant. + assert((VF.isScalable() || isa<Constant>(StartIdx)) && + "Expected StartIdx to be folded to a constant when VF is not " + "scalable"); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2389,7 +2389,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // If we aren't vectorizing, we can just copy the scalar map values over to // the vector map. - if (VF.isScalar()) { + if (VF.isScalar()) { VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); return ScalarValue; } @@ -2398,11 +2398,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) - ? 0 - : VF.getKnownMinValue() - 1; - assert((!VF.isScalable() || LastLane == 0) && - "Scalable vectorization can't lead to any scalarized values."); + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) + ? 0 + : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "Scalable vectorization can't lead to any scalarized values."); auto *LastInst = cast<Instruction>( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2423,11 +2423,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorValue = getBroadcastInstrs(ScalarValue); VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { - // Initialize packing with insertelements to start from poison. - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); - VectorLoopValueMap.setVectorValue(V, Part, Poison); - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + // Initialize packing with insertelements to start from poison. + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); + VectorLoopValueMap.setVectorValue(V, Part, Poison); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2466,7 +2466,7 @@ InnerLoopVectorizer::getOrCreateScalarValue(Value *V, // extractelement instruction. 
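  // For instance (assumed values), with VF = 4 and Instance = {Part = 1,
  // Lane = 2}, the widened value for part 1 is fetched and lane 2 is pulled
  // out with an extractelement; values that already have a scalar entry for
  // that (part, lane) are returned from the map without any extraction.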
auto *U = getOrCreateVectorValue(V, Instance.Part); if (!U->getType()->isVectorTy()) { - assert(VF.isScalar() && "Value not scalarized has non-vector type"); + assert(VF.isScalar() && "Value not scalarized has non-vector type"); return U; } @@ -2491,12 +2491,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); - assert(!VF.isScalable() && "Cannot reverse scalable vectors"); + assert(!VF.isScalable() && "Cannot reverse scalable vectors"); SmallVector<int, 8> ShuffleMask; - for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) - ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); - return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); + return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); } // Return whether we allow using masked interleave-groups (for dealing with @@ -2521,9 +2521,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // } // To: // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B -// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements -// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements -// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements +// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements // // Or translate following interleaved store group (factor = 3): // for (i = 0; i < N; i+=3) { @@ -2534,22 +2534,22 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // } // To: // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> -// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> +// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup( - const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, - VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask) { + const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, + VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, + VPValue *BlockInMask) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.isScalable() && "scalable vectors not yet supported."); - auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. SmallVector<Value *, 2> AddrParts; @@ -2565,10 +2565,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. 
- assert(!VF.isScalable() && - "scalable vector reverse operation is not implemented"); + assert(!VF.isScalable() && + "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); + Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2599,12 +2599,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } setDebugLocFromInst(Builder, Instr); - Value *PoisonVec = PoisonValue::get(VecTy); + Value *PoisonVec = PoisonValue::get(VecTy); Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); - MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2620,11 +2620,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *GroupMask = MaskForGaps; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) @@ -2632,7 +2632,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } NewLoad = Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), - GroupMask, PoisonVec, "wide.masked.vec"); + GroupMask, PoisonVec, "wide.masked.vec"); } else NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], @@ -2643,7 +2643,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // For each member in the group, shuffle out the appropriate data from the // wide loads. - unsigned J = 0; + unsigned J = 0; for (unsigned I = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); @@ -2651,33 +2651,33 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - assert(!VF.isScalable() && "scalable vectors not yet supported."); - auto StrideMask = - createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto StrideMask = + createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( - NewLoads[Part], StrideMask, "strided.vec"); + NewLoads[Part], StrideMask, "strided.vec"); // If this member has different type, cast the result type. 
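  // Concretely, for the factor-3 / VF = 4 example in the comment above the
  // function, the masks used here evaluate to:
  //   createStrideMask(0, 3, 4)   -> <0, 3, 6, 9>    (R elements)
  //   createStrideMask(1, 3, 4)   -> <1, 4, 7, 10>   (G elements)
  //   createStrideMask(2, 3, 4)   -> <2, 5, 8, 11>   (B elements)
  //   createReplicatedMask(3, 4)  -> <0,0,0, 1,1,1, 2,2,2, 3,3,3>
  //   createInterleaveMask(4, 3)  -> <0,4,8, 1,5,9, 2,6,10, 3,7,11>
  // matching the R/G/B load and store illustrations given earlier.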
if (Member->getType() != ScalarTy) { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } if (Group->isReverse()) StridedVec = reverseVector(StridedVec); - State.set(VPDefs[J], Member, StridedVec, Part); + State.set(VPDefs[J], Member, StridedVec, Part); } - ++J; + ++J; } return; } // The sub vector type for current instruction. - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - auto *SubVT = VectorType::get(ScalarTy, VF); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. for (unsigned Part = 0; Part < UF; Part++) { @@ -2685,10 +2685,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( SmallVector<Value *, 4> StoredVecs; for (unsigned i = 0; i < InterleaveFactor; i++) { // Interleaved store group doesn't allow a gap, so each index has a member - assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); - - Value *StoredVec = State.get(StoredValues[i], Part); + assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); + Value *StoredVec = State.get(StoredValues[i], Part); + if (Group->isReverse()) StoredVec = reverseVector(StoredVec); @@ -2704,17 +2704,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( - WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), + WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); @@ -2727,9 +2727,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::vectorizeMemoryInstruction( - Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask) { +void InnerLoopVectorizer::vectorizeMemoryInstruction( + Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask) { // Attempt to issue a wide load. 
LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2746,8 +2746,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - - auto *DataTy = VectorType::get(ScalarDataTy, VF); + + auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2779,23 +2779,23 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( InBounds = gep->isInBounds(); if (Reverse) { - assert(!VF.isScalable() && - "Reversing vectors is not yet supported for scalable vectors."); - + assert(!VF.isScalable() && + "Reversing vectors is not yet supported for scalable vectors."); + // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. - PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); + PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( - ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); + PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( + ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { - Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); + Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); PartPtr->setIsInBounds(InBounds); } @@ -2850,7 +2850,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad( - VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), + VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); else NewLI = @@ -2861,8 +2861,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( if (Reverse) NewLI = reverseVector(NewLI); } - - State.set(Def, Instr, NewLI, Part); + + State.set(Def, Instr, NewLI, Part); } } @@ -2872,12 +2872,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for - // the first lane and part. - if (isa<NoAliasScopeDeclInst>(Instr)) - if (Instance.Lane != 0 || Instance.Part != 0) - return; - + // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for + // the first lane and part. + if (isa<NoAliasScopeDeclInst>(Instr)) + if (Instance.Lane != 0 || Instance.Part != 0) + return; + setDebugLocFromInst(Builder, Instr); // Does this instruction return a value ? @@ -2890,12 +2890,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. 
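  // For example (assumed instance), when cloning lane {Part = 1, Lane = 2} of
  // a scalarized instruction, an operand that is not an in-loop instruction
  // or that is uniform after vectorization is looked up at {Part = 1,
  // Lane = 0}, since only one scalar copy of it exists per part; all other
  // operands are looked up at the same {1, 2} instance.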
for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { - auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); - auto InputInstance = Instance; - if (!Operand || !OrigLoop->contains(Operand) || - (Cost->isUniformAfterVectorization(Operand, State.VF))) - InputInstance.Lane = 0; - auto *NewOp = State.get(User.getOperand(op), InputInstance); + auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); + auto InputInstance = Instance; + if (!Operand || !OrigLoop->contains(Operand) || + (Cost->isUniformAfterVectorization(Operand, State.VF))) + InputInstance.Lane = 0; + auto *NewOp = State.get(User.getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2903,9 +2903,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - // TODO: Set result for VPValue of VPReciplicateRecipe. This requires - // representing scalar values in VPTransformState. Add the cloned scalar to - // the scalar map entry. + // TODO: Set result for VPValue of VPReciplicateRecipe. This requires + // representing scalar values in VPTransformState. Add the cloned scalar to + // the scalar map entry. VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); // If we just cloned a new assumption, add it the assumption cache. @@ -2942,7 +2942,7 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, Induction->addIncoming(Next, Latch); // Create the compare. Value *ICmp = Builder.CreateICmpEQ(Next, End); - Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); + Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); // Now we have two terminators. Remove the old one from the block. Latch->getTerminator()->eraseFromParent(); @@ -2959,7 +2959,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { // Find the loop boundaries. ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && + assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); Type *IdxTy = Legal->getWidestInductionType(); @@ -3005,8 +3005,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Type *Ty = TC->getType(); - // This is where we can make the step a runtime constant. - Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); + // This is where we can make the step a runtime constant. + Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -3015,12 +3015,12 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. 
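  // Worked example (assumed values), with VF = 4 and UF = 2, so Step = 8:
  //  * Tail folded by masking, N = 10: N is first bumped to 10 + 7 = 17,
  //    17 urem 8 = 1, so the vector trip count is 16 and two masked vector
  //    iterations cover all 10 original iterations.
  //  * No tail folding, N = 10: 10 urem 8 = 2, vector trip count 8, and the
  //    scalar remainder loop runs the last 2 iterations.
  //  * Scalar epilogue required, N = 16: the remainder would be 0, so it is
  //    bumped to Step = 8, leaving a vector trip count of 8 and guaranteeing
  //    the final iterations run in the scalar loop.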
if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && + assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - assert(!VF.isScalable() && - "Tail folding not yet supported for scalable vectors"); - TC = Builder.CreateAdd( - TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); + assert(!VF.isScalable() && + "Tail folding not yet supported for scalable vectors"); + TC = Builder.CreateAdd( + TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -3030,18 +3030,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // unroll factor (number of SIMD instructions). Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); - // There are two cases where we need to ensure (at least) the last iteration - // runs in the scalar remainder loop. Thus, if the step evenly divides + // There are two cases where we need to ensure (at least) the last iteration + // runs in the scalar remainder loop. Thus, if the step evenly divides // the trip count, we set the remainder to be equal to the step. If the step // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations - // check ensures that N >= Step. The cases are: - // 1) If there is a non-reversed interleaved group that may speculatively - // access memory out-of-bounds. - // 2) If any instruction may follow a conditionally taken exit. That is, if - // the loop contains multiple exiting blocks, or a single exiting block - // which is not the latch. - if (VF.isVector() && Cost->requiresScalarEpilogue()) { + // check ensures that N >= Step. The cases are: + // 1) If there is a non-reversed interleaved group that may speculatively + // access memory out-of-bounds. + // 2) If any instruction may follow a conditionally taken exit. That is, if + // the loop contains multiple exiting blocks, or a single exiting block + // which is not the latch. + if (VF.isVector() && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -3054,18 +3054,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - auto *DstFVTy = cast<FixedVectorType>(DstVTy); - unsigned VF = DstFVTy->getNumElements(); - auto *SrcVecTy = cast<FixedVectorType>(V->getType()); + auto *DstFVTy = cast<FixedVectorType>(DstVTy); + unsigned VF = DstFVTy->getNumElements(); + auto *SrcVecTy = cast<FixedVectorType>(V->getType()); assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); - Type *DstElemTy = DstFVTy->getElementType(); + Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"); // Do a direct cast if element types are castable. if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { - return Builder.CreateBitOrPointerCast(V, DstFVTy); + return Builder.CreateBitOrPointerCast(V, DstFVTy); } // V cannot be directly casted to desired vector type. 
// May happen when V is a floating point vector but DstVTy is a vector of @@ -3079,7 +3079,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); auto *VecIntTy = FixedVectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); - return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); + return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, @@ -3100,11 +3100,11 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) { - Value *Step = - createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); - CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); - } + if (!Cost->foldTailByMasking()) { + Value *Step = + createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); + CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + } // Create new preheader for vector loop. LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -3141,9 +3141,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { if (C->isZero()) return; - assert(!(SCEVCheckBlock->getParent()->hasOptSize() || - (OptForSizeBasedOnProfile && - Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && + assert(!(SCEVCheckBlock->getParent()->hasOptSize() || + (OptForSizeBasedOnProfile && + Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); @@ -3182,7 +3182,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (!RtPtrChecking.Need) return; - if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { + if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -3202,33 +3202,33 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, "vector.ph"); - auto *CondBranch = cast<BranchInst>( - Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); - ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); - LoopBypassBlocks.push_back(MemCheckBlock); - AddedSafetyChecks = true; - + auto *CondBranch = cast<BranchInst>( + Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); + ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); + LoopBypassBlocks.push_back(MemCheckBlock); + AddedSafetyChecks = true; + // Update dominator only if this is first RT check. 
if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, MemCheckBlock); DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); } - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, - RtPtrChecking.getChecks(), RtPtrChecking.getSE()); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); - CondBranch->setCondition(MemRuntimeCheck); + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, + RtPtrChecking.getChecks(), RtPtrChecking.getSE()); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + CondBranch->setCondition(MemRuntimeCheck); // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. - LVer = std::make_unique<LoopVersioning>( - *Legal->getLAI(), - Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, - DT, PSE.getSE()); + LVer = std::make_unique<LoopVersioning>( + *Legal->getLAI(), + Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, + DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -3332,35 +3332,35 @@ Value *InnerLoopVectorizer::emitTransformedIndex( llvm_unreachable("invalid enum"); } -Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { +Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); - LoopExitBlock = OrigLoop->getUniqueExitBlock(); + LoopExitBlock = OrigLoop->getUniqueExitBlock(); assert(LoopExitBlock && "Must have an exit block"); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, Twine(Prefix) + "middle.block"); + LI, nullptr, Twine(Prefix) + "middle.block"); LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, - nullptr, Twine(Prefix) + "scalar.ph"); - - // Set up branch from middle block to the exit and scalar preheader blocks. - // completeLoopSkeleton will update the condition to use an iteration check, - // if required to decide whether to execute the remainder. - BranchInst *BrInst = - BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); - auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); - BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); - ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); - + nullptr, Twine(Prefix) + "scalar.ph"); + + // Set up branch from middle block to the exit and scalar preheader blocks. + // completeLoopSkeleton will update the condition to use an iteration check, + // if required to decide whether to execute the remainder. + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); + // We intentionally don't let SplitBlock to update LoopInfo since // LoopVectorBody should belong to another loop than LoopVectorPreHeader. // LoopVectorBody is explicitly added to the correct place few lines later. 
LoopVectorBody = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - nullptr, nullptr, Twine(Prefix) + "vector.body"); + nullptr, nullptr, Twine(Prefix) + "vector.body"); // Update dominator for loop exit. DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); @@ -3377,16 +3377,16 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(LoopVectorBody, *LI); - return Lp; -} - -void InnerLoopVectorizer::createInductionResumeValues( - Loop *L, Value *VectorTripCount, - std::pair<BasicBlock *, Value *> AdditionalBypass) { - assert(VectorTripCount && L && "Expected valid arguments"); - assert(((AdditionalBypass.first && AdditionalBypass.second) || - (!AdditionalBypass.first && !AdditionalBypass.second)) && - "Inconsistent information about additional bypass."); + return Lp; +} + +void InnerLoopVectorizer::createInductionResumeValues( + Loop *L, Value *VectorTripCount, + std::pair<BasicBlock *, Value *> AdditionalBypass) { + assert(VectorTripCount && L && "Expected valid arguments"); + assert(((AdditionalBypass.first && AdditionalBypass.second) || + (!AdditionalBypass.first && !AdditionalBypass.second)) && + "Inconsistent information about additional bypass."); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3405,31 +3405,31 @@ void InnerLoopVectorizer::createInductionResumeValues( // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; - Value *EndValueFromAdditionalBypass = AdditionalBypass.second; + Value *EndValueFromAdditionalBypass = AdditionalBypass.second; if (OrigPhi == OldInduction) { // We know what the end value is. - EndValue = VectorTripCount; + EndValue = VectorTripCount; } else { - IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(VectorTripCount, true, StepType, true); - Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); + CastInst::getCastOpcode(VectorTripCount, true, StepType, true); + Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); - - // Compute the end value for the additional bypass (if applicable). - if (AdditionalBypass.first) { - B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); - CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, - StepType, true); - CRD = - B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); - EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); - EndValueFromAdditionalBypass->setName("ind.end"); - } + + // Compute the end value for the additional bypass (if applicable). 
+ if (AdditionalBypass.first) { + B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); + CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, + StepType, true); + CRD = + B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); + EndValueFromAdditionalBypass = + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValueFromAdditionalBypass->setName("ind.end"); + } } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. @@ -3440,44 +3440,44 @@ void InnerLoopVectorizer::createInductionResumeValues( // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - - if (AdditionalBypass.first) - BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, - EndValueFromAdditionalBypass); - + + if (AdditionalBypass.first) + BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, + EndValueFromAdditionalBypass); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } -} +} -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, - MDNode *OrigLoopID) { - assert(L && "Expected valid loop."); - - // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(L); - Value *VectorTripCount = getOrCreateVectorTripCount(L); - - auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, + MDNode *OrigLoopID) { + assert(L && "Expected valid loop."); + // The trip counts should be cached by now. + Value *Count = getOrCreateTripCount(L); + Value *VectorTripCount = getOrCreateVectorTripCount(L); + + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. // If tail is to be folded, we know we don't need to run the remainder. if (!Cost->foldTailByMasking()) { - Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - Count, VectorTripCount, "cmp.n", - LoopMiddleBlock->getTerminator()); + Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + Count, VectorTripCount, "cmp.n", + LoopMiddleBlock->getTerminator()); - // Here we use the same DebugLoc as the scalar loop latch terminator instead + // Here we use the same DebugLoc as the scalar loop latch terminator instead // of the corresponding compare because they may have ended up with // different line numbers and we want to avoid awkward line stepping while // debugging. Eg. if the compare has got a line number inside the loop. - CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); - cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); + CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); + cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); } // Get ready to start creating new instructions into the vectorized body. 
- assert(LoopVectorPreHeader == L->getLoopPreheader() && + assert(LoopVectorPreHeader == L->getLoopPreheader() && "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); @@ -3485,7 +3485,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); if (VectorizedLoopID.hasValue()) { - L->setLoopID(VectorizedLoopID.getValue()); + L->setLoopID(VectorizedLoopID.getValue()); // Do not setAlreadyVectorized if loop attributes have been defined // explicitly. @@ -3495,9 +3495,9 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); + L->setLoopID(LID); - LoopVectorizeHints Hints(L, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3508,91 +3508,91 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, return LoopVectorPreHeader; } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { - /* - In this function we generate a new loop. The new loop will contain - the vectorized instructions while the old loop will continue to run the - scalar remainder. - - [ ] <-- loop iteration number check. - / | - / v - | [ ] <-- vector loop bypass (may consist of multiple blocks). - | / | - | / v - || [ ] <-- vector pre header. - |/ | - | v - | [ ] \ - | [ ]_| <-- vector loop. - | | - | v - | -[ ] <--- middle-block. - | / | - | / v - -|- >[ ] <--- new preheader. - | | - | v - | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v - >[ ] <-- exit block. - ... - */ - - // Get the metadata of the original loop before it gets modified. - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - // Create an empty vector loop, and prepare basic blocks for the runtime - // checks. - Loop *Lp = createVectorLoopSkeleton(""); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. This check also covers the case where the - // backedge-taken count is uint##_max: adding one to it will overflow leading - // to an incorrect trip count of zero. In this (rare) case we will also jump - // to the scalar loop. - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); - - // Generate the code to check any assumptions that we've made for SCEV - // expressions. - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - // - // We try to obtain an induction variable from the original loop as hard - // as possible. However if we don't find one that: - // - is an integer - // - counts from zero, stepping by one - // - is the size of the widest induction variable type - // then we create a new one. 
- OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); - Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - - // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp, CountRoundDown); - - return completeLoopSkeleton(Lp, OrigLoopID); -} - +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { + /* + In this function we generate a new loop. The new loop will contain + the vectorized instructions while the old loop will continue to run the + scalar remainder. + + [ ] <-- loop iteration number check. + / | + / v + | [ ] <-- vector loop bypass (may consist of multiple blocks). + | / | + | / v + || [ ] <-- vector pre header. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. + | / | + | / v + -|- >[ ] <--- new preheader. + | | + | v + | [ ] \ + | [ ]_| <-- old scalar loop to handle remainder. + \ | + \ v + >[ ] <-- exit block. + ... + */ + + // Get the metadata of the original loop before it gets modified. + MDNode *OrigLoopID = OrigLoop->getLoopID(); + + // Create an empty vector loop, and prepare basic blocks for the runtime + // checks. + Loop *Lp = createVectorLoopSkeleton(""); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. This check also covers the case where the + // backedge-taken count is uint##_max: adding one to it will overflow leading + // to an incorrect trip count of zero. In this (rare) case we will also jump + // to the scalar loop. + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // Some loops have a single integer induction variable, while other loops + // don't. One example is c++ iterators that often have multiple pointer + // induction variables. In the code below we also support a case where we + // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. + OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). 
+ Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); + Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Emit phis for the new starting index of the scalar loop. + createInductionResumeValues(Lp, CountRoundDown); + + return completeLoopSkeleton(Lp, OrigLoopID); +} + // Fix up external users of the induction variable. At this point, we are // in LCSSA form, with all external PHIs that use the IV having one input value, // coming from the remainder loop. We need those PHIs to also have a correct @@ -3606,7 +3606,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // value (the value that feeds into the phi from the loop latch). // We allow both, but they, obviously, have different values. - assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); + assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); DenseMap<Value *, Value *> MissingVals; @@ -3712,10 +3712,10 @@ static void cse(BasicBlock *BB) { } } -InstructionCost -LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); +InstructionCost +LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; @@ -3726,9 +3726,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // to be vectors, so we need to extract individual elements from there, // execute VF scalar calls, and then gather the result into the vector return // value. - InstructionCost ScalarCallCost = - TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); - if (VF.isScalar()) + InstructionCost ScalarCallCost = + TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); + if (VF.isScalar()) return ScalarCallCost; // Compute corresponding vector type for return value and arguments. @@ -3738,33 +3738,33 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); + InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); - InstructionCost Cost = - ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; + InstructionCost Cost = + ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. NeedToScalarize = true; - VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); + VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); if (!TLI || CI->isNoBuiltin() || !VecFunc) return Cost; // If the corresponding vector cost is cheaper, return its cost. 
- InstructionCost VectorCallCost = - TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); + InstructionCost VectorCallCost = + TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); if (VectorCallCost < Cost) { NeedToScalarize = false; - Cost = VectorCallCost; + Cost = VectorCallCost; } return Cost; } -InstructionCost -LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, + ElementCount VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -3804,8 +3804,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), KV.second); auto *TruncatedTy = FixedVectorType::get( - ScalarTruncatedTy, - cast<FixedVectorType>(OriginalTy)->getNumElements()); + ScalarTruncatedTy, + cast<FixedVectorType>(OriginalTy)->getNumElements()); if (TruncatedTy == OriginalTy) continue; @@ -3855,13 +3855,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { break; } } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { - auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) - ->getNumElements(); + auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( SI->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) - ->getNumElements(); + auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) + ->getNumElements(); auto *O1 = B.CreateZExtOrTrunc( SI->getOperand(1), FixedVectorType::get(ScalarTruncatedTy, Elements1)); @@ -3871,16 +3871,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { - auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) - ->getNumElements(); + auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( IE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { - auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) - ->getNumElements(); + auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( EE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); @@ -3922,7 +3922,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. - if (VF.isVector()) + if (VF.isVector()) truncateToMinimalBitwidths(); // Fix widened non-induction PHIs by setting up the PHI operands. @@ -3963,13 +3963,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. - // - // For scalable vectorization we can't know at compile time how many iterations - // of the loop are handled in one vector iteration, so instead assume a pessimistic - // vscale of '1'. 
- setProfileInfoAfterUnrolling( - LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); + // + // For scalable vectorization we can't know at compile time how many iterations + // of the loop are handled in one vector iteration, so instead assume a pessimistic + // vscale of '1'. + setProfileInfoAfterUnrolling( + LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), + LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -4048,12 +4048,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Create a vector from the initial value. auto *VectorInit = ScalarInit; - if (VF.isVector()) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - assert(!VF.isScalable() && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorInit = Builder.CreateInsertElement( - PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, - Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); + PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, + Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -4094,11 +4094,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - assert(!VF.isScalable()); - SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); - ShuffleMask[0] = VF.getKnownMinValue() - 1; - for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) - ShuffleMask[I] = I + VF.getKnownMinValue() - 1; + assert(!VF.isScalable()); + SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); + ShuffleMask[0] = VF.getKnownMinValue() - 1; + for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) + ShuffleMask[I] = I + VF.getKnownMinValue() - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -4108,10 +4108,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = - VF.isVector() - ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) - : Incoming; + auto *Shuffle = + VF.isVector() + ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast<Instruction>(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -4124,11 +4124,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. 
auto *ExtractForScalar = Incoming; - if (VF.isVector()) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), - "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), + "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -4136,10 +4136,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // will be the value when jumping to the exit block from the LoopMiddleBlock, // when the scalar loop is not run at all. Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF.isVector()) + if (VF.isVector()) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), - "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), + "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -4163,13 +4163,13 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // vector recurrence we extracted in the middle block. Since the loop is in // LCSSA form, we just need to find all the phi nodes for the original scalar // recurrence in the exit block, and then add an edge for the middle block. - // Note that LCSSA does not imply single entry when the original scalar loop - // had multiple exiting edges (as we always run the last iteration in the - // scalar epilogue); in that case, the exiting path through middle will be - // dynamically dead and the value picked for the phi doesn't matter. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (any_of(LCSSAPhi.incoming_values(), - [Phi](Value *V) { return V == Phi; })) + // Note that LCSSA does not imply single entry when the original scalar loop + // had multiple exiting edges (as we always run the last iteration in the + // scalar epilogue); in that case, the exiting path through middle will be + // dynamically dead and the value picked for the phi doesn't matter. + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + if (any_of(LCSSAPhi.incoming_values(), + [Phi](Value *V) { return V == Phi; })) LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); } @@ -4179,11 +4179,11 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; - RecurKind RK = RdxDesc.getRecurrenceKind(); + RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); setDebugLocFromInst(Builder, ReductionStartValue); - bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); + bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); // This is the vector-clone of the value that leaves the loop. Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); @@ -4215,9 +4215,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, - // instead of the former. 
For an inloop reduction the reduction will already - // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { + // instead of the former. For an inloop reduction the reduction will already + // be predicated, and does not need to be handled here. + if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); @@ -4231,31 +4231,31 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } assert(Sel && "Reduction exit feeds no select"); VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); - - // If the target can create a predicated operator for the reduction at no - // extra cost in the loop (for example a predicated vadd), it can be - // cheaper for the select to remain in the loop than be sunk out of it, - // and so use the select value for the phi instead of the old - // LoopExitValue. - RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; - if (PreferPredicatedReductionSelect || - TTI->preferPredicatedReductionSelect( - RdxDesc.getOpcode(), Phi->getType(), - TargetTransformInfo::ReductionFlags())) { - auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); - VecRdxPhi->setIncomingValueForBlock( - LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); - } + + // If the target can create a predicated operator for the reduction at no + // extra cost in the loop (for example a predicated vadd), it can be + // cheaper for the select to remain in the loop than be sunk out of it, + // and so use the select value for the phi instead of the old + // LoopExitValue. + RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; + if (PreferPredicatedReductionSelect || + TTI->preferPredicatedReductionSelect( + RdxDesc.getOpcode(), Phi->getType(), + TargetTransformInfo::ReductionFlags())) { + auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); + VecRdxPhi->setIncomingValueForBlock( + LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); + } } } // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. - if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { - assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); - assert(!VF.isScalable() && "scalable vectors not yet supported."); - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { + assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -4282,7 +4282,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); - unsigned Op = RecurrenceDescriptor::getOpcode(RK); + unsigned Op = RecurrenceDescriptor::getOpcode(RK); // The middle block terminator has already been assigned a DebugLoc here (the // OrigLoop's single latch terminator). 
We want the whole middle block to @@ -4301,14 +4301,14 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { ReducedPartRdx, "bin.rdx"), RdxDesc.getFastMathFlags()); else - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } - // Create the reduction after the loop. Note that inloop reductions create the - // target reduction in the loop using a Reduction recipe. - if (VF.isVector() && !IsInLoopReductionPhi) { + // Create the reduction after the loop. Note that inloop reductions create the + // target reduction in the loop using a Reduction recipe. + if (VF.isVector() && !IsInLoopReductionPhi) { ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (Phi->getType() != RdxDesc.getRecurrenceType()) @@ -4329,16 +4329,16 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. - // We know that the loop is in LCSSA form. We need to update the PHI nodes - // in the exit blocks. See comment on analogous loop in - // fixFirstOrderRecurrence for a more complete explaination of the logic. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (any_of(LCSSAPhi.incoming_values(), - [LoopExitInst](Value *V) { return V == LoopExitInst; })) + // We know that the loop is in LCSSA form. We need to update the PHI nodes + // in the exit blocks. See comment on analogous loop in + // fixFirstOrderRecurrence for a more complete explaination of the logic. + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + if (any_of(LCSSAPhi.incoming_values(), + [LoopExitInst](Value *V) { return V == LoopExitInst; })) LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. int IncomingEdgeBlockIdx = Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); @@ -4350,8 +4350,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { void InnerLoopVectorizer::clearReductionWrapFlags( RecurrenceDescriptor &RdxDesc) { - RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RK != RecurKind::Add && RK != RecurKind::Mul) + RecurKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurKind::Add && RK != RecurKind::Mul) return; Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); @@ -4380,27 +4380,27 @@ void InnerLoopVectorizer::clearReductionWrapFlags( void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) - // Some phis were already hand updated by the reduction and recurrence - // code above, leave them alone. - continue; - - auto *IncomingValue = LCSSAPhi.getIncomingValue(0); - // Non-instruction incoming values will have only one value. - unsigned LastLane = 0; - if (isa<Instruction>(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast<Instruction>(IncomingValue), VF) - ? 
0 - : VF.getKnownMinValue() - 1; - assert((!VF.isScalable() || LastLane == 0) && - "scalable vectors dont support non-uniform scalars yet"); - // Can be a loop invariant incoming value or the last scalar value to be - // extracted from the vectorized loop. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Value *lastIncomingValue = - getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); - LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); + if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) + // Some phis were already hand updated by the reduction and recurrence + // code above, leave them alone. + continue; + + auto *IncomingValue = LCSSAPhi.getIncomingValue(0); + // Non-instruction incoming values will have only one value. + unsigned LastLane = 0; + if (isa<Instruction>(IncomingValue)) + LastLane = Cost->isUniformAfterVectorization( + cast<Instruction>(IncomingValue), VF) + ? 0 + : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "scalable vectors dont support non-uniform scalars yet"); + // Can be a loop invariant incoming value or the last scalar value to be + // extracted from the vectorized loop. + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + Value *lastIncomingValue = + getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); + LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } } @@ -4504,9 +4504,9 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, - VPUser &Operands, unsigned UF, - ElementCount VF, bool IsPtrLoopInvariant, +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, + VPUser &Operands, unsigned UF, + ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State) { // Construct a vector GEP by widening the operands of the scalar GEP as @@ -4515,7 +4515,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. Thus, to ensure we @@ -4531,7 +4531,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, auto *Clone = Builder.Insert(GEP->clone()); for (unsigned Part = 0; Part < UF; ++Part) { Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - State.set(VPDef, GEP, EntryPart, Part); + State.set(VPDef, GEP, EntryPart, Part); addMetadata(EntryPart, GEP); } } else { @@ -4566,19 +4566,19 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, ? 
Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, Indices) : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && + assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); - State.set(VPDef, GEP, NewGEP, Part); + State.set(VPDef, GEP, NewGEP, Part); addMetadata(NewGEP, GEP); } } } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - RecurrenceDescriptor *RdxDesc, - Value *StartV, unsigned UF, - ElementCount VF) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, + RecurrenceDescriptor *RdxDesc, + Value *StartV, unsigned UF, + ElementCount VF) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); PHINode *P = cast<PHINode>(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction @@ -4586,7 +4586,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. Type *VecTy = - (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); + (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4601,60 +4601,60 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. - if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { - Value *Iden = nullptr; - bool ScalarPHI = - (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); - Type *VecTy = - ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); - - if (RdxDesc) { - assert(Legal->isReductionVariable(P) && StartV && - "RdxDesc should only be set for reduction variables; in that case " - "a StartV is also required"); - RecurKind RK = RdxDesc->getRecurrenceKind(); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { - // MinMax reduction have the start value as their identify. - if (ScalarPHI) { - Iden = StartV; - } else { - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); - } - } else { - Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType()); - Iden = IdenC; - - if (!ScalarPHI) { - Iden = ConstantVector::getSplat(VF, IdenC); - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - Constant *Zero = Builder.getInt32(0); - StartV = Builder.CreateInsertElement(Iden, StartV, Zero); - } - } - } - + if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { + Value *Iden = nullptr; + bool ScalarPHI = + (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); + Type *VecTy = + ScalarPHI ? 
PN->getType() : VectorType::get(PN->getType(), VF); + + if (RdxDesc) { + assert(Legal->isReductionVariable(P) && StartV && + "RdxDesc should only be set for reduction variables; in that case " + "a StartV is also required"); + RecurKind RK = RdxDesc->getRecurrenceKind(); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { + // MinMax reduction have the start value as their identify. + if (ScalarPHI) { + Iden = StartV; + } else { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); + } + } else { + Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( + RK, VecTy->getScalarType()); + Iden = IdenC; + + if (!ScalarPHI) { + Iden = ConstantVector::getSplat(VF, IdenC); + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + Constant *Zero = Builder.getInt32(0); + StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + } + } + } + for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); - if (StartV) { - // Make sure to add the reduction start value only to the - // first unroll part. - Value *StartVal = (Part == 0) ? StartV : Iden; - cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); - } + if (StartV) { + // Make sure to add the reduction start value only to the + // first unroll part. + Value *StartVal = (Part == 0) ? StartV : Iden; + cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); + } } return; } - assert(!Legal->isReductionVariable(P) && - "reductions should be handled above"); - + assert(!Legal->isReductionVariable(P) && + "reductions should be handled above"); + setDebugLocFromInst(Builder, P); // This PHINode must be an induction variable. @@ -4675,74 +4675,74 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); - - if (Cost->isScalarAfterVectorization(P, VF)) { - // This is the normalized GEP that starts counting at zero. - Value *PtrInd = - Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - unsigned Lanes = - Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); - for (unsigned Part = 0; Part < UF; ++Part) { - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = ConstantInt::get(PtrInd->getType(), - Lane + Part * VF.getKnownMinValue()); - Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); - SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); - } + + if (Cost->isScalarAfterVectorization(P, VF)) { + // This is the normalized GEP that starts counting at zero. + Value *PtrInd = + Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. 
Otherwise, we generate all VF values. + unsigned Lanes = + Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); + for (unsigned Part = 0; Part < UF; ++Part) { + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Constant *Idx = ConstantInt::get(PtrInd->getType(), + Lane + Part * VF.getKnownMinValue()); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); + Value *SclrGep = + emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + SclrGep->setName("next.gep"); + VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); + } } - return; - } - assert(isa<SCEVConstant>(II.getStep()) && - "Induction step not a SCEV constant!"); - Type *PhiType = II.getStep()->getType(); - - // Build a pointer phi - Value *ScalarStartValue = II.getStartValue(); - Type *ScStValueType = ScalarStartValue->getType(); - PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); - NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); - - // A pointer induction, performed by using a gep - BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - Instruction *InductionLoc = LoopLatch->getTerminator(); - const SCEV *ScalarStep = II.getStep(); - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Value *ScalarStepValue = - Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); - Value *InductionGEP = GetElementPtrInst::Create( - ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul( - ScalarStepValue, - ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), - "ptr.ind", InductionLoc); - NewPointerPhi->addIncoming(InductionGEP, LoopLatch); - - // Create UF many actual address geps that use the pointer - // phi as base and a vectorized version of the step value - // (<step*0, ..., step*N>) as offset. - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Constant *, 8> Indices; - // Create a vector of consecutive numbers from zero to VF. 
- for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) - Indices.push_back( - ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); - Constant *StartOffset = ConstantVector::get(Indices); - - Value *GEP = Builder.CreateGEP( - ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul( - StartOffset, - Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), - "vector.gep")); - VectorLoopValueMap.setVectorValue(P, Part, GEP); + return; } + assert(isa<SCEVConstant>(II.getStep()) && + "Induction step not a SCEV constant!"); + Type *PhiType = II.getStep()->getType(); + + // Build a pointer phi + Value *ScalarStartValue = II.getStartValue(); + Type *ScStValueType = ScalarStartValue->getType(); + PHINode *NewPointerPhi = + PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); + NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); + + // A pointer induction, performed by using a gep + BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); + Instruction *InductionLoc = LoopLatch->getTerminator(); + const SCEV *ScalarStep = II.getStep(); + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + Value *ScalarStepValue = + Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); + Value *InductionGEP = GetElementPtrInst::Create( + ScStValueType->getPointerElementType(), NewPointerPhi, + Builder.CreateMul( + ScalarStepValue, + ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), + "ptr.ind", InductionLoc); + NewPointerPhi->addIncoming(InductionGEP, LoopLatch); + + // Create UF many actual address geps that use the pointer + // phi as base and a vectorized version of the step value + // (<step*0, ..., step*N>) as offset. + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Constant *, 8> Indices; + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + Indices.push_back( + ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); + Constant *StartOffset = ConstantVector::get(Indices); + + Value *GEP = Builder.CreateGEP( + ScStValueType->getPointerElementType(), NewPointerPhi, + Builder.CreateMul( + StartOffset, + Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), + "vector.gep")); + VectorLoopValueMap.setVectorValue(P, Part, GEP); + } } } } @@ -4765,8 +4765,8 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, - VPUser &User, +void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, + VPUser &User, VPTransformState &State) { switch (I.getOpcode()) { case Instruction::Call: @@ -4808,7 +4808,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, VecOp->copyIRFlags(&I); // Use this vector value for all users of the original instruction. - State.set(Def, &I, V, Part); + State.set(Def, &I, V, Part); addMetadata(V, &I); } @@ -4832,7 +4832,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, } else { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } - State.set(Def, &I, C, Part); + State.set(Def, &I, C, Part); addMetadata(C, &I); } @@ -4856,12 +4856,12 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, /// Vectorize casts. Type *DestTy = - (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); + (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(Def, &I, Cast, Part); + State.set(Def, &I, Cast, Part); addMetadata(Cast, &I); } break; @@ -4873,8 +4873,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, } // end of switch. } -void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, - VPUser &ArgOperands, +void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, + VPUser &ArgOperands, VPTransformState &State) { assert(!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); @@ -4885,7 +4885,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, SmallVector<Type *, 4> Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4893,13 +4893,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize = false; - InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); - InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; - bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; + InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; + bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; assert((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."); - assert(IntrinsicCost.isValid() && CallCost.isValid() && - "Cannot have invalid costs while widening"); + assert(IntrinsicCost.isValid() && CallCost.isValid() && + "Cannot have invalid costs while widening"); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; @@ -4918,15 +4918,15 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, if (UseVectorIntrinsic) { // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; - if (VF.isVector()) { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - } + if (VF.isVector()) { + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + } VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { // Use vector version of the function call. 
- const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); + const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); #ifndef NDEBUG assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && "Can't create vector function."); @@ -4940,12 +4940,12 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, if (isa<FPMathOperator>(V)) V->copyFastMathFlags(CI); - State.set(Def, &I, V, Part); + State.set(Def, &I, V, Part); addMetadata(V, &I); } } -void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, +void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State) { @@ -4964,16 +4964,16 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, Value *Op0 = State.get(Operands.getOperand(1), Part); Value *Op1 = State.get(Operands.getOperand(2), Part); Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - State.set(VPDef, &I, Sel, Part); + State.set(VPDef, &I, Sel, Part); addMetadata(Sel, &I); } } -void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. - assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); SmallSetVector<Instruction *, 8> Worklist; @@ -4982,7 +4982,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // accesses that will remain scalar. SmallSetVector<Instruction *, 8> ScalarPtrs; SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; - auto *Latch = TheLoop->getLoopLatch(); + auto *Latch = TheLoop->getLoopLatch(); // A helper that returns true if the use of Ptr by MemAccess will be scalar. // The pointer operands of loads and stores will be scalar as long as the @@ -5008,33 +5008,33 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { !TheLoop->isLoopInvariant(V); }; - auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { - if (!isa<PHINode>(Ptr) || - !Legal->getInductionVars().count(cast<PHINode>(Ptr))) - return false; - auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; - if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) - return false; - return isScalarUse(MemAccess, Ptr); - }; - - // A helper that evaluates a memory access's use of a pointer. If the - // pointer is actually the pointer induction of a loop, it is being - // inserted into Worklist. If the use will be a scalar use, and the - // pointer is only used by memory accesses, we place the pointer in - // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. + auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { + if (!isa<PHINode>(Ptr) || + !Legal->getInductionVars().count(cast<PHINode>(Ptr))) + return false; + auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; + if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) + return false; + return isScalarUse(MemAccess, Ptr); + }; + + // A helper that evaluates a memory access's use of a pointer. If the + // pointer is actually the pointer induction of a loop, it is being + // inserted into Worklist. 
If the use will be a scalar use, and the + // pointer is only used by memory accesses, we place the pointer in + // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - if (isScalarPtrInduction(MemAccess, Ptr)) { - Worklist.insert(cast<Instruction>(Ptr)); - Instruction *Update = cast<Instruction>( - cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); - Worklist.insert(Update); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr - << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update - << "\n"); - return; - } + if (isScalarPtrInduction(MemAccess, Ptr)) { + Worklist.insert(cast<Instruction>(Ptr)); + Instruction *Update = cast<Instruction>( + cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); + Worklist.insert(Update); + LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr + << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update + << "\n"); + return; + } // We only care about bitcast and getelementptr instructions contained in // the loop. if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -5058,9 +5058,9 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { }; // We seed the scalars analysis with three classes of instructions: (1) - // instructions marked uniform-after-vectorization and (2) bitcast, - // getelementptr and (pointer) phi instructions used by memory accesses - // requiring a scalar use. + // instructions marked uniform-after-vectorization and (2) bitcast, + // getelementptr and (pointer) phi instructions used by memory accesses + // requiring a scalar use. // // (1) Add to the worklist all instructions that have been identified as // uniform-after-vectorization. @@ -5156,8 +5156,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, - ElementCount VF) { +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, + ElementCount VF) { if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -5171,7 +5171,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, auto *Ty = getMemInstValueType(I); // We have already decided how to vectorize this instruction, get that // result. - if (VF.isVector()) { + if (VF.isVector()) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -5192,8 +5192,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, return false; } -bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( - Instruction *I, ElementCount VF) { +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( + Instruction *I, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); @@ -5204,7 +5204,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = getMemInstValueType(I); - if (hasIrregularType(ScalarTy, DL)) + if (hasIrregularType(ScalarTy, DL)) return false; // Check if masking is required. 
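The collectLoopScalars code above seeds its worklist with scalar pointer inductions and then sorts the remaining pointer operands into two buckets: ScalarPtrs for pointers whose uses are all memory accesses that keep a scalar address, and PossibleNonScalarPtrs for everything else (the real pass records this per use and reconciles the two sets afterwards). A rough, self-contained sketch of that bucketing over a toy data model; the struct and field names are invented:

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

// A toy use of a pointer: either the address operand of a memory access that
// keeps a scalar address, or some other kind of use.
struct PtrUse {
  bool IsMemAddress;  // used as the address of a load/store
  bool ScalarAddress; // that access was decided to keep a scalar address
};

struct PtrValue {
  std::string Name;
  std::vector<PtrUse> Uses;
};

int main() {
  std::vector<PtrValue> Ptrs = {
      {"gep.a", {{true, true}, {true, true}}},   // only scalar address uses
      {"gep.b", {{true, false}}},                // feeds a widened access
      {"gep.c", {{true, true}, {false, false}}}, // also escapes to arithmetic
  };

  std::set<std::string> ScalarPtrs, PossibleNonScalarPtrs;
  for (const PtrValue &P : Ptrs) {
    bool OnlyScalarMemUses =
        std::all_of(P.Uses.begin(), P.Uses.end(), [](const PtrUse &U) {
          return U.IsMemAddress && U.ScalarAddress;
        });
    (OnlyScalarMemUses ? ScalarPtrs : PossibleNonScalarPtrs).insert(P.Name);
  }

  for (const std::string &N : ScalarPtrs)
    std::cout << N << " -> ScalarPtrs\n";
  for (const std::string &N : PossibleNonScalarPtrs)
    std::cout << N << " -> PossibleNonScalarPtrs\n";
}
// --- end sketch ---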
@@ -5229,8 +5229,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( : TTI.isLegalMaskedStore(Ty, Alignment); } -bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( - Instruction *I, ElementCount VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( + Instruction *I, ElementCount VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast<LoadInst>(I); StoreInst *SI = dyn_cast<StoreInst>(I); @@ -5251,19 +5251,19 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); - if (hasIrregularType(ScalarTy, DL)) + if (hasIrregularType(ScalarTy, DL)) return false; return true; } -void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -5289,11 +5289,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // replicating region where only a single instance out of VF should be formed. // TODO: optimize such seldom cases if found important, see PR40816. auto addToWorklistIfAllowed = [&](Instruction *I) -> void { - if (isOutOfScope(I)) { - LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " - << *I << "\n"); - return; - } + if (isOutOfScope(I)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " + << *I << "\n"); + return; + } if (isScalarWithPredication(I, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); @@ -5310,71 +5310,71 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) addToWorklistIfAllowed(Cmp); - auto isUniformDecision = [&](Instruction *I, ElementCount VF) { + auto isUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); - // A uniform memory op is itself uniform. We exclude uniform stores - // here as they demand the last lane, not the first one. - if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { - assert(WideningDecision == CM_Scalarize); - return true; - } - + // A uniform memory op is itself uniform. We exclude uniform stores + // here as they demand the last lane, not the first one. + if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { + assert(WideningDecision == CM_Scalarize); + return true; + } + return (WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || WideningDecision == CM_Interleave); }; - - - // Returns true if Ptr is the pointer operand of a memory access instruction - // I, and I is known to not require scalarization. 
- auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); - }; - - // Holds a list of values which are known to have at least one uniform use. - // Note that there may be other uses which aren't uniform. A "uniform use" - // here is something which only demands lane 0 of the unrolled iterations; - // it does not imply that all lanes produce the same value (e.g. this is not - // the usual meaning of uniform) - SmallPtrSet<Value *, 8> HasUniformUse; - - // Scan the loop for instructions which are either a) known to have only - // lane 0 demanded or b) are uses which demand only lane 0 of their operand. + + + // Returns true if Ptr is the pointer operand of a memory access instruction + // I, and I is known to not require scalarization. + auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { + return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); + }; + + // Holds a list of values which are known to have at least one uniform use. + // Note that there may be other uses which aren't uniform. A "uniform use" + // here is something which only demands lane 0 of the unrolled iterations; + // it does not imply that all lanes produce the same value (e.g. this is not + // the usual meaning of uniform) + SmallPtrSet<Value *, 8> HasUniformUse; + + // Scan the loop for instructions which are either a) known to have only + // lane 0 demanded or b) are uses which demand only lane 0 of their operand. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { // If there's no pointer operand, there's nothing to do. - auto *Ptr = getLoadStorePointerOperand(&I); + auto *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) continue; - // A uniform memory op is itself uniform. We exclude uniform stores - // here as they demand the last lane, not the first one. - if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) - addToWorklistIfAllowed(&I); + // A uniform memory op is itself uniform. We exclude uniform stores + // here as they demand the last lane, not the first one. + if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) + addToWorklistIfAllowed(&I); - if (isUniformDecision(&I, VF)) { - assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); - HasUniformUse.insert(Ptr); - } + if (isUniformDecision(&I, VF)) { + assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); + HasUniformUse.insert(Ptr); + } } - // Add to the worklist any operands which have *only* uniform (e.g. lane 0 - // demanding) users. Since loops are assumed to be in LCSSA form, this - // disallows uses outside the loop as well. - for (auto *V : HasUniformUse) { - if (isOutOfScope(V)) - continue; - auto *I = cast<Instruction>(V); - auto UsersAreMemAccesses = - llvm::all_of(I->users(), [&](User *U) -> bool { - return isVectorizedMemAccessUse(cast<Instruction>(U), V); - }); - if (UsersAreMemAccesses) - addToWorklistIfAllowed(I); - } + // Add to the worklist any operands which have *only* uniform (e.g. lane 0 + // demanding) users. Since loops are assumed to be in LCSSA form, this + // disallows uses outside the loop as well. 
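The re-indented lines above build HasUniformUse, the set of pointers that have at least one widened memory access demanding only lane 0 of the address; the loop re-added just below then promotes such a value to uniform when every one of its users is a vectorized memory access using it as the address. A compilable toy version of that "all users are vectorized address uses" filter (the data model is invented, not VPlan or LLVM IR):

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// A toy user of an address value: true if it is a memory access the cost
// model already decided to widen, with this value as its pointer operand.
struct AddrUser {
  bool IsWidenedMemAccessOfThisPtr;
};

struct UniformCandidate {
  std::string Name;
  std::vector<AddrUser> Users;
};

int main() {
  std::vector<UniformCandidate> HasUniformUse = {
      {"%base",   {{true}, {true}}},  // every user is a widened access
      {"%offset", {{true}, {false}}}, // also feeds per-lane arithmetic
  };

  for (const UniformCandidate &C : HasUniformUse) {
    // Mirrors the UsersAreMemAccesses check above.
    bool AllAddressUses =
        std::all_of(C.Users.begin(), C.Users.end(), [](const AddrUser &U) {
          return U.IsWidenedMemAccessOfThisPtr;
        });
    std::cout << C.Name
              << (AllAddressUses ? " -> uniform worklist\n" : " -> stays per-lane\n");
  }
}
// --- end sketch ---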
+ for (auto *V : HasUniformUse) { + if (isOutOfScope(V)) + continue; + auto *I = cast<Instruction>(V); + auto UsersAreMemAccesses = + llvm::all_of(I->users(), [&](User *U) -> bool { + return isVectorizedMemAccessUse(cast<Instruction>(U), V); + }); + if (UsersAreMemAccesses) + addToWorklistIfAllowed(I); + } // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures @@ -5397,7 +5397,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { auto *OI = cast<Instruction>(OV); if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); - return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); + return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); })) addToWorklistIfAllowed(OI); } @@ -5475,8 +5475,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -Optional<ElementCount> -LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { +Optional<ElementCount> +LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -5498,9 +5498,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF); - case CM_ScalarEpilogueNotAllowedUsePredicate: - LLVM_FALLTHROUGH; + return computeFeasibleMaxVF(TC, UserVF); + case CM_ScalarEpilogueNotAllowedUsePredicate: + LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" @@ -5521,26 +5521,26 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // for size. if (runtimeChecksRequired()) return None; - + break; } - // The only loops we can vectorize without a scalar epilogue, are loops with - // a bottom-test and a single exiting block. We'd have to handle the fact - // that not every instruction executes on the last iteration. This will - // require a lane mask which varies through the vector loop body. (TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF); - } - return None; - } - + // The only loops we can vectorize without a scalar epilogue, are loops with + // a bottom-test and a single exiting block. We'd have to handle the fact + // that not every instruction executes on the last iteration. This will + // require a lane mask which varies through the vector loop body. (TODO) + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. 
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return computeFeasibleMaxVF(TC, UserVF); + } + return None; + } + // Now try the tail folding // Invalidate interleave groups that require an epilogue if we can't mask @@ -5553,22 +5553,22 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); - assert(!MaxVF.isScalable() && - "Scalable vectors do not yet support tail folding"); - assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && - "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = - UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); - // Avoid tail folding if the trip count is known to be a multiple of any VF we - // chose. - ScalarEvolution *SE = PSE.getSE(); - const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - const SCEV *ExitCount = SE->getAddExpr( - BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const SCEV *Rem = SE->getURemExpr( - ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); - if (Rem->isZero()) { + ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); + assert(!MaxVF.isScalable() && + "Scalable vectors do not yet support tail folding"); + assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && + "MaxVF must be a power of 2"); + unsigned MaxVFtimesIC = + UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); + // Avoid tail folding if the trip count is known to be a multiple of any VF we + // chose. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *Rem = SE->getURemExpr( + ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + if (Rem->isZero()) { // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; @@ -5583,20 +5583,20 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return MaxVF; } - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; - } - - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); - return None; - } - + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. 
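The lines above decide whether any tail folding is needed at all: MaxVFtimesIC is the number of scalar iterations covered by one unrolled vector iteration, and SCEV is asked whether the exit count (backedge-taken count plus one) is an exact multiple of it; a zero remainder means no tail remains. The same arithmetic on plain integers, assuming a compile-time-known trip count (the real check works symbolically through ScalarEvolution):

// --- illustrative sketch, not part of the commit ---
#include <iostream>

// No tail remains if the trip count is a multiple of VF * IC.
bool noTailRemains(unsigned TripCount, unsigned MaxVF, unsigned UserIC) {
  unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  return TripCount % MaxVFtimesIC == 0;
}

int main() {
  // Trip count 128, VF 8, IC 2: 128 % 16 == 0, no tail, no folding needed.
  std::cout << noTailRemains(128, 8, 2) << "\n"; // 1
  // Trip count 100, VF 8, IC 2: 100 % 16 != 0, a tail remains.
  std::cout << noTailRemains(100, 8, 2) << "\n"; // 0
}
// --- end sketch ---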
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return MaxVF; + } + + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); + return None; + } + if (TC == 0) { reportVectorizationFailure( "Unable to calculate the loop count due to complex control flow", @@ -5614,33 +5614,33 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return None; } -ElementCount -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { - bool IgnoreScalableUserVF = UserVF.isScalable() && - !TTI.supportsScalableVectors() && - !ForceTargetSupportsScalableVectors; - if (IgnoreScalableUserVF) { - LLVM_DEBUG( - dbgs() << "LV: Ignoring VF=" << UserVF - << " because target does not support scalable vectors.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Ignoring VF=" << ore::NV("UserVF", UserVF) - << " because target does not support scalable vectors."; - }); - } - - // Beyond this point two scenarios are handled. If UserVF isn't specified - // then a suitable VF is chosen. If UserVF is specified and there are - // dependencies, check if it's legal. However, if a UserVF is specified and - // there are no dependencies, then there's nothing to do. - if (UserVF.isNonZero() && !IgnoreScalableUserVF && - Legal->isSafeForAnyVectorWidth()) - return UserVF; - +ElementCount +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF) { + bool IgnoreScalableUserVF = UserVF.isScalable() && + !TTI.supportsScalableVectors() && + !ForceTargetSupportsScalableVectors; + if (IgnoreScalableUserVF) { + LLVM_DEBUG( + dbgs() << "LV: Ignoring VF=" << UserVF + << " because target does not support scalable vectors.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Ignoring VF=" << ore::NV("UserVF", UserVF) + << " because target does not support scalable vectors."; + }); + } + + // Beyond this point two scenarios are handled. If UserVF isn't specified + // then a suitable VF is chosen. If UserVF is specified and there are + // dependencies, check if it's legal. However, if a UserVF is specified and + // there are no dependencies, then there's nothing to do. + if (UserVF.isNonZero() && !IgnoreScalableUserVF && + Legal->isSafeForAnyVectorWidth()) + return UserVF; + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5650,63 +5650,63 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest // dependence distance). - unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); - - // If the user vectorization factor is legally unsafe, clamp it to a safe - // value. Otherwise, return as is. 
- if (UserVF.isNonZero() && !IgnoreScalableUserVF) { - unsigned MaxSafeElements = - PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); - ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); - - if (UserVF.isScalable()) { - Optional<unsigned> MaxVScale = TTI.getMaxVScale(); - - // Scale VF by vscale before checking if it's safe. - MaxSafeVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); - - if (MaxSafeVF.isZero()) { - // The dependence distance is too small to use scalable vectors, - // fallback on fixed. - LLVM_DEBUG( - dbgs() - << "LV: Max legal vector width too small, scalable vectorization " - "unfeasible. Using fixed-width vectorization instead.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Max legal vector width too small, scalable vectorization " - << "unfeasible. Using fixed-width vectorization instead."; - }); - return computeFeasibleMaxVF( - ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); - } - } - - LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); - - if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) - return UserVF; - - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe, clamping to max safe VF=" << MaxSafeVF - << ".\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe, clamping to maximum safe vectorization factor " - << ore::NV("VectorizationFactor", MaxSafeVF); - }); - return MaxSafeVF; - } - - WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); - + unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); + + // If the user vectorization factor is legally unsafe, clamp it to a safe + // value. Otherwise, return as is. + if (UserVF.isNonZero() && !IgnoreScalableUserVF) { + unsigned MaxSafeElements = + PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); + ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); + + if (UserVF.isScalable()) { + Optional<unsigned> MaxVScale = TTI.getMaxVScale(); + + // Scale VF by vscale before checking if it's safe. + MaxSafeVF = ElementCount::getScalable( + MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); + + if (MaxSafeVF.isZero()) { + // The dependence distance is too small to use scalable vectors, + // fallback on fixed. + LLVM_DEBUG( + dbgs() + << "LV: Max legal vector width too small, scalable vectorization " + "unfeasible. Using fixed-width vectorization instead.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Max legal vector width too small, scalable vectorization " + << "unfeasible. 
Using fixed-width vectorization instead."; + }); + return computeFeasibleMaxVF( + ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); + } + } + + LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); + + if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) + return UserVF; + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" << MaxSafeVF + << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeVF); + }); + return MaxSafeVF; + } + + WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); + // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); @@ -5716,13 +5716,13 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister << " bits.\n"); - assert(MaxVectorSize <= WidestRegister && - "Did not expect to pack so many elements" - " into one vector!"); + assert(MaxVectorSize <= WidestRegister && + "Did not expect to pack so many elements" + " into one vector!"); if (MaxVectorSize == 0) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; - return ElementCount::getFixed(MaxVectorSize); + return ElementCount::getFixed(MaxVectorSize); } else if (ConstTripCount && ConstTripCount < MaxVectorSize && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in @@ -5730,7 +5730,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); MaxVectorSize = ConstTripCount; - return ElementCount::getFixed(MaxVectorSize); + return ElementCount::getFixed(MaxVectorSize); } unsigned MaxVF = MaxVectorSize; @@ -5738,10 +5738,10 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). - SmallVector<ElementCount, 8> VFs; + SmallVector<ElementCount, 8> VFs; unsigned NewMaxVectorSize = WidestRegister / SmallestType; for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(ElementCount::getFixed(VS)); + VFs.push_back(ElementCount::getFixed(VS)); // For each VF calculate its register usage. 
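computeFeasibleMaxVF, restored across the hunks above, works from two limits: the maximum safe vector width in bits implied by memory dependences and the widest register the target provides. Both are divided by the widest element type and rounded down to a power of two; a fixed user-requested VF is clamped to the safe element count, and the resulting MaxVF can be clamped further to a small power-of-two constant trip count. A sketch of that arithmetic with invented numbers (the real code returns from whichever branch applies; both results are computed here just to show the formulas):

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>

// Round down to a power of two (what PowerOf2Floor does in LLVM).
unsigned powerOf2Floor(unsigned X) {
  if (X == 0) return 0;
  unsigned P = 1;
  while (P <= X / 2)
    P *= 2;
  return P;
}

int main() {
  unsigned WidestTypeBits = 32;            // widest element type in the loop
  unsigned MaxSafeVectorWidthInBits = 384; // from the dependence analysis
  unsigned WidestRegisterBits = 256;       // e.g. a 256-bit SIMD target
  unsigned UserVF = 16;                    // user asked for VF=16
  unsigned ConstTripCount = 4;             // small, known trip count

  // Clamp an explicit (fixed-width) user VF to the largest safe element count.
  unsigned MaxSafeElements = powerOf2Floor(MaxSafeVectorWidthInBits / WidestTypeBits);
  unsigned ClampedUserVF = std::min(UserVF, MaxSafeElements); // 16 -> 8

  // Otherwise derive MaxVF from the widest register, also capped by safety.
  unsigned WidestRegister = std::min(WidestRegisterBits, MaxSafeVectorWidthInBits);
  unsigned MaxVectorSize = powerOf2Floor(WidestRegister / WidestTypeBits); // 8

  // A small power-of-two constant trip count caps the VF as well.
  bool TCIsPow2 = ConstTripCount && (ConstTripCount & (ConstTripCount - 1)) == 0;
  if (TCIsPow2 && ConstTripCount < MaxVectorSize)
    MaxVectorSize = ConstTripCount;

  std::cout << "clamped user VF = " << ClampedUserVF
            << ", feasible MaxVF = " << MaxVectorSize << "\n"; // 8 and 4
}
// --- end sketch ---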
auto RUs = calculateRegisterUsage(VFs); @@ -5756,7 +5756,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, Selected = false; } if (Selected) { - MaxVF = VFs[i].getKnownMinValue(); + MaxVF = VFs[i].getKnownMinValue(); break; } } @@ -5768,39 +5768,39 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, } } } - return ElementCount::getFixed(MaxVF); + return ElementCount::getFixed(MaxVF); } VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. - assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); - - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; - LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); - assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); - +LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { + // FIXME: This can be fixed for scalable vectors later, because at this stage + // the LoopVectorizer will only consider vectorizing a loop with scalable + // vectors when the loop has a hint to enable vectorization for a given VF. + assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); + + InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); + assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); + unsigned Width = 1; - const float ScalarCost = *ExpectedCost.getValue(); - float Cost = ScalarCost; + const float ScalarCost = *ExpectedCost.getValue(); + float Cost = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && MaxVF.isVector()) { + if (ForceVectorization && MaxVF.isVector()) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. Cost = std::numeric_limits<float>::max(); } - for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { + for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. - VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); - assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); - float VectorCost = *C.first.getValue() / (float)i; + VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); + assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); + float VectorCost = *C.first.getValue() / (float)i; LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); if (!C.second && !ForceVectorization) { @@ -5809,13 +5809,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { << " because it will not generate any vector instructions.\n"); continue; } - - // If profitable add it to ProfitableVF list. - if (VectorCost < ScalarCost) { - ProfitableVFs.push_back(VectorizationFactor( - {ElementCount::getFixed(i), (unsigned)VectorCost})); - } - + + // If profitable add it to ProfitableVF list. 
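selectVectorizationFactor, restored above, scores each candidate VF by its expected cost divided by the width, i.e. cost per scalar iteration, and keeps the cheapest one; as the lines just below show, every VF that beats the scalar cost is also remembered in ProfitableVFs for epilogue vectorization. A freestanding sketch with a made-up cost table standing in for expectedCost():

// --- illustrative sketch, not part of the commit ---
#include <iostream>
#include <limits>
#include <map>
#include <vector>

int main() {
  // Hypothetical expectedCost() results per VF (total cost of one vector
  // iteration); VF=1 is the scalar loop.
  std::map<unsigned, float> ExpectedCost = {{1, 20}, {2, 24}, {4, 30}, {8, 64}};
  bool ForceVectorization = false;

  float ScalarCost = ExpectedCost[1];
  float Best = ForceVectorization ? std::numeric_limits<float>::max() : ScalarCost;
  unsigned Width = 1;
  std::vector<unsigned> ProfitableVFs;

  for (unsigned VF = 2; VF <= 8; VF *= 2) {
    // Divide by VF: the vector loop executes VF times fewer iterations.
    float PerLane = ExpectedCost[VF] / static_cast<float>(VF);
    if (PerLane < ScalarCost)
      ProfitableVFs.push_back(VF); // kept for epilogue vectorization later
    if (PerLane < Best) {
      Best = PerLane;
      Width = VF;
    }
  }

  std::cout << "selected VF = " << Width << " (cost/lane " << Best << ")\n";
}
// --- end sketch ---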
+ if (VectorCost < ScalarCost) { + ProfitableVFs.push_back(VectorizationFactor( + {ElementCount::getFixed(i), (unsigned)VectorCost})); + } + if (VectorCost < Cost) { Cost = VectorCost; Width = i; @@ -5834,131 +5834,131 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {ElementCount::getFixed(Width), - (unsigned)(Width * Cost)}; + VectorizationFactor Factor = {ElementCount::getFixed(Width), + (unsigned)(Width * Cost)}; return Factor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( - const Loop &L, ElementCount VF) const { - // Cross iteration phis such as reductions need special handling and are - // currently unsupported. - if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { - return Legal->isFirstOrderRecurrence(&Phi) || - Legal->isReductionVariable(&Phi); - })) - return false; - - // Phis with uses outside of the loop require special handling and are - // currently unsupported. - for (auto &Entry : Legal->getInductionVars()) { - // Look for uses of the value of the induction at the last iteration. - Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); - for (User *U : PostInc->users()) - if (!L.contains(cast<Instruction>(U))) - return false; - // Look for uses of penultimate value of the induction. - for (User *U : Entry.first->users()) - if (!L.contains(cast<Instruction>(U))) - return false; - } - - // Induction variables that are widened require special handling that is - // currently not supported. - if (any_of(Legal->getInductionVars(), [&](auto &Entry) { - return !(this->isScalarAfterVectorization(Entry.first, VF) || - this->isProfitableToScalarize(Entry.first, VF)); - })) - return false; - - return true; -} - -bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( - const ElementCount VF) const { - // FIXME: We need a much better cost-model to take different parameters such - // as register pressure, code size increase and cost of extra branches into - // account. For now we apply a very crude heuristic and only consider loops - // with vectorization factors larger than a certain value. - // We also consider epilogue vectorization unprofitable for targets that don't - // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) - return false; - if (VF.getFixedValue() >= EpilogueVectorizationMinVF) - return true; - return false; -} - -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { - VectorizationFactor Result = VectorizationFactor::Disabled(); - if (!EnableEpilogueVectorization) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); - return Result; - } - - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); - return Result; - } - - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. 
- if (MainLoopVF.isScalable()) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " - "yet supported.\n"); - return Result; - } - - // Not really a cost consideration, but check for unsupported cases here to - // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because the loop is " - "not a supported candidate.\n";); - return Result; - } - - if (EpilogueVectorizationForceVF > 1) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); - if (LVP.hasPlanWithVFs( - {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) - return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; - else { - LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization forced factor is not viable.\n";); - return Result; - } - } - - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { - LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization skipped due to opt for size.\n";); - return Result; - } - - if (!isEpilogueVectorizationProfitable(MainLoopVF)) - return Result; - - for (auto &NextVF : ProfitableVFs) - if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && - (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && - LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) - Result = NextVF; - - if (Result != VectorizationFactor::Disabled()) - LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " - << Result.Width.getFixedValue() << "\n";); - return Result; -} - +bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( + const Loop &L, ElementCount VF) const { + // Cross iteration phis such as reductions need special handling and are + // currently unsupported. + if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { + return Legal->isFirstOrderRecurrence(&Phi) || + Legal->isReductionVariable(&Phi); + })) + return false; + + // Phis with uses outside of the loop require special handling and are + // currently unsupported. + for (auto &Entry : Legal->getInductionVars()) { + // Look for uses of the value of the induction at the last iteration. + Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); + for (User *U : PostInc->users()) + if (!L.contains(cast<Instruction>(U))) + return false; + // Look for uses of penultimate value of the induction. + for (User *U : Entry.first->users()) + if (!L.contains(cast<Instruction>(U))) + return false; + } + + // Induction variables that are widened require special handling that is + // currently not supported. + if (any_of(Legal->getInductionVars(), [&](auto &Entry) { + return !(this->isScalarAfterVectorization(Entry.first, VF) || + this->isProfitableToScalarize(Entry.first, VF)); + })) + return false; + + return true; +} + +bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( + const ElementCount VF) const { + // FIXME: We need a much better cost-model to take different parameters such + // as register pressure, code size increase and cost of extra branches into + // account. For now we apply a very crude heuristic and only consider loops + // with vectorization factors larger than a certain value. + // We also consider epilogue vectorization unprofitable for targets that don't + // consider interleaving beneficial (eg. MVE). 
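selectEpilogueVectorizationFactor, removed above and re-added with restored authorship just below, first rules out the unsupported cases (feature disabled, no scalar epilogue allowed, scalable main VF, opt-for-size, unprofitable main VF) and then scans ProfitableVFs for the cheapest factor strictly smaller than the main loop VF; the real code additionally requires that a VPlan exists for the {main VF, epilogue VF} pair, which this sketch omits. With invented width/cost pairs:

// --- illustrative sketch, not part of the commit ---
#include <iostream>
#include <vector>

struct VFCandidate {
  unsigned Width; // fixed vectorization factor
  unsigned Cost;  // estimated cost at that factor
};

int main() {
  unsigned MainLoopVF = 8;
  // Hypothetical ProfitableVFs gathered while picking the main VF.
  std::vector<VFCandidate> ProfitableVFs = {{2, 18}, {4, 15}, {8, 14}};

  VFCandidate Result{1, 0}; // width 1 means "disabled"
  for (const VFCandidate &Next : ProfitableVFs)
    if (Next.Width < MainLoopVF &&                 // must be narrower than the main loop
        (Result.Width == 1 || Next.Cost < Result.Cost))
      Result = Next;                               // keep the cheapest narrower VF

  if (Result.Width > 1)
    std::cout << "epilogue VF = " << Result.Width << "\n"; // 4
  else
    std::cout << "no epilogue vectorization\n";
}
// --- end sketch ---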
+ if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) + return false; + if (VF.getFixedValue() >= EpilogueVectorizationMinVF) + return true; + return false; +} + +VectorizationFactor +LoopVectorizationCostModel::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { + VectorizationFactor Result = VectorizationFactor::Disabled(); + if (!EnableEpilogueVectorization) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); + return Result; + } + + if (!isScalarEpilogueAllowed()) { + LLVM_DEBUG( + dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " + "allowed.\n";); + return Result; + } + + // FIXME: This can be fixed for scalable vectors later, because at this stage + // the LoopVectorizer will only consider vectorizing a loop with scalable + // vectors when the loop has a hint to enable vectorization for a given VF. + if (MainLoopVF.isScalable()) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " + "yet supported.\n"); + return Result; + } + + // Not really a cost consideration, but check for unsupported cases here to + // simplify the logic. + if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { + LLVM_DEBUG( + dbgs() << "LEV: Unable to vectorize epilogue because the loop is " + "not a supported candidate.\n";); + return Result; + } + + if (EpilogueVectorizationForceVF > 1) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); + if (LVP.hasPlanWithVFs( + {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) + return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + else { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization forced factor is not viable.\n";); + return Result; + } + } + + if (TheLoop->getHeader()->getParent()->hasOptSize() || + TheLoop->getHeader()->getParent()->hasMinSize()) { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization skipped due to opt for size.\n";); + return Result; + } + + if (!isEpilogueVectorizationProfitable(MainLoopVF)) + return Result; + + for (auto &NextVF : ProfitableVFs) + if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && + LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) + Result = NextVF; + + if (Result != VectorizationFactor::Disabled()) + LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " + << Result.Width.getFixedValue() << "\n";); + return Result; +} + std::pair<unsigned, unsigned> LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; @@ -5985,11 +5985,11 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { if (!Legal->isReductionVariable(PN)) continue; RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; - if (PreferInLoopReductions || - TTI.preferInLoopReduction(RdxDesc.getOpcode(), - RdxDesc.getRecurrenceType(), - TargetTransformInfo::ReductionFlags())) - continue; + if (PreferInLoopReductions || + TTI.preferInLoopReduction(RdxDesc.getOpcode(), + RdxDesc.getRecurrenceType(), + TargetTransformInfo::ReductionFlags())) + continue; T = RdxDesc.getRecurrenceType(); } @@ -6020,7 +6020,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to 
expose ILP and reduce the loop overhead. @@ -6043,15 +6043,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); - const bool HasReductions = !Legal->getReductionVars().empty(); + auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + const bool HasReductions = !Legal->getReductionVars().empty(); // Do not interleave loops with a relatively small known or estimated trip - // count. But we will interleave when InterleaveSmallLoopScalarReduction is - // enabled, and the code has scalar reductions(HasReductions && VF = 1), - // because with the above conditions interleaving can expose ILP and break - // cross iteration dependences for reductions. - if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && - !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) + // count. But we will interleave when InterleaveSmallLoopScalarReduction is + // enabled, and the code has scalar reductions(HasReductions && VF = 1), + // because with the above conditions interleaving can expose ILP and break + // cross iteration dependences for reductions. + if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && + !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) return 1; RegisterUsage R = calculateRegisterUsage({VF})[0]; @@ -6079,7 +6079,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " << TTI.getRegisterClassName(pair.first) << " register class\n"); - if (VF.isScalar()) { + if (VF.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { @@ -6103,11 +6103,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = - TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); + unsigned MaxInterleaveCount = + TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); // Check if the user has overridden the max. - if (VF.isScalar()) { + if (VF.isScalar()) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { @@ -6116,47 +6116,47 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } // If trip count is known or estimated compile time constant, limit the - // interleave count to be less than the trip count divided by VF, provided it - // is at least 1. - // - // For scalable vectors we can't know if interleaving is beneficial. It may - // not be beneficial for small loops if none of the lanes in the second vector - // iterations is enabled. However, for larger loops, there is likely to be a - // similar benefit as for fixed-width vectors. For now, we choose to leave - // the InterleaveCount as if vscale is '1', although if some information about - // the vector is known (e.g. min vector size), we can make a better decision. + // interleave count to be less than the trip count divided by VF, provided it + // is at least 1. + // + // For scalable vectors we can't know if interleaving is beneficial. It may + // not be beneficial for small loops if none of the lanes in the second vector + // iterations is enabled. However, for larger loops, there is likely to be a + // similar benefit as for fixed-width vectors. 
For now, we choose to leave + // the InterleaveCount as if vscale is '1', although if some information about + // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { - MaxInterleaveCount = - std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); - // Make sure MaxInterleaveCount is greater than 0. - MaxInterleaveCount = std::max(1u, MaxInterleaveCount); + MaxInterleaveCount = + std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); + // Make sure MaxInterleaveCount is greater than 0. + MaxInterleaveCount = std::max(1u, MaxInterleaveCount); } - assert(MaxInterleaveCount > 0 && - "Maximum interleave count must be greater than 0"); + assert(MaxInterleaveCount > 0 && + "Maximum interleave count must be greater than 0"); // Clamp the calculated IC to be between the 1 and the max interleave count // that the target and trip count allows. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; - else - // Make sure IC is greater than 0. - IC = std::max(1u, IC); - - assert(IC > 0 && "Interleave count must be greater than 0."); - - // If we did not calculate the cost for VF (because the user selected the VF) - // then we calculate the cost of VF here. - if (LoopCost == 0) { - assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); - LoopCost = *expectedCost(VF).first.getValue(); - } - - assert(LoopCost && "Non-zero loop cost expected"); - + else + // Make sure IC is greater than 0. + IC = std::max(1u, IC); + + assert(IC > 0 && "Interleave count must be greater than 0."); + + // If we did not calculate the cost for VF (because the user selected the VF) + // then we calculate the cost of VF here. + if (LoopCost == 0) { + assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); + LoopCost = *expectedCost(VF).first.getValue(); + } + + assert(LoopCost && "Non-zero loop cost expected"); + // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF.isVector() && HasReductions) { + if (VF.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -6164,15 +6164,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // Note that if we've already vectorized the loop we will have done the // runtime check and so interleaving won't require further checks. bool InterleavingRequiresRuntimePointerCheck = - (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); + (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. 
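The interleave-count clamping above keeps IC inside sane bounds: when a trip count is known or estimated, the target's maximum interleave factor is reduced to at most the trip count divided by VF (but never below 1), and the register-pressure-derived IC is then clamped into [1, MaxInterleaveCount]. The same logic on plain integers, treating the VF as fixed-width (vscale = 1, as the comment above assumes):

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>

unsigned clampInterleaveCount(unsigned IC, unsigned MaxInterleaveCount,
                              unsigned BestKnownTC, unsigned VF) {
  if (BestKnownTC) {
    // Don't interleave past the trip count: at most TC / VF vector iterations.
    MaxInterleaveCount = std::min(BestKnownTC / VF, MaxInterleaveCount);
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount); // never zero
  }
  IC = std::min(IC, MaxInterleaveCount);
  return std::max(1u, IC); // IC itself must stay at least 1
}

int main() {
  // Registers would allow IC=8, the target allows 4, but with TC=12 and VF=4
  // there are only 3 vector iterations, so interleaving further is pointless.
  std::cout << clampInterleaveCount(/*IC=*/8, /*Max=*/4, /*TC=*/12, /*VF=*/4) << "\n"; // 3
  // Tiny trip count: TC / VF == 0, clamped back up to 1.
  std::cout << clampInterleaveCount(8, 4, /*TC=*/3, /*VF=*/4) << "\n"; // 1
}
// --- end sketch ---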
- LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' - << "LV: IC is " << IC << '\n' - << "LV: VF is " << VF << '\n'); - const bool AggressivelyInterleaveReductions = - TTI.enableAggressiveInterleaving(HasReductions); + LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' + << "LV: IC is " << IC << '\n' + << "LV: VF is " << VF << '\n'); + const bool AggressivelyInterleaveReductions = + TTI.enableAggressiveInterleaving(HasReductions); if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the @@ -6191,7 +6191,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. - if (HasReductions && TheLoop->getLoopDepth() > 1) { + if (HasReductions && TheLoop->getLoopDepth() > 1) { unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); SmallIC = std::min(SmallIC, F); StoresIC = std::min(StoresIC, F); @@ -6205,23 +6205,23 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, return std::max(StoresIC, LoadsIC); } - // If there are scalar reductions and TTI has enabled aggressive - // interleaving for reductions, we will interleave to expose ILP. - if (InterleaveSmallLoopScalarReduction && VF.isScalar() && - AggressivelyInterleaveReductions) { - LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); - // Interleave no less than SmallIC but not as aggressive as the normal IC - // to satisfy the rare situation when resources are too limited. - return std::max(IC / 2, SmallIC); - } else { - LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); - return SmallIC; - } + // If there are scalar reductions and TTI has enabled aggressive + // interleaving for reductions, we will interleave to expose ILP. + if (InterleaveSmallLoopScalarReduction && VF.isScalar() && + AggressivelyInterleaveReductions) { + LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); + // Interleave no less than SmallIC but not as aggressive as the normal IC + // to satisfy the rare situation when resources are too limited. + return std::max(IC / 2, SmallIC); + } else { + LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); + return SmallIC; + } } // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. - if (AggressivelyInterleaveReductions) { + if (AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; } @@ -6231,7 +6231,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { +LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. 
We scan the loop in a topological order in order and @@ -6309,11 +6309,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. - const auto &TTICapture = TTI; - auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { - if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) + const auto &TTICapture = TTI; + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { + if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0U; - return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); + return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -6337,7 +6337,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // Count the number of live intervals. SmallMapVector<unsigned, unsigned, 4> RegUsage; - if (VFs[j].isScalar()) { + if (VFs[j].isScalar()) { for (auto Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); if (RegUsage.find(ClassID) == RegUsage.end()) @@ -6366,7 +6366,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { } } } - + for (auto& pair : RegUsage) { if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); @@ -6384,12 +6384,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { for (unsigned i = 0, e = VFs.size(); i < e; ++i) { SmallMapVector<unsigned, unsigned, 4> Invariant; - + for (auto Inst : LoopInvariants) { - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); - unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); + unsigned Usage = + VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = + TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); if (Invariant.find(ClassID) == Invariant.end()) Invariant[ClassID] = Usage; else @@ -6437,13 +6437,13 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF.isScalar() || VF.isZero() || - InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.isScalar() || VF.isZero() || + InstsToScalarize.find(VF) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -6472,13 +6472,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { } int LoopVectorizationCostModel::computePredInstDiscount( - Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { + Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the // vector version cost the same. 
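calculateRegisterUsage, whose body appears above, estimates register pressure by numbering the loop's instructions in program order, recording each value's interval from definition to last in-loop use, and taking the largest number of intervals open at any point, scaled by how many registers one value of that type needs at the given VF. A generic max-overlap sketch over such intervals; the direct quadratic scan here stands in for the transpose-and-sweep bookkeeping the pass uses:

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>
#include <vector>

struct Interval { unsigned Start, End; }; // definition index and last-use index

// Maximum number of simultaneously live values: for each position, count the
// intervals open there (an interval closes at its last use).
unsigned maxLiveValues(const std::vector<Interval> &Live) {
  unsigned MaxEnd = 0;
  for (const Interval &I : Live) MaxEnd = std::max(MaxEnd, I.End);
  unsigned Best = 0;
  for (unsigned Pos = 0; Pos <= MaxEnd; ++Pos) {
    unsigned Open = 0;
    for (const Interval &I : Live)
      if (I.Start <= Pos && Pos < I.End)
        ++Open;
    Best = std::max(Best, Open);
  }
  return Best;
}

int main() {
  // Toy loop body: values defined at 0,1,2,3 with last uses at 4,3,5,6.
  std::vector<Interval> Live = {{0, 4}, {1, 3}, {2, 5}, {3, 6}};
  unsigned RegsPerValue = 2; // e.g. each widened value needs 2 vector registers
  std::cout << "max live values: " << maxLiveValues(Live)
            << ", estimated registers: " << maxLiveValues(Live) * RegsPerValue << "\n";
}
// --- end sketch ---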
- InstructionCost Discount = 0; + InstructionCost Discount = 0; // Holds instructions to analyze. The instructions we visit are mapped in // ScalarCosts. Those instructions are the ones that would be scalarized if @@ -6533,27 +6533,27 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the cost of the vector instruction. Note that this cost already // includes the scalarization overhead of the predicated instruction. - InstructionCost VectorCost = getInstructionCost(I, VF).first; + InstructionCost VectorCost = getInstructionCost(I, VF).first; // Compute the cost of the scalarized instruction. This cost is the cost of // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. - assert(!VF.isScalable() && "scalable vectors not yet supported."); - InstructionCost ScalarCost = - VF.getKnownMinValue() * - getInstructionCost(I, ElementCount::getFixed(1)).first; + assert(!VF.isScalable() && "scalable vectors not yet supported."); + InstructionCost ScalarCost = + VF.getKnownMinValue() * + getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); - assert(!VF.isScalable() && "scalable vectors not yet supported."); - ScalarCost += - VF.getKnownMinValue() * - TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); + APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + ScalarCost += + VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -6566,12 +6566,12 @@ int LoopVectorizationCostModel::computePredInstDiscount( "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + else if (needsExtract(J, VF)) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); - } + APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); + } } // Scale the total scalar cost by block probability. @@ -6583,11 +6583,11 @@ int LoopVectorizationCostModel::computePredInstDiscount( ScalarCosts[I] = ScalarCost; } - return *Discount.getValue(); + return *Discount.getValue(); } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(ElementCount VF) { +LoopVectorizationCostModel::expectedCost(ElementCount VF) { VectorizationCostTy Cost; // For each block. @@ -6597,15 +6597,15 @@ LoopVectorizationCostModel::expectedCost(ElementCount VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || - (VF.isVector() && VecValuesToIgnore.count(&I))) + if (ValuesToIgnore.count(&I) || + (VF.isVector() && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); // Check if we should override the cost. 
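computePredInstDiscount, shown above, weighs two ways of emitting an instruction from a predicated block: the widened vector cost (which already folds in predication overhead) against the scalarized cost, namely VF scalar copies plus insert/extract/phi overhead, scaled down by the probability that the block executes (the model uses a fixed one half, via getReciprocalPredBlockProb). A toy version of that comparison with invented cost inputs:

// --- illustrative sketch, not part of the commit ---
#include <iostream>

// The cost model assumes a predicated block runs on about 50% of iterations.
constexpr unsigned ReciprocalPredBlockProb = 2;

// Positive discount: scalarizing the predicated instruction is the cheaper option.
int predInstDiscount(unsigned VF, unsigned VectorCost, unsigned ScalarInstCost,
                     unsigned ScalarizationOverhead) {
  unsigned ScalarCost = VF * ScalarInstCost + ScalarizationOverhead;
  ScalarCost /= ReciprocalPredBlockProb; // only paid when the block executes
  return static_cast<int>(VectorCost) - static_cast<int>(ScalarCost);
}

int main() {
  // VF=4, predicated vector cost 20, scalar copies cost 2 each, plus 6 for
  // inserts/extracts/phis: scalar = (4*2 + 6)/2 = 7, discount = 13.
  std::cout << predInstDiscount(4, 20, 2, 6) << "\n";
}
// --- end sketch ---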
if (ForceTargetInstructionCost.getNumOccurrences() > 0) - C.first = InstructionCost(ForceTargetInstructionCost); + C.first = InstructionCost(ForceTargetInstructionCost); BlockCost.first += C.first; BlockCost.second |= C.second; @@ -6618,10 +6618,10 @@ LoopVectorizationCostModel::expectedCost(ElementCount VF) { // if-converted. This means that the block's instructions (aside from // stores and instructions that may divide by zero) will now be // unconditionally executed. For the scalar case, we may not always execute - // the predicated block, if it is an if-else block. Thus, scale the block's - // cost by the probability of executing it. blockNeedsPredication from - // Legal is used so as to not include all blocks in tail folded loops. - if (VF.isScalar() && Legal->blockNeedsPredication(BB)) + // the predicated block, if it is an if-else block. Thus, scale the block's + // cost by the probability of executing it. blockNeedsPredication from + // Legal is used so as to not include all blocks in tail folded loops. + if (VF.isScalar() && Legal->blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -6666,12 +6666,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { Legal->hasStride(I->getOperand(1)); } -InstructionCost -LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - ElementCount VF) { - assert(VF.isVector() && - "Scalarization cost of instruction implies vectorization."); - assert(!VF.isScalable() && "scalable vectors not yet supported."); +InstructionCost +LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + ElementCount VF) { + assert(VF.isVector() && + "Scalarization cost of instruction implies vectorization."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -6684,15 +6684,15 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - InstructionCost Cost = - VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + InstructionCost Cost = + VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF.getKnownMinValue() * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS, TTI::TCK_RecipThroughput); + Cost += VF.getKnownMinValue() * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS, TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. 
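getMemInstScalarizationCost, begun above, prices a memory access that will execute lane by lane: VF copies of the address computation plus VF scalar loads or stores, plus the insert/extract overhead of shuttling lanes between vector and scalar values (predicated accesses add further scaling and branch terms not shown here). A simplified version of the unconditional part, where a flat per-lane figure stands in for TTI's scalarization-overhead query:

// --- illustrative sketch, not part of the commit ---
#include <iostream>

unsigned scalarizedMemOpCost(unsigned VF, unsigned AddrComputeCost,
                             unsigned ScalarMemOpCost,
                             unsigned PerLaneInsertExtractCost) {
  unsigned Cost = VF * AddrComputeCost;  // one address computation per lane
  Cost += VF * ScalarMemOpCost;          // one scalar load/store per lane
  Cost += VF * PerLaneInsertExtractCost; // extract the address / insert the result
  return Cost;
}

int main() {
  // VF=4, address GEP ~1, scalar load ~4, insert/extract ~1 per lane: total 24.
  std::cout << scalarizedMemOpCost(4, 1, 4, 1) << "\n";
}
// --- end sketch ---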
@@ -6713,9 +6713,9 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, return Cost; } -InstructionCost -LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -6726,7 +6726,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); - InstructionCost Cost = 0; + InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); @@ -6740,11 +6740,11 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, return Cost; } -InstructionCost -LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, - ElementCount VF) { - assert(Legal->isUniformMemOp(*I)); - +InstructionCost +LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, + ElementCount VF) { + assert(Legal->isUniformMemOp(*I)); + Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6765,12 +6765,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF.getKnownMinValue() - 1)); + VF.getKnownMinValue() - 1)); } -InstructionCost -LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6782,9 +6782,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, TargetTransformInfo::TCK_RecipThroughput, I); } -InstructionCost -LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6793,8 +6793,8 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.isScalable() && "scalable vectors not yet supported."); - auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. @@ -6808,7 +6808,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, // Calculate the cost of the whole interleaved group. 
bool UseMaskForGaps = Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); - InstructionCost Cost = TTI.getInterleavedMemoryOpCost( + InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); @@ -6822,122 +6822,122 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, return Cost; } -InstructionCost LoopVectorizationCostModel::getReductionPatternCost( - Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { - // Early exit for no inloop reductions - if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) - return InstructionCost::getInvalid(); - auto *VectorTy = cast<VectorType>(Ty); - - // We are looking for a pattern of, and finding the minimal acceptable cost: - // reduce(mul(ext(A), ext(B))) or - // reduce(mul(A, B)) or - // reduce(ext(A)) or - // reduce(A). - // The basic idea is that we walk down the tree to do that, finding the root - // reduction instruction in InLoopReductionImmediateChains. From there we find - // the pattern of mul/ext and test the cost of the entire pattern vs the cost - // of the components. If the reduction cost is lower then we return it for the - // reduction instruction and 0 for the other instructions in the pattern. If - // it is not we return an invalid cost specifying the orignal cost method - // should be used. - Instruction *RetI = I; - if ((RetI->getOpcode() == Instruction::SExt || - RetI->getOpcode() == Instruction::ZExt)) { - if (!RetI->hasOneUser()) - return InstructionCost::getInvalid(); - RetI = RetI->user_back(); - } - if (RetI->getOpcode() == Instruction::Mul && - RetI->user_back()->getOpcode() == Instruction::Add) { - if (!RetI->hasOneUser()) - return InstructionCost::getInvalid(); - RetI = RetI->user_back(); - } - - // Test if the found instruction is a reduction, and if not return an invalid - // cost specifying the parent to use the original cost modelling. - if (!InLoopReductionImmediateChains.count(RetI)) - return InstructionCost::getInvalid(); - - // Find the reduction this chain is a part of and calculate the basic cost of - // the reduction on its own. - Instruction *LastChain = InLoopReductionImmediateChains[RetI]; - Instruction *ReductionPhi = LastChain; - while (!isa<PHINode>(ReductionPhi)) - ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; - - RecurrenceDescriptor RdxDesc = - Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; - unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), - VectorTy, false, CostKind); - - // Get the operand that was not the reduction chain and match it to one of the - // patterns, returning the better cost if it is found. - Instruction *RedOp = RetI->getOperand(1) == LastChain - ? 
dyn_cast<Instruction>(RetI->getOperand(0)) - : dyn_cast<Instruction>(RetI->getOperand(1)); - - VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); - - if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && - !TheLoop->isLoopInvariant(RedOp)) { - bool IsUnsigned = isa<ZExtInst>(RedOp); - auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, - CostKind); - - unsigned ExtCost = - TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, - TTI::CastContextHint::None, CostKind, RedOp); - if (RedCost.isValid() && RedCost < BaseCost + ExtCost) - return I == RetI ? *RedCost.getValue() : 0; - } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { - Instruction *Mul = RedOp; - Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); - Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); - if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && - Op0->getOpcode() == Op1->getOpcode() && - Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && - !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { - bool IsUnsigned = isa<ZExtInst>(Op0); - auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); - // reduce(mul(ext, ext)) - unsigned ExtCost = - TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, - TTI::CastContextHint::None, CostKind, Op0); - unsigned MulCost = - TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); - - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, - CostKind); - - if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) - return I == RetI ? *RedCost.getValue() : 0; - } else { - unsigned MulCost = - TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); - - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, - CostKind); - - if (RedCost.isValid() && RedCost < MulCost + BaseCost) - return I == RetI ? *RedCost.getValue() : 0; - } - } - - return I == RetI ? BaseCost : InstructionCost::getInvalid(); -} - -InstructionCost -LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getReductionPatternCost( + Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { + // Early exit for no inloop reductions + if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) + return InstructionCost::getInvalid(); + auto *VectorTy = cast<VectorType>(Ty); + + // We are looking for a pattern of, and finding the minimal acceptable cost: + // reduce(mul(ext(A), ext(B))) or + // reduce(mul(A, B)) or + // reduce(ext(A)) or + // reduce(A). + // The basic idea is that we walk down the tree to do that, finding the root + // reduction instruction in InLoopReductionImmediateChains. From there we find + // the pattern of mul/ext and test the cost of the entire pattern vs the cost + // of the components. If the reduction cost is lower then we return it for the + // reduction instruction and 0 for the other instructions in the pattern. If + // it is not we return an invalid cost specifying the orignal cost method + // should be used. 
+ Instruction *RetI = I; + if ((RetI->getOpcode() == Instruction::SExt || + RetI->getOpcode() == Instruction::ZExt)) { + if (!RetI->hasOneUser()) + return InstructionCost::getInvalid(); + RetI = RetI->user_back(); + } + if (RetI->getOpcode() == Instruction::Mul && + RetI->user_back()->getOpcode() == Instruction::Add) { + if (!RetI->hasOneUser()) + return InstructionCost::getInvalid(); + RetI = RetI->user_back(); + } + + // Test if the found instruction is a reduction, and if not return an invalid + // cost specifying the parent to use the original cost modelling. + if (!InLoopReductionImmediateChains.count(RetI)) + return InstructionCost::getInvalid(); + + // Find the reduction this chain is a part of and calculate the basic cost of + // the reduction on its own. + Instruction *LastChain = InLoopReductionImmediateChains[RetI]; + Instruction *ReductionPhi = LastChain; + while (!isa<PHINode>(ReductionPhi)) + ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; + + RecurrenceDescriptor RdxDesc = + Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), + VectorTy, false, CostKind); + + // Get the operand that was not the reduction chain and match it to one of the + // patterns, returning the better cost if it is found. + Instruction *RedOp = RetI->getOperand(1) == LastChain + ? dyn_cast<Instruction>(RetI->getOperand(0)) + : dyn_cast<Instruction>(RetI->getOperand(1)); + + VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); + + if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && + !TheLoop->isLoopInvariant(RedOp)) { + bool IsUnsigned = isa<ZExtInst>(RedOp); + auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + CostKind); + + unsigned ExtCost = + TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, RedOp); + if (RedCost.isValid() && RedCost < BaseCost + ExtCost) + return I == RetI ? *RedCost.getValue() : 0; + } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { + Instruction *Mul = RedOp; + Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); + Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); + if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && + Op0->getOpcode() == Op1->getOpcode() && + Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && + !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { + bool IsUnsigned = isa<ZExtInst>(Op0); + auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); + // reduce(mul(ext, ext)) + unsigned ExtCost = + TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, Op0); + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + CostKind); + + if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) + return I == RetI ? *RedCost.getValue() : 0; + } else { + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, + CostKind); + + if (RedCost.isValid() && RedCost < MulCost + BaseCost) + return I == RetI ? 
*RedCost.getValue() : 0; + } + } + + return I == RetI ? BaseCost : InstructionCost::getInvalid(); +} + +InstructionCost +LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, + ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. - if (VF.isScalar()) { + if (VF.isScalar()) { Type *ValTy = getMemInstValueType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6950,52 +6950,52 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::getInstructionCost(Instruction *I, - ElementCount VF) { +LoopVectorizationCostModel::getInstructionCost(Instruction *I, + ElementCount VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) - VF = ElementCount::getFixed(1); + VF = ElementCount::getFixed(1); - if (VF.isVector() && isProfitableToScalarize(I, VF)) + if (VF.isVector() && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. auto ForcedScalar = ForcedScalars.find(VF); - if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { + if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) - return VectorizationCostTy( - (getInstructionCost(I, ElementCount::getFixed(1)).first * - VF.getKnownMinValue()), - false); + return VectorizationCostTy( + (getInstructionCost(I, ElementCount::getFixed(1)).first * + VF.getKnownMinValue()), + false); } Type *VectorTy; - InstructionCost C = getInstructionCost(I, VF, VectorTy); + InstructionCost C = getInstructionCost(I, VF, VectorTy); bool TypeNotScalarized = - VF.isVector() && VectorTy->isVectorTy() && - TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); + VF.isVector() && VectorTy->isVectorTy() && + TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); return VectorizationCostTy(C, TypeNotScalarized); } -InstructionCost -LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, + ElementCount VF) { - assert(!VF.isScalable() && - "cannot compute scalarization overhead for scalable vectorization"); - if (VF.isScalar()) + assert(!VF.isScalable() && + "cannot compute scalarization overhead for scalable vectorization"); + if (VF.isScalar()) return 0; - InstructionCost Cost = 0; + InstructionCost Cost = 0; Type *RetTy = ToVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), - true, false); + cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), + true, false); // Some targets keep addresses scalar. if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) @@ -7012,11 +7012,11 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. 
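// Illustrative sketch (not from LoopVectorize.cpp): the getReductionPatternCost
// code above accepts TTI's fused extended multiply-add reduction cost only when
// it beats the sum of the separate component costs for a
// reduce(add(mul(sext(A), sext(B)))) chain. A standalone model with
// hypothetical per-pattern costs:
static bool exampleUseFusedMlaReduction() {
  int BaseCost = 4; // plain vector add-reduction
  int ExtCost = 1;  // one vector sign-extension
  int MulCost = 2;  // vector multiply
  int RedCost = 5;  // TTI's fused extended multiply-add reduction
  // Mirrors: RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost.
  // When this holds, the reduction instruction reports RedCost and the mul/ext
  // members of the matched pattern report a cost of 0.
  return RedCost < ExtCost * 2 + MulCost + BaseCost;
}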
return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); + filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { - if (VF.isScalar()) +void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { + if (VF.isScalar()) return; NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { @@ -7033,19 +7033,19 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) NumPredStores++; - if (Legal->isUniformMemOp(I)) { + if (Legal->isUniformMemOp(I)) { // TODO: Avoid replicating loads and stores instead of // relying on instcombine to remove them. // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract - InstructionCost Cost = getUniformMemOpCost(&I, VF); + InstructionCost Cost = getUniformMemOpCost(&I, VF); setWideningDecision(&I, VF, CM_Scalarize, Cost); continue; } // We assume that widening is the best solution when possible. if (memoryInstructionCanBeWidened(&I, VF)) { - InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); + InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); int ConsecutiveStride = Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && @@ -7057,7 +7057,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } // Choose between Interleaving, Gather/Scatter or Scalarization. - InstructionCost InterleaveCost = std::numeric_limits<int>::max(); + InstructionCost InterleaveCost = std::numeric_limits<int>::max(); unsigned NumAccesses = 1; if (isAccessInterleaved(&I)) { auto Group = getInterleavedAccessGroup(&I); @@ -7072,17 +7072,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { InterleaveCost = getInterleaveGroupCost(&I, VF); } - InstructionCost GatherScatterCost = + InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I) ? getGatherScatterCost(&I, VF) * NumAccesses - : std::numeric_limits<int>::max(); + : std::numeric_limits<int>::max(); - InstructionCost ScalarizationCost = + InstructionCost ScalarizationCost = getMemInstScalarizationCost(&I, VF) * NumAccesses; // Choose better solution for the current VF, // write down this decision and use it during vectorization. - InstructionCost Cost; + InstructionCost Cost; InstWidening Decision; if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost) { @@ -7126,7 +7126,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // Add all instructions used to generate the addresses. SmallVector<Instruction *, 4> Worklist; - append_range(Worklist, AddrDefs); + append_range(Worklist, AddrDefs); while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); for (auto &Op : I->operands()) @@ -7145,18 +7145,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. 
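// Illustrative sketch (not from LoopVectorize.cpp): for a non-consecutive,
// non-uniform access, setCostBasedWideningDecision above simply keeps the
// cheapest of the three strategies it priced. A standalone model with
// hypothetical costs (the enum and values are examples only):
enum class ExampleDecision { Interleave, GatherScatter, Scalarize };
static ExampleDecision examplePickMemDecision() {
  int InterleaveCost = 8;     // interleaved group cost, if the access is in one
  int GatherScatterCost = 12; // masked gather/scatter cost, if legal
  int ScalarizationCost = 10; // per-lane scalar loads/stores
  if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost)
    return ExampleDecision::Interleave;
  if (GatherScatterCost < ScalarizationCost)
    return ExampleDecision::GatherScatter;
  return ExampleDecision::Scalarize;
}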
- setWideningDecision( - I, VF, CM_Scalarize, - (VF.getKnownMinValue() * - getMemoryInstructionCost(I, ElementCount::getFixed(1)))); + setWideningDecision( + I, VF, CM_Scalarize, + (VF.getKnownMinValue() * + getMemoryInstructionCost(I, ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) - setWideningDecision( - Member, VF, CM_Scalarize, - (VF.getKnownMinValue() * - getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + setWideningDecision( + Member, VF, CM_Scalarize, + (VF.getKnownMinValue() * + getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } } else @@ -7166,9 +7166,9 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } } -InstructionCost -LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy) { +InstructionCost +LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, + Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); @@ -7190,22 +7190,22 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // blocks requires also an extract of its vector compare i1 element. bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); - if (VF.isVector() && BI->isConditional() && + if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *Vec_i1Ty = - VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), - false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * - VF.getKnownMinValue())); - } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), + false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * + VF.getKnownMinValue())); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); else @@ -7220,20 +7220,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) - return TTI.getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), - VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); + if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) + return TTI.getShuffleCost( + TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), + VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) 
are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. - if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), - CmpInst::BAD_ICMP_PREDICATE, CostKind); + CmpInst::BAD_ICMP_PREDICATE, CostKind); return TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -7245,19 +7245,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF.isVector() && isScalarWithPredication(I)) { - InstructionCost Cost = 0; + if (VF.isVector() && isScalarWithPredication(I)) { + InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF.getKnownMinValue() * - TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += VF.getKnownMinValue() * - TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -7286,13 +7286,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // Since we will replace the stride by 1 the multiplication should go away. if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) return 0; - - // Detect reduction patterns - InstructionCost RedCost; - if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) - .isValid()) - return RedCost; - + + // Detect reduction patterns + InstructionCost RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + .isValid()) + return RedCost; + // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); @@ -7303,15 +7303,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -7325,9 +7325,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) - CondTy = VectorType::get(CondTy, VF); + CondTy = VectorType::get(CondTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -7336,18 +7336,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, + CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::Store: case Instruction::Load: { - ElementCount Width = VF; - if (Width.isVector()) { + ElementCount Width = VF; + if (Width.isVector()) { InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); if (Decision == CM_Scalarize) - Width = ElementCount::getFixed(1); + Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); @@ -7364,62 +7364,62 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - // Computes the CastContextHint from a Load/Store instruction. - auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { - assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && - "Expected a load or a store!"); - - if (VF.isScalar() || !TheLoop->contains(I)) - return TTI::CastContextHint::Normal; - - switch (getWideningDecision(I, VF)) { - case LoopVectorizationCostModel::CM_GatherScatter: - return TTI::CastContextHint::GatherScatter; - case LoopVectorizationCostModel::CM_Interleave: - return TTI::CastContextHint::Interleave; - case LoopVectorizationCostModel::CM_Scalarize: - case LoopVectorizationCostModel::CM_Widen: - return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; - case LoopVectorizationCostModel::CM_Widen_Reverse: - return TTI::CastContextHint::Reversed; - case LoopVectorizationCostModel::CM_Unknown: - llvm_unreachable("Instr did not go through cost modelling?"); - } - - llvm_unreachable("Unhandled case!"); - }; - - unsigned Opcode = I->getOpcode(); - TTI::CastContextHint CCH = TTI::CastContextHint::None; - // For Trunc, the context is the only user, which must be a StoreInst. - if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { - if (I->hasOneUse()) - if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) - CCH = ComputeCCH(Store); - } - // For Z/Sext, the context is the operand, which must be a LoadInst. - else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || - Opcode == Instruction::FPExt) { - if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) - CCH = ComputeCCH(Load); - } - + // Computes the CastContextHint from a Load/Store instruction. 
+ auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected a load or a store!"); + + if (VF.isScalar() || !TheLoop->contains(I)) + return TTI::CastContextHint::Normal; + + switch (getWideningDecision(I, VF)) { + case LoopVectorizationCostModel::CM_GatherScatter: + return TTI::CastContextHint::GatherScatter; + case LoopVectorizationCostModel::CM_Interleave: + return TTI::CastContextHint::Interleave; + case LoopVectorizationCostModel::CM_Scalarize: + case LoopVectorizationCostModel::CM_Widen: + return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; + case LoopVectorizationCostModel::CM_Widen_Reverse: + return TTI::CastContextHint::Reversed; + case LoopVectorizationCostModel::CM_Unknown: + llvm_unreachable("Instr did not go through cost modelling?"); + } + + llvm_unreachable("Unhandled case!"); + }; + + unsigned Opcode = I->getOpcode(); + TTI::CastContextHint CCH = TTI::CastContextHint::None; + // For Trunc, the context is the only user, which must be a StoreInst. + if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { + if (I->hasOneUse()) + if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) + CCH = ComputeCCH(Store); + } + // For Z/Sext, the context is the operand, which must be a LoadInst. + else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || + Opcode == Instruction::FPExt) { + if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) + CCH = ComputeCCH(Load); + } + // We optimize the truncation of induction variables having constant // integer steps. The cost of these truncations is the same as the scalar // operation. if (isOptimizableIVTruncate(I, VF)) { auto *Trunc = cast<TruncInst>(I); return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), - Trunc->getSrcTy(), CCH, CostKind, Trunc); + Trunc->getSrcTy(), CCH, CostKind, Trunc); } - // Detect reduction patterns - InstructionCost RedCost; - if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) - .isValid()) - return RedCost; - + // Detect reduction patterns + InstructionCost RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + .isValid()) + return RedCost; + Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; @@ -7430,39 +7430,39 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // // Calculate the modified src and dest types. Type *MinVecTy = VectorTy; - if (Opcode == Instruction::Trunc) { + if (Opcode == Instruction::Trunc) { SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); - } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { + } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } } - assert(!VF.isScalable() && "VF is assumed to be non scalable"); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; - return N * - TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); + assert(!VF.isScalable() && "VF is assumed to be non scalable"); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; + return N * + TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); - if (getVectorIntrinsicIDForCall(CI, TLI)) { - InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); - return std::min(CallCost, IntrinsicCost); - } + InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); + if (getVectorIntrinsicIDForCall(CI, TLI)) { + InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); + return std::min(CallCost, IntrinsicCost); + } return CallCost; } - case Instruction::ExtractValue: - return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); + case Instruction::ExtractValue: + return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( - Instruction::Mul, VectorTy, CostKind) + + return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( + Instruction::Mul, VectorTy, CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -7515,7 +7515,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // detection. for (auto &Reduction : Legal->getReductionVars()) { RecurrenceDescriptor &RedDes = Reduction.second; - const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); + const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction @@ -7527,43 +7527,43 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } } -void LoopVectorizationCostModel::collectInLoopReductions() { - for (auto &Reduction : Legal->getReductionVars()) { - PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Reduction.second; - - // We don't collect reductions that are type promoted (yet). - if (RdxDesc.getRecurrenceType() != Phi->getType()) - continue; - - // If the target would prefer this reduction to happen "in-loop", then we - // want to record it as such. - unsigned Opcode = RdxDesc.getOpcode(); - if (!PreferInLoopReductions && - !TTI.preferInLoopReduction(Opcode, Phi->getType(), - TargetTransformInfo::ReductionFlags())) - continue; - - // Check that we can correctly put the reductions into the loop, by - // finding the chain of operations that leads from the phi to the loop - // exit value. - SmallVector<Instruction *, 4> ReductionOperations = - RdxDesc.getReductionOpChain(Phi, TheLoop); - bool InLoop = !ReductionOperations.empty(); - if (InLoop) { - InLoopReductionChains[Phi] = ReductionOperations; - // Add the elements to InLoopReductionImmediateChains for cost modelling. - Instruction *LastChain = Phi; - for (auto *I : ReductionOperations) { - InLoopReductionImmediateChains[I] = LastChain; - LastChain = I; - } - } - LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") - << " reduction for phi: " << *Phi << "\n"); - } -} - +void LoopVectorizationCostModel::collectInLoopReductions() { + for (auto &Reduction : Legal->getReductionVars()) { + PHINode *Phi = Reduction.first; + RecurrenceDescriptor &RdxDesc = Reduction.second; + + // We don't collect reductions that are type promoted (yet). 
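// Illustrative sketch (not from LoopVectorize.cpp): the chain bookkeeping in
// collectInLoopReductions records, for every operation of an in-loop
// reduction, its predecessor in the chain so getReductionPatternCost can walk
// back to the phi. A toy version over plain strings, assuming a phi feeding
// two adds (names are hypothetical):
#include <map>
#include <string>
#include <vector>

static std::map<std::string, std::string> exampleImmediateChains() {
  std::vector<std::string> ReductionOperations = {"add1", "add2"};
  std::map<std::string, std::string> ImmediateChains;
  std::string LastChain = "phi";
  for (const std::string &Op : ReductionOperations) {
    ImmediateChains[Op] = LastChain; // add1 -> phi, add2 -> add1
    LastChain = Op;
  }
  return ImmediateChains;
}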
+ if (RdxDesc.getRecurrenceType() != Phi->getType()) + continue; + + // If the target would prefer this reduction to happen "in-loop", then we + // want to record it as such. + unsigned Opcode = RdxDesc.getOpcode(); + if (!PreferInLoopReductions && + !TTI.preferInLoopReduction(Opcode, Phi->getType(), + TargetTransformInfo::ReductionFlags())) + continue; + + // Check that we can correctly put the reductions into the loop, by + // finding the chain of operations that leads from the phi to the loop + // exit value. + SmallVector<Instruction *, 4> ReductionOperations = + RdxDesc.getReductionOpChain(Phi, TheLoop); + bool InLoop = !ReductionOperations.empty(); + if (InLoop) { + InLoopReductionChains[Phi] = ReductionOperations; + // Add the elements to InLoopReductionImmediateChains for cost modelling. + Instruction *LastChain = Phi; + for (auto *I : ReductionOperations) { + InLoopReductionImmediateChains[I] = LastChain; + LastChain = I; + } + } + LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") + << " reduction for phi: " << *Phi << "\n"); + } +} + // TODO: we could return a pair of values that specify the max VF and // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment @@ -7577,40 +7577,40 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - assert(!UserVF.isScalable() && "scalable vectors not yet supported"); - ElementCount VF = UserVF; +LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { + assert(!UserVF.isScalable() && "scalable vectors not yet supported"); + ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in // the vectorization pipeline. - if (!OrigLoop->isInnermost()) { + if (!OrigLoop->isInnermost()) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. - if (UserVF.isZero()) { - VF = ElementCount::getFixed( - determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); + if (UserVF.isZero()) { + VF = ElementCount::getFixed( + determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. - if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { + if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " << "overriding computed VF.\n"); - VF = ElementCount::getFixed(4); + VF = ElementCount::getFixed(4); } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF.getKnownMinValue()) && - "VF needs to be a power of two"); - LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") - << "VF " << VF << " to build VPlans.\n"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") + << "VF " << VF << " to build VPlans.\n"); buildVPlans(VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. 
if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0 /*Cost*/}; } LLVM_DEBUG( @@ -7619,10 +7619,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { - assert(OrigLoop->isInnermost() && "Inner loop expected."); - Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); +Optional<VectorizationFactor> +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + assert(OrigLoop->isInnermost() && "Inner loop expected."); + Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -7640,55 +7640,55 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.invalidateCostModelingDecisions(); } - ElementCount MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF.isNonZero() && "MaxVF is zero."); - - bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); - if (!UserVF.isZero() && - (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { - // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable - // VFs here, this should be reverted to only use legal UserVFs once the - // loop below supports scalable VFs. - ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") - << " VF " << VF << ".\n"); - assert(isPowerOf2_32(VF.getKnownMinValue()) && - "VF needs to be a power of two"); + ElementCount MaxVF = MaybeMaxVF.getValue(); + assert(MaxVF.isNonZero() && "MaxVF is zero."); + + bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); + if (!UserVF.isZero() && + (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { + // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable + // VFs here, this should be reverted to only use legal UserVFs once the + // loop below supports scalable VFs. + ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; + LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") + << " VF " << VF << ".\n"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - CM.selectUserVectorizationFactor(VF); - CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(VF, VF); + CM.selectUserVectorizationFactor(VF); + CM.collectInLoopReductions(); + buildVPlansWithVPRecipes(VF, VF); LLVM_DEBUG(printPlans(dbgs())); - return {{VF, 0}}; + return {{VF, 0}}; } - assert(!MaxVF.isScalable() && - "Scalable vectors not yet supported beyond this point"); + assert(!MaxVF.isScalable() && + "Scalable vectors not yet supported beyond this point"); - for (ElementCount VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { + for (ElementCount VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(VF); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. 
- if (VF.isVector()) + if (VF.isVector()) CM.collectInstsToScalarize(VF); } - CM.collectInLoopReductions(); - - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); + CM.collectInLoopReductions(); + + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF.isScalar()) + if (MaxVF.isScalar()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; @@ -7707,23 +7707,23 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 1. Create a new empty loop. Unlink the old loop and connect the new one. VPCallbackILV CallbackILV(ILV); - assert(BestVF.hasValue() && "Vectorization Factor is missing"); - - VPTransformState State{*BestVF, - BestUF, - OrigLoop, - LI, - DT, - ILV.Builder, - ILV.VectorLoopValueMap, - &ILV, - CallbackILV}; + assert(BestVF.hasValue() && "Vectorization Factor is missing"); + + VPTransformState State{*BestVF, + BestUF, + OrigLoop, + LI, + DT, + ILV.Builder, + ILV.VectorLoopValueMap, + &ILV, + CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; - ILV.printDebugTracesAtStart(); - + ILV.printDebugTracesAtStart(); + //===------------------------------------------------===// // // Notice: any optimization or new instruction that go @@ -7739,48 +7739,48 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. ILV.fixVectorizedLoop(); - - ILV.printDebugTracesAtEnd(); + + ILV.printDebugTracesAtEnd(); } void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl<Instruction *> &DeadInstructions) { - // We create new control-flow for the vectorized loop, so the original exit - // conditions will be dead after vectorization if it's only used by the - // terminator - SmallVector<BasicBlock*> ExitingBlocks; - OrigLoop->getExitingBlocks(ExitingBlocks); - for (auto *BB : ExitingBlocks) { - auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); - if (!Cmp || !Cmp->hasOneUse()) - continue; - - // TODO: we should introduce a getUniqueExitingBlocks on Loop - if (!DeadInstructions.insert(Cmp).second) - continue; - - // The operands of the icmp is often a dead trunc, used by IndUpdate. - // TODO: can recurse through operands in general - for (Value *Op : Cmp->operands()) { - if (isa<TruncInst>(Op) && Op->hasOneUse()) - DeadInstructions.insert(cast<Instruction>(Op)); - } - } - + // We create new control-flow for the vectorized loop, so the original exit + // conditions will be dead after vectorization if it's only used by the + // terminator + SmallVector<BasicBlock*> ExitingBlocks; + OrigLoop->getExitingBlocks(ExitingBlocks); + for (auto *BB : ExitingBlocks) { + auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); + if (!Cmp || !Cmp->hasOneUse()) + continue; + + // TODO: we should introduce a getUniqueExitingBlocks on Loop + if (!DeadInstructions.insert(Cmp).second) + continue; + + // The operands of the icmp is often a dead trunc, used by IndUpdate. 
+ // TODO: can recurse through operands in general + for (Value *Op : Cmp->operands()) { + if (isa<TruncInst>(Op) && Op->hasOneUse()) + DeadInstructions.insert(cast<Instruction>(Op)); + } + } + // We create new "steps" for induction variable updates to which the original // induction variables map. An original update instruction will be dead if // all its users except the induction variable are dead. - auto *Latch = OrigLoop->getLoopLatch(); + auto *Latch = OrigLoop->getLoopLatch(); for (auto &Induction : Legal->getInductionVars()) { PHINode *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - - // If the tail is to be folded by masking, the primary induction variable, - // if exists, isn't dead: it will be used for masking. Don't kill it. - if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) - continue; - + + // If the tail is to be folded by masking, the primary induction variable, + // if exists, isn't dead: it will be used for masking. Don't kill it. + if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) + continue; + if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) @@ -7855,284 +7855,284 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -//===--------------------------------------------------------------------===// -// EpilogueVectorizerMainLoop -//===--------------------------------------------------------------------===// - -/// This function is partially responsible for generating the control flow -/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { - MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton(""); - - // Generate the code to check the minimum iteration count of the vector - // epilogue (see below). - EPI.EpilogueIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); - EPI.EpilogueIterationCountCheck->setName("iter.check"); - - // Generate the code to check any assumptions that we've made for SCEV - // expressions. - BasicBlock *SavedPreHeader = LoopVectorPreHeader; - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.SCEVSafetyCheck = SavedPreHeader; - - // Generate the code that checks at runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - SavedPreHeader = LoopVectorPreHeader; - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save/overwite it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.MemSafetyCheck = SavedPreHeader; - - // Generate the iteration count check for the main loop, *after* the check - // for the epilogue loop, so that the path-length is shorter for the case - // that goes directly through the vector epilogue. The longer-path length for - // the main loop is compensated for, by the gain from vectorizing the larger - // trip count. Note: the branch will get updated later on when we vectorize - // the epilogue. - EPI.MainLoopIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); - - // Generate the induction variable. 
- OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - EPI.VectorTripCount = CountRoundDown; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - - // Skip induction resume value creation here because they will be created in - // the second pass. If we created them here, they wouldn't be used anyway, - // because the vplan in the second pass still contains the inductions from the - // original loop. - - return completeLoopSkeleton(Lp, OrigLoopID); -} - -void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { - LLVM_DEBUG({ - dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" - << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() - << ", Main Loop UF:" << EPI.MainLoopUF - << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; - }); -} - -void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { - DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; - }); -} - -BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( - Loop *L, BasicBlock *Bypass, bool ForEpilogue) { - assert(L && "Expected valid Loop."); - assert(Bypass && "Expected valid bypass basic block."); - unsigned VFactor = - ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); - unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(L); - // Reuse existing vector loop preheader for TC checks. - // Note that new preheader block is generated for vector loop. - BasicBlock *const TCCheckBlock = LoopVectorPreHeader; - IRBuilder<> Builder(TCCheckBlock->getTerminator()); - - // Generate code to check if the loop's trip count is less than VF * UF of the - // main vector loop. - auto P = - Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - - Value *CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), - "min.iters.check"); - - if (!ForEpilogue) - TCCheckBlock->setName("vector.main.loop.iter.check"); - - // Create new preheader for vector loop. - LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), - DT, LI, nullptr, "vector.ph"); - - if (ForEpilogue) { - assert(DT->properlyDominates(DT->getNode(TCCheckBlock), - DT->getNode(Bypass)->getIDom()) && - "TC check is expected to dominate Bypass"); - - // Update dominator for Bypass & LoopExit. - DT->changeImmediateDominator(Bypass, TCCheckBlock); - DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); - - LoopBypassBlocks.push_back(TCCheckBlock); - - // Save the trip count so we don't have to regenerate it in the - // vec.epilog.iter.check. This is safe to do because the trip count - // generated here dominates the vector epilog iter check. 
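// Illustrative sketch (not from LoopVectorize.cpp): the minimum-iteration check
// emitted above branches around the main vector loop whenever the trip count
// cannot cover VF * UF iterations, using <= instead of < when a scalar
// epilogue must run. A standalone model with hypothetical factors:
static bool exampleSkipMainVectorLoop(unsigned TripCount,
                                      bool RequiresScalarEpilogue) {
  const unsigned VF = 8, UF = 2;  // hypothetical main-loop factors
  unsigned MinIters = VF * UF;    // 16 iterations consumed per vector step
  return RequiresScalarEpilogue ? TripCount <= MinIters  // ICMP_ULE
                                : TripCount < MinIters;  // ICMP_ULT
}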
- EPI.TripCount = Count; - } - - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); - - return TCCheckBlock; -} - -//===--------------------------------------------------------------------===// -// EpilogueVectorizerEpilogueLoop -//===--------------------------------------------------------------------===// - -/// This function is partially responsible for generating the control flow -/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock * -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { - MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton("vec.epilog."); - - // Now, compare the remaining count and if there aren't enough iterations to - // execute the vectorized epilogue skip to the scalar part. - BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; - VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); - LoopVectorPreHeader = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, "vec.epilog.ph"); - emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, - VecEpilogueIterationCountCheck); - - // Adjust the control flow taking the state info from the main loop - // vectorization into account. - assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && - "expected this to be saved from the previous pass."); - EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopVectorPreHeader); - - DT->changeImmediateDominator(LoopVectorPreHeader, - EPI.MainLoopIterationCountCheck); - - EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); - - if (EPI.SCEVSafetyCheck) - EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); - if (EPI.MemSafetyCheck) - EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); - - DT->changeImmediateDominator( - VecEpilogueIterationCountCheck, - VecEpilogueIterationCountCheck->getSinglePredecessor()); - - DT->changeImmediateDominator(LoopScalarPreHeader, - EPI.EpilogueIterationCountCheck); - DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); - - // Keep track of bypass blocks, as they feed start values to the induction - // phis in the scalar loop preheader. - if (EPI.SCEVSafetyCheck) - LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); - if (EPI.MemSafetyCheck) - LoopBypassBlocks.push_back(EPI.MemSafetyCheck); - LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); - - // Generate a resume induction for the vector epilogue and put it in the - // vector epilogue preheader - Type *IdxTy = Legal->getWidestInductionType(); - PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", - LoopVectorPreHeader->getFirstNonPHI()); - EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); - EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), - EPI.MainLoopIterationCountCheck); - - // Generate the induction variable. 
- OldInduction = Legal->getPrimaryInduction(); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); - Value *StartIdx = EPResumeVal; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - - // Generate induction resume values. These variables save the new starting - // indexes for the scalar loop. They are used to test if there are any tail - // iterations left once the vector loop has completed. - // Note that when the vectorized epilogue is skipped due to iteration count - // check, then the resume value for the induction variable comes from - // the trip count of the main vector loop, hence passing the AdditionalBypass - // argument. - createInductionResumeValues(Lp, CountRoundDown, - {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); - - AddRuntimeUnrollDisableMetaData(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); -} - -BasicBlock * -EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( - Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { - - assert(EPI.TripCount && - "Expected trip count to have been safed in the first pass."); - assert( - (!isa<Instruction>(EPI.TripCount) || - DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && - "saved trip count does not dominate insertion point."); - Value *TC = EPI.TripCount; - IRBuilder<> Builder(Insert->getTerminator()); - Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); - - // Generate code to check if the loop's trip count is less than VF * UF of the - // vector epilogue loop. - auto P = - Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - - Value *CheckMinIters = Builder.CreateICmp( - P, Count, - ConstantInt::get(Count->getType(), - EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), - "min.epilog.iters.check"); - - ReplaceInstWithInst( - Insert->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); - - LoopBypassBlocks.push_back(Insert); - return Insert; -} - -void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { - LLVM_DEBUG({ - dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" - << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() - << ", Main Loop UF:" << EPI.MainLoopUF - << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; - }); -} - -void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { - DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; - }); -} - +//===--------------------------------------------------------------------===// +// EpilogueVectorizerMainLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton(""); + + // Generate the code to check the minimum iteration count of the vector + // epilogue (see below). 
+ EPI.EpilogueIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); + EPI.EpilogueIterationCountCheck->setName("iter.check"); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + BasicBlock *SavedPreHeader = LoopVectorPreHeader; + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.SCEVSafetyCheck = SavedPreHeader; + + // Generate the code that checks at runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + SavedPreHeader = LoopVectorPreHeader; + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save/overwite it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.MemSafetyCheck = SavedPreHeader; + + // Generate the iteration count check for the main loop, *after* the check + // for the epilogue loop, so that the path-length is shorter for the case + // that goes directly through the vector epilogue. The longer-path length for + // the main loop is compensated for, by the gain from vectorizing the larger + // trip count. Note: the branch will get updated later on when we vectorize + // the epilogue. + EPI.MainLoopIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + EPI.VectorTripCount = CountRoundDown; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Skip induction resume value creation here because they will be created in + // the second pass. If we created them here, they wouldn't be used anyway, + // because the vplan in the second pass still contains the inductions from the + // original loop. + + return completeLoopSkeleton(Lp, OrigLoopID); +} + +void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + }); +} + +BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( + Loop *L, BasicBlock *Bypass, bool ForEpilogue) { + assert(L && "Expected valid Loop."); + assert(Bypass && "Expected valid bypass basic block."); + unsigned VFactor = + ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); + unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; + Value *Count = getOrCreateTripCount(L); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); + + // Generate code to check if the loop's trip count is less than VF * UF of the + // main vector loop. + auto P = + Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), + "min.iters.check"); + + if (!ForEpilogue) + TCCheckBlock->setName("vector.main.loop.iter.check"); + + // Create new preheader for vector loop. + LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), + DT, LI, nullptr, "vector.ph"); + + if (ForEpilogue) { + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + LoopBypassBlocks.push_back(TCCheckBlock); + + // Save the trip count so we don't have to regenerate it in the + // vec.epilog.iter.check. This is safe to do because the trip count + // generated here dominates the vector epilog iter check. + EPI.TripCount = Count; + } + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + return TCCheckBlock; +} + +//===--------------------------------------------------------------------===// +// EpilogueVectorizerEpilogueLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +BasicBlock * +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton("vec.epilog."); + + // Now, compare the remaining count and if there aren't enough iterations to + // execute the vectorized epilogue skip to the scalar part. + BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; + VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); + LoopVectorPreHeader = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "vec.epilog.ph"); + emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, + VecEpilogueIterationCountCheck); + + // Adjust the control flow taking the state info from the main loop + // vectorization into account. + assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && + "expected this to be saved from the previous pass."); + EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopVectorPreHeader); + + DT->changeImmediateDominator(LoopVectorPreHeader, + EPI.MainLoopIterationCountCheck); + + EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + if (EPI.SCEVSafetyCheck) + EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + if (EPI.MemSafetyCheck) + EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + DT->changeImmediateDominator( + VecEpilogueIterationCountCheck, + VecEpilogueIterationCountCheck->getSinglePredecessor()); + + DT->changeImmediateDominator(LoopScalarPreHeader, + EPI.EpilogueIterationCountCheck); + DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); + + // Keep track of bypass blocks, as they feed start values to the induction + // phis in the scalar loop preheader. 
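
[Editorial note] Taken together, the two emitMinimumIterationCountCheck calls and the CFG rewiring above route a given trip count to exactly one entry point: too few iterations even for the epilogue VF*UF goes to the scalar loop, enough for the epilogue but not for the main VF*UF goes straight to the vector epilogue (after the second pass redirects that branch to vec.epilog.ph), and otherwise the main vector loop runs. The following is a minimal standalone sketch of that routing in plain C++, not the LLVM API; function and enum names are invented for illustration, and whether the epilogue also runs after the main loop is decided separately by "min.epilog.iters.check".

#include <cstdint>
#include <cstdio>

enum class Entry { ScalarLoop, VectorEpilogue, MainVectorLoop };

static Entry routeTripCount(uint64_t TC, uint64_t MainVFxUF,
                            uint64_t EpilogueVFxUF,
                            bool RequiresScalarEpilogue) {
  // ICMP_ULE when a scalar epilogue must execute at least once, ICMP_ULT otherwise.
  auto TooFew = [&](uint64_t Threshold) {
    return RequiresScalarEpilogue ? TC <= Threshold : TC < Threshold;
  };
  if (TooFew(EpilogueVFxUF))
    return Entry::ScalarLoop;      // "iter.check" branches to the scalar preheader
  if (TooFew(MainVFxUF))
    return Entry::VectorEpilogue;  // rewired in the second pass to "vec.epilog.ph"
  return Entry::MainVectorLoop;
}

int main() {
  std::printf("%d %d %d\n",
              static_cast<int>(routeTripCount(100, 16, 4, false)),  // main vector loop
              static_cast<int>(routeTripCount(10, 16, 4, false)),   // vector epilogue only
              static_cast<int>(routeTripCount(3, 16, 4, false)));   // scalar loop only
}
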
+ if (EPI.SCEVSafetyCheck) + LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); + if (EPI.MemSafetyCheck) + LoopBypassBlocks.push_back(EPI.MemSafetyCheck); + LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); + + // Generate a resume induction for the vector epilogue and put it in the + // vector epilogue preheader + Type *IdxTy = Legal->getWidestInductionType(); + PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", + LoopVectorPreHeader->getFirstNonPHI()); + EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); + EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), + EPI.MainLoopIterationCountCheck); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *StartIdx = EPResumeVal; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Generate induction resume values. These variables save the new starting + // indexes for the scalar loop. They are used to test if there are any tail + // iterations left once the vector loop has completed. + // Note that when the vectorized epilogue is skipped due to iteration count + // check, then the resume value for the induction variable comes from + // the trip count of the main vector loop, hence passing the AdditionalBypass + // argument. + createInductionResumeValues(Lp, CountRoundDown, + {VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); + + AddRuntimeUnrollDisableMetaData(Lp); + return completeLoopSkeleton(Lp, OrigLoopID); +} + +BasicBlock * +EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( + Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { + + assert(EPI.TripCount && + "Expected trip count to have been safed in the first pass."); + assert( + (!isa<Instruction>(EPI.TripCount) || + DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && + "saved trip count does not dominate insertion point."); + Value *TC = EPI.TripCount; + IRBuilder<> Builder(Insert->getTerminator()); + Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); + + // Generate code to check if the loop's trip count is less than VF * UF of the + // vector epilogue loop. + auto P = + Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, + ConstantInt::get(Count->getType(), + EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), + "min.epilog.iters.check"); + + ReplaceInstWithInst( + Insert->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + LoopBypassBlocks.push_back(Insert); + return Insert; +} + +void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + }); +} + bool LoopVectorizationPlanner::getDecisionAndClampRange( - const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { - assert(!Range.isEmpty() && "Trying to test an empty VF range."); + const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { + assert(!Range.isEmpty() && "Trying to test an empty VF range."); bool PredicateAtRangeStart = Predicate(Range.Start); - for (ElementCount TmpVF = Range.Start * 2; - ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) + for (ElementCount TmpVF = Range.Start * 2; + ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; @@ -8146,11 +8146,11 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( /// of VF's starting at a given VF and extending it as much as possible. Each /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, - ElementCount MaxVF) { - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; +void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, + ElementCount MaxVF) { + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { + VFRange SubRange = {VF, MaxVFPlusOne}; VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } @@ -8175,27 +8175,27 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; - // If source is an exiting block, we know the exit edge is dynamically dead - // in the vector loop, and thus we don't need to restrict the mask. Avoid - // adding uses of an otherwise potentially dead instruction. - if (OrigLoop->isLoopExiting(Src)) - return EdgeMaskCache[Edge] = SrcMask; - - VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); + // If source is an exiting block, we know the exit edge is dynamically dead + // in the vector loop, and thus we don't need to restrict the mask. Avoid + // adding uses of an otherwise potentially dead instruction. 
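
[Editorial note] The compare above works on what is left after the main vector loop: assuming getOrCreateVectorTripCount rounds the original trip count down to a multiple of the main loop's VF * UF (as the CountRoundDown name suggests), "n.vec.remaining" is that remainder, and the vector epilogue is entered only when the remainder covers at least one epilogue iteration bundle. A minimal arithmetic sketch, not the LLVM API:

#include <cassert>
#include <cstdint>

// Models "n.vec.remaining" and "min.epilog.iters.check" with the ULT predicate;
// requiresScalarEpilogue() would turn the compare into ULE so that at least one
// scalar iteration survives.
static bool enterVectorEpilogue(uint64_t TC, uint64_t MainVFxUF,
                                uint64_t EpilogueVFxUF) {
  uint64_t VectorTripCount = TC - TC % MainVFxUF; // iterations the main loop took
  uint64_t Remaining = TC - VectorTripCount;      // n.vec.remaining
  return Remaining >= EpilogueVFxUF;              // !(Remaining < EpilogueVF * EpilogueUF)
}

int main() {
  assert(enterVectorEpilogue(100, 16, 4));   // 4 left: run the vector epilogue
  assert(!enterVectorEpilogue(97, 16, 4));   // 1 left: straight to the scalar loop
  assert(!enterVectorEpilogue(35, 16, 8));   // 3 left, but the epilogue needs 8
  return 0;
}
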
+ if (OrigLoop->isLoopExiting(Src)) + return EdgeMaskCache[Edge] = SrcMask; + + VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) EdgeMask = Builder.createNot(EdgeMask); - if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. - // The condition is 'SrcMask && EdgeMask', which is equivalent to - // 'select i1 SrcMask, i1 EdgeMask, i1 false'. - // The select version does not introduce new UB if SrcMask is false and - // EdgeMask is poison. Using 'and' here introduces undefined behavior. - VPValue *False = Plan->getOrAddVPValue( - ConstantInt::getFalse(BI->getCondition()->getType())); - EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); - } + if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. + // The condition is 'SrcMask && EdgeMask', which is equivalent to + // 'select i1 SrcMask, i1 EdgeMask, i1 false'. + // The select version does not introduce new UB if SrcMask is false and + // EdgeMask is poison. Using 'and' here introduces undefined behavior. + VPValue *False = Plan->getOrAddVPValue( + ConstantInt::getFalse(BI->getCondition()->getType())); + EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + } return EdgeMaskCache[Edge] = EdgeMask; } @@ -8216,34 +8216,34 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredication(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - // Create the block in mask as the first non-phi instruction in the block. - VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - + // Create the block in mask as the first non-phi instruction in the block. + VPBuilder::InsertPointGuard Guard(Builder); + auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); + Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. // Start by constructing the desired canonical IV. VPValue *IV = nullptr; if (Legal->getPrimaryInduction()) - IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); + IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { auto IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); + Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); IV = IVRecipe->getVPValue(); } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); - - if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { - // While ActiveLaneMask is a binary op that consumes the loop tripcount - // as a second argument, we only pass the IV here and extract the - // tripcount from the transform state where codegen of the VP instructions - // happen. - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); - } else { + + if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { + // While ActiveLaneMask is a binary op that consumes the loop tripcount + // as a second argument, we only pass the IV here and extract the + // tripcount from the transform state where codegen of the VP instructions + // happen. 
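
[Editorial note] The select form used above is exactly what the comment is protecting against: with 'and', a poison EdgeMask would poison the combined mask even for lanes whose SrcMask is already false, while 'select i1 SrcMask, i1 EdgeMask, i1 false' never consults the edge mask on those lanes. A scalar sketch using std::optional as a stand-in for a possibly-poison i1; this is illustrative only, not LLVM's poison machinery:

#include <cassert>
#include <optional>

using MaybeBool = std::optional<bool>; // std::nullopt plays the role of poison

// 'and %src, %edge': poison in either operand makes the result poison.
static MaybeBool andMask(MaybeBool Src, MaybeBool Edge) {
  if (!Src || !Edge)
    return std::nullopt;
  return *Src && *Edge;
}

// 'select i1 %src, i1 %edge, i1 false': when the source mask is false, the edge
// mask is ignored, so poison there cannot leak into the block mask.
static MaybeBool selectMask(MaybeBool Src, MaybeBool Edge) {
  if (!Src)
    return std::nullopt;
  return *Src ? Edge : MaybeBool(false);
}

int main() {
  const MaybeBool Poison = std::nullopt;
  assert(!andMask(false, Poison).has_value());           // poison escapes
  assert(selectMask(false, Poison) == MaybeBool(false));  // masked lane stays off
  return 0;
}
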
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); + } else { BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); - } + } return BlockMaskCache[BB] = BlockMask; } @@ -8264,13 +8264,13 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Must be called with either a load or store"); - auto willWiden = [&](ElementCount VF) -> bool { - if (VF.isScalar()) + auto willWiden = [&](ElementCount VF) -> bool { + if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); @@ -8301,22 +8301,22 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { +VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. InductionDescriptor II = Legal->getInductionVars().lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return new VPWidenIntOrFpInductionRecipe(Phi, Start); - } + II.getKind() == InductionDescriptor::IK_FpInduction) { + VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); + return new VPWidenIntOrFpInductionRecipe(Phi, Start); + } return nullptr; } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, - VPlan &Plan) const { +VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, + VPlan &Plan) const { // Optimize the special case where the source is a constant integer // induction variable. Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -8325,21 +8325,21 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, // Determine whether \p K is a truncation based on an induction variable that // can be optimized. 
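
[Editorial note] The IV <= BTC form used for the header mask above ("TC may wrap, unlike BTC") is easiest to see with a narrow induction type: a loop covering the full range of an 8-bit IV has a backedge-taken count of 255 but a trip count of 256, which wraps to 0 in the IV's type. A tiny standalone illustration; the 8-bit width is only there to make the wrap visible:

#include <cassert>
#include <cstdint>

int main() {
  uint8_t BTC = 255;                          // backedge-taken count
  uint8_t TC = static_cast<uint8_t>(BTC + 1); // trip count wraps to 0

  uint8_t IV = 10;                 // some active lane's induction value
  bool MaskFromBTC = (IV <= BTC);  // correct: the lane is inside the loop
  bool MaskFromTC = (IV < TC);     // wrong: 10 < 0 is false, lane masked off

  assert(MaskFromBTC);
  assert(!MaskFromTC);
  return 0;
}
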
auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function<bool(ElementCount)> { - return [=](ElementCount VF) -> bool { - return CM.isOptimizableIVTruncate(K, VF); - }; + [&](Instruction *K) -> std::function<bool(ElementCount)> { + return [=](ElementCount VF) -> bool { + return CM.isOptimizableIVTruncate(K, VF); + }; }; if (LoopVectorizationPlanner::getDecisionAndClampRange( - isOptimizableIVTruncate(I), Range)) { - - InductionDescriptor II = - Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); - VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); + isOptimizableIVTruncate(I), Range)) { + + InductionDescriptor II = + Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); + VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - Start, I); - } + Start, I); + } return nullptr; } @@ -8368,9 +8368,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { - return CM.isScalarWithPredication(CI, VF); - }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -8378,23 +8378,23 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || - ID == Intrinsic::pseudoprobe || - ID == Intrinsic::experimental_noalias_scope_decl)) + ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || + ID == Intrinsic::pseudoprobe || + ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - auto willWiden = [&](ElementCount VF) -> bool { + auto willWiden = [&](ElementCount VF) -> bool { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize = false; - InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); - InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; - bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; - assert(IntrinsicCost.isValid() && CallCost.isValid() && - "Cannot have invalid costs while widening"); + InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; + bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; + assert(IntrinsicCost.isValid() && CallCost.isValid() && + "Cannot have invalid costs while widening"); return UseVectorIntrinsic || !NeedToScalarize; }; @@ -8409,7 +8409,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { !isa<StoreInst>(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. 
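
[Editorial note] The willWiden lambda in tryToWidenCall above comes down to a cost comparison: prefer the vector intrinsic when one exists and is no more expensive than a vectorized library call, and otherwise widen only if a vector call variant is available at all. A small model of that predicate in plain C++; the cost numbers stand in for CM.getVectorCallCost / CM.getVectorIntrinsicCost and the struct is invented for the sketch:

#include <cassert>
#include <cstdint>

struct CallCosts {
  bool HasVectorIntrinsic; // getVectorIntrinsicIDForCall returned a valid ID
  uint64_t IntrinsicCost;  // cost of the vectorized intrinsic
  uint64_t CallCost;       // cost of a vectorized library call
  bool NeedToScalarize;    // no vector variant of the call is available
};

// Mirrors: UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
//          widen iff UseVectorIntrinsic || !NeedToScalarize.
static bool willWidenCall(const CallCosts &C) {
  bool UseVectorIntrinsic = C.HasVectorIntrinsic && C.IntrinsicCost <= C.CallCost;
  return UseVectorIntrinsic || !C.NeedToScalarize;
}

int main() {
  assert(willWidenCall({true, 4, 10, true}));   // cheap intrinsic wins
  assert(willWidenCall({false, 0, 6, false}));  // vector library call is available
  assert(!willWidenCall({false, 0, 0, true}));  // nothing to widen with
  return 0;
}
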
- auto WillScalarize = [this, I](ElementCount VF) -> bool { + auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I, VF); @@ -8472,17 +8472,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, - Range); + [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, + Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); setRecipe(I, Recipe); - Plan->addVPValue(I, Recipe); + Plan->addVPValue(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. Avoid hoisting the insert-element which packs the scalar value into @@ -8524,9 +8524,9 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, assert(Instr->getParent() && "Predicated instruction not in any basic block"); auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); - auto *PHIRecipe = Instr->getType()->isVoidTy() - ? nullptr - : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); + auto *PHIRecipe = Instr->getType()->isVoidTy() + ? nullptr + : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); @@ -8554,21 +8554,21 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (auto Phi = dyn_cast<PHINode>(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Plan); - if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) + if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) return Recipe; - - if (Legal->isReductionVariable(Phi)) { - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; - VPValue *StartV = - Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); - return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); - } - + + if (Legal->isReductionVariable(Phi)) { + RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + VPValue *StartV = + Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); + return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); + } + return new VPWidenPHIRecipe(Phi); } - if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( - cast<TruncInst>(Instr), Range, *Plan))) + if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( + cast<TruncInst>(Instr), Range, *Plan))) return Recipe; if (!shouldWiden(Instr, Range)) @@ -8588,9 +8588,9 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return tryToWiden(Instr, *Plan); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { - assert(OrigLoop->isInnermost() && "Inner loop expected."); +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, + ElementCount MaxVF) { + assert(OrigLoop->isInnermost() && "Inner loop 
expected."); // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -8613,17 +8613,17 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, for (Instruction *I : DeadInstructions) SinkAfter.erase(I); - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { + VFRange SubRange = {VF, MaxVFPlusOne}; + VPlans.push_back( + buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); VF = SubRange.End; } } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const DenseMap<Instruction *, Instruction *> &SinkAfter) { // Hold a mapping from predicated instructions to their recipes, in order to @@ -8646,28 +8646,28 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder.recordRecipeOf(Entry.first); RecipeBuilder.recordRecipeOf(Entry.second); } - for (auto &Reduction : CM.getInLoopReductionChains()) { - PHINode *Phi = Reduction.first; - RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); - const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; - - RecipeBuilder.recordRecipeOf(Phi); - for (auto &R : ReductionOperations) { - RecipeBuilder.recordRecipeOf(R); - // For min/max reducitons, where we have a pair of icmp/select, we also - // need to record the ICmp recipe, so it can be removed later. - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) - RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); - } - } - + for (auto &Reduction : CM.getInLoopReductionChains()) { + PHINode *Phi = Reduction.first; + RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); + const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; + + RecipeBuilder.recordRecipeOf(Phi); + for (auto &R : ReductionOperations) { + RecipeBuilder.recordRecipeOf(R); + // For min/max reducitons, where we have a pair of icmp/select, we also + // need to record the ICmp recipe, so it can be removed later. + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) + RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); + } + } + // For each interleave group which is relevant for this (possibly trimmed) // Range, add it to the set of groups to be later applied to the VPlan and add // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. 
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { - return (VF.isVector() && // Query is illegal for VF == 1 + auto applyIG = [IG, this](ElementCount VF) -> bool { + return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); }; @@ -8715,11 +8715,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (auto Recipe = RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { - for (auto *Def : Recipe->definedValues()) { - auto *UV = Def->getUnderlyingValue(); - Plan->addVPValue(UV, Def); - } - + for (auto *Def : Recipe->definedValues()) { + auto *UV = Def->getUnderlyingValue(); + Plan->addVPValue(UV, Def); + } + RecipeBuilder.setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); continue; @@ -8755,18 +8755,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( for (auto &Entry : SinkAfter) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); - // If the target is in a replication region, make sure to move Sink to the - // block after it, not into the replication region itself. - if (auto *Region = - dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { - if (Region->isReplicator()) { - assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); - VPBasicBlock *NextBlock = - cast<VPBasicBlock>(Region->getSuccessors().front()); - Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); - continue; - } - } + // If the target is in a replication region, make sure to move Sink to the + // block after it, not into the replication region itself. + if (auto *Region = + dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { + if (Region->isReplicator()) { + assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); + VPBasicBlock *NextBlock = + cast<VPBasicBlock>(Region->getSuccessors().front()); + Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); + continue; + } + } Sink->moveAfter(Target); } @@ -8776,52 +8776,52 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( for (auto IG : InterleaveGroups) { auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( RecipeBuilder.getRecipe(IG->getInsertPos())); - SmallVector<VPValue *, 4> StoredValues; - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) - StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); - - auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, - Recipe->getMask()); - VPIG->insertBefore(Recipe); - unsigned J = 0; + SmallVector<VPValue *, 4> StoredValues; + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) + StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); + + auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, + Recipe->getMask()); + VPIG->insertBefore(Recipe); + unsigned J = 0; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *Member = IG->getMember(i)) { - if (!Member->getType()->isVoidTy()) { - VPValue *OriginalV = Plan->getVPValue(Member); - Plan->removeVPValueFor(Member); - Plan->addVPValue(Member, VPIG->getVPValue(J)); - OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); - J++; - } + if (!Member->getType()->isVoidTy()) { + VPValue *OriginalV = Plan->getVPValue(Member); + Plan->removeVPValueFor(Member); + 
Plan->addVPValue(Member, VPIG->getVPValue(J)); + OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); + J++; + } RecipeBuilder.getRecipe(Member)->eraseFromParent(); } } - // Adjust the recipes for any inloop reductions. - if (Range.Start.isVector()) - adjustRecipesForInLoopReductions(Plan, RecipeBuilder); - + // Adjust the recipes for any inloop reductions. + if (Range.Start.isVector()) + adjustRecipesForInLoopReductions(Plan, RecipeBuilder); + // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. - if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { + if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { Builder.setInsertPoint(VPBB); auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); for (auto &Reduction : Legal->getReductionVars()) { - if (CM.isInLoopReduction(Reduction.first)) - continue; - VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); - VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); + if (CM.isInLoopReduction(Reduction.first)) + continue; + VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); + VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); } } std::string PlanName; raw_string_ostream RSO(PlanName); - ElementCount VF = Range.Start; + ElementCount VF = Range.Start; Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { + for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -8837,7 +8837,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in // the vectorization pipeline. - assert(!OrigLoop->isInnermost()); + assert(!OrigLoop->isInnermost()); assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan @@ -8847,8 +8847,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); - for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); - VF *= 2) + for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); + VF *= 2) Plan->addVF(VF); if (EnableVPlanPredication) { @@ -8866,67 +8866,67 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { return Plan; } -// Adjust the recipes for any inloop reductions. The chain of instructions -// leading from the loop exit instr to the phi need to be converted to -// reductions, with one operand being vector and the other being the scalar -// reduction chain. -void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( - VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { - for (auto &Reduction : CM.getInLoopReductionChains()) { - PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; - const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; - - // ReductionOperations are orders top-down from the phi's use to the - // LoopExitValue. We keep a track of the previous item (the Chain) to tell - // which of the two operands will remain scalar and which will be reduced. - // For minmax the chain will be the select instructions. 
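
[Editorial note] The selects appended at the latch above keep folded-tail lanes out of a reduction: per lane, the value feeding the next phi is the freshly computed reduction value where the header mask is true, and the previous phi value elsewhere. A per-lane sketch with four lanes and an add reduction; the arrays are illustrative stand-ins for the widened values:

#include <array>
#include <cassert>

int main() {
  constexpr int VF = 4;
  std::array<int, VF> Phi = {10, 20, 30, 40};       // reduction phi, per lane
  std::array<int, VF> Red = {11, 22, 33, 44};       // live-out of this iteration
  std::array<bool, VF> Cond = {true, true, false, false}; // header block mask

  // select(Cond, Red, Phi): masked lanes keep their previous partial value.
  std::array<int, VF> Next;
  for (int L = 0; L < VF; ++L)
    Next[L] = Cond[L] ? Red[L] : Phi[L];

  assert(Next[1] == 22 && Next[2] == 30);
  return 0;
}
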
- Instruction *Chain = Phi; - for (Instruction *R : ReductionOperations) { - VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); - RecurKind Kind = RdxDesc.getRecurrenceKind(); - - VPValue *ChainOp = Plan->getVPValue(Chain); - unsigned FirstOpId; - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - assert(isa<VPWidenSelectRecipe>(WidenRecipe) && - "Expected to replace a VPWidenSelectSC"); - FirstOpId = 1; - } else { - assert(isa<VPWidenRecipe>(WidenRecipe) && - "Expected to replace a VPWidenSC"); - FirstOpId = 0; - } - unsigned VecOpId = - R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; - VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); - - auto *CondOp = CM.foldTailByMasking() - ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) - : nullptr; - VPReductionRecipe *RedRecipe = new VPReductionRecipe( - &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); - WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); - Plan->removeVPValueFor(R); - Plan->addVPValue(R, RedRecipe); - WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); - WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); - WidenRecipe->eraseFromParent(); - - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - VPRecipeBase *CompareRecipe = - RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); - assert(isa<VPWidenRecipe>(CompareRecipe) && - "Expected to replace a VPWidenSC"); - assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && - "Expected no remaining users"); - CompareRecipe->eraseFromParent(); - } - Chain = R; - } - } -} - +// Adjust the recipes for any inloop reductions. The chain of instructions +// leading from the loop exit instr to the phi need to be converted to +// reductions, with one operand being vector and the other being the scalar +// reduction chain. +void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( + VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { + for (auto &Reduction : CM.getInLoopReductionChains()) { + PHINode *Phi = Reduction.first; + RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; + + // ReductionOperations are orders top-down from the phi's use to the + // LoopExitValue. We keep a track of the previous item (the Chain) to tell + // which of the two operands will remain scalar and which will be reduced. + // For minmax the chain will be the select instructions. + Instruction *Chain = Phi; + for (Instruction *R : ReductionOperations) { + VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); + RecurKind Kind = RdxDesc.getRecurrenceKind(); + + VPValue *ChainOp = Plan->getVPValue(Chain); + unsigned FirstOpId; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + assert(isa<VPWidenSelectRecipe>(WidenRecipe) && + "Expected to replace a VPWidenSelectSC"); + FirstOpId = 1; + } else { + assert(isa<VPWidenRecipe>(WidenRecipe) && + "Expected to replace a VPWidenSC"); + FirstOpId = 0; + } + unsigned VecOpId = + R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; + VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); + + auto *CondOp = CM.foldTailByMasking() + ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) + : nullptr; + VPReductionRecipe *RedRecipe = new VPReductionRecipe( + &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); + WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + Plan->removeVPValueFor(R); + Plan->addVPValue(R, RedRecipe); + WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); + WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + WidenRecipe->eraseFromParent(); + + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + VPRecipeBase *CompareRecipe = + RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); + assert(isa<VPWidenRecipe>(CompareRecipe) && + "Expected to replace a VPWidenSC"); + assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && + "Expected no remaining users"); + CompareRecipe->eraseFromParent(); + } + Chain = R; + } + } +} + Value* LoopVectorizationPlanner::VPCallbackILV:: getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); @@ -8954,35 +8954,35 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } void VPWidenCallRecipe::execute(VPTransformState &State) { - State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, - *this, State); + State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, + *this, State); } void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), - this, *this, InvariantCond, State); + State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), + this, *this, InvariantCond, State); } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); + State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, - *this, State.UF, State.VF, IsPtrLoopInvariant, + State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, + *this, State.UF, State.VF, IsPtrLoopInvariant, IsIndexLoopInvariant, State); } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), - Trunc); + State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), + Trunc); } void VPWidenPHIRecipe::execute(VPTransformState &State) { - Value *StartV = - getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; - State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); + Value *StartV = + getStartValue() ? 
getStartValue()->getLiveInIRValue() : nullptr; + State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); } void VPBlendRecipe::execute(VPTransformState &State) { @@ -9026,59 +9026,59 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), - getStoredValues(), getMask()); -} - -void VPReductionRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "Reduction being replicated."); - for (unsigned Part = 0; Part < State.UF; ++Part) { - RecurKind Kind = RdxDesc->getRecurrenceKind(); - Value *NewVecOp = State.get(getVecOp(), Part); - if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.get(Cond, Part); - VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( - Kind, VecTy->getElementType()); - Constant *IdenVec = - ConstantVector::getSplat(VecTy->getElementCount(), Iden); - Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); - NewVecOp = Select; - } - Value *NewRed = - createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); - Value *PrevInChain = State.get(getChainOp(), Part); - Value *NextInChain; - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - NextInChain = - createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), - NewRed, PrevInChain); - } else { - NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, - PrevInChain); - } - State.set(this, getUnderlyingInstr(), NextInChain, Part); - } -} - + State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), + getStoredValues(), getMask()); +} + +void VPReductionRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Reduction being replicated."); + for (unsigned Part = 0; Part < State.UF; ++Part) { + RecurKind Kind = RdxDesc->getRecurrenceKind(); + Value *NewVecOp = State.get(getVecOp(), Part); + if (VPValue *Cond = getCondOp()) { + Value *NewCond = State.get(Cond, Part); + VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); + Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + Kind, VecTy->getElementType()); + Constant *IdenVec = + ConstantVector::getSplat(VecTy->getElementCount(), Iden); + Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); + NewVecOp = Select; + } + Value *NewRed = + createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); + Value *PrevInChain = State.get(getChainOp(), Part); + Value *NextInChain; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + NextInChain = + createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), + NewRed, PrevInChain); + } else { + NextInChain = State.Builder.CreateBinOp( + (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, + PrevInChain); + } + State.set(this, getUnderlyingInstr(), NextInChain, Part); + } +} + void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. 
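
[Editorial note] VPReductionRecipe::execute above performs one in-loop reduction step per unrolled part: masked lanes are first replaced with the recurrence identity, the vector is horizontally reduced, and the scalar result is folded into the running chain. A scalar model of one such step for an add reduction; the identity 0 stands in for getRecurrenceIdentity, and min/max recurrences would combine with a min/max instead of the final add:

#include <array>
#include <cassert>

// One part of an in-loop add reduction with a fold-tail mask.
static int reductionStep(int PrevInChain, const std::array<int, 4> &VecOp,
                         const std::array<bool, 4> &Cond) {
  const int Identity = 0; // add identity

  // select(Cond, VecOp, Identity-splat): neutralize masked-off lanes.
  std::array<int, 4> Selected;
  for (int L = 0; L < 4; ++L)
    Selected[L] = Cond[L] ? VecOp[L] : Identity;

  // createTargetReduction: horizontally reduce the vector to a scalar.
  int NewRed = 0;
  for (int V : Selected)
    NewRed += V;

  // Fold into the scalar chain (CreateBinOp in the plain binop case).
  return PrevInChain + NewRed;
}

int main() {
  const std::array<int, 4> VecOp = {1, 2, 3, 4};
  const std::array<bool, 4> Cond = {true, true, true, false};
  assert(reductionStep(100, VecOp, Cond) == 106);
  return 0;
}
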
- assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, - *State.Instance, IsPredicated, State); + assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, + *State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF.isVector()) { - // If we're constructing lane 0, initialize to start from poison. + if (AlsoPack && State.VF.isVector()) { + // If we're constructing lane 0, initialize to start from poison. if (State.Instance->Lane == 0) { - assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); - Value *Poison = PoisonValue::get( - VectorType::get(getUnderlyingValue()->getType(), State.VF)); - State.ValueMap.setVectorValue(getUnderlyingInstr(), - State.Instance->Part, Poison); + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + Value *Poison = PoisonValue::get( + VectorType::get(getUnderlyingValue()->getType(), State.VF)); + State.ValueMap.setVectorValue(getUnderlyingInstr(), + State.Instance->Part, Poison); } - State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(), - *State.Instance); + State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(), + *State.Instance); } return; } @@ -9086,12 +9086,12 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); - assert((!State.VF.isScalable() || IsUniform) && - "Can't scalarize a scalable vector"); + unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); + assert((!State.VF.isScalable() || IsUniform) && + "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, IsPredicated, State); } @@ -9123,8 +9123,8 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); - Instruction *ScalarPredInst = - cast<Instruction>(State.get(getOperand(0), *State.Instance)); + Instruction *ScalarPredInst = + cast<Instruction>(State.get(getOperand(0), *State.Instance)); BasicBlock *PredicatedBB = ScalarPredInst->getParent(); BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); assert(PredicatingBB && "Predicated block has no single predecessor."); @@ -9136,8 +9136,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // also do that packing, thereby "hoisting" the insert-element sequence. // Otherwise, a phi node for the scalar value is needed. 
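
[Editorial note] Outside the single-instance path, the replication above reduces to a pair of nested loops: every unrolled part gets one scalar clone per lane, collapsing to a single clone per part when the instruction is uniform. A sketch of just that iteration shape, counting clones instead of emitting IR; the function name is invented for the sketch:

#include <cstdio>

// How many scalar clones the Part/Lane loops above would generate.
static unsigned replicate(unsigned UF, unsigned VF, bool IsUniform) {
  unsigned Emitted = 0;
  unsigned EndLane = IsUniform ? 1 : VF; // uniform: first lane of each part only
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane) {
      // corresponds to scalarizeInstruction(..., {Part, Lane}, ...)
      ++Emitted;
    }
  return Emitted;
}

int main() {
  std::printf("%u\n", replicate(2, 4, false)); // 8 clones
  std::printf("%u\n", replicate(2, 4, true));  // 2 clones
}
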
unsigned Part = State.Instance->Part; - Instruction *PredInst = - cast<Instruction>(getOperand(0)->getUnderlyingValue()); + Instruction *PredInst = + cast<Instruction>(getOperand(0)->getUnderlyingValue()); if (State.ValueMap.hasVectorValue(PredInst, Part)) { Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); @@ -9148,17 +9148,17 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } else { Type *PredInstType = PredInst->getType(); PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); - Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); + Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); Phi->addIncoming(ScalarPredInst, PredicatedBB); State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); } } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction(&Ingredient, State, - StoredValue ? nullptr : getVPValue(), - getAddr(), StoredValue, getMask()); + VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; + State.ILV->vectorizeMemoryInstruction(&Ingredient, State, + StoredValue ? nullptr : getVPValue(), + getAddr(), StoredValue, getMask()); } // Determine how to lower the scalar epilogue, which depends on 1) optimising @@ -9172,51 +9172,51 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( LoopVectorizationLegality &LVL) { // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. - // (For PGSO, as shouldOptimizeForSize isn't currently accessible from - // LoopAccessInfo (due to code dependency and not being able to reliably get - // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection - // of strides in LoopAccessInfo::analyzeLoop() and vectorize without - // versioning when the vectorization is forced, unlike hasOptSize. So revert - // back to the old way and vectorize with versioning when forced. See D81345.) - if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, - PGSOQueryType::IRPass) && - Hints.getForce() != LoopVectorizeHints::FK_Enabled)) + // (For PGSO, as shouldOptimizeForSize isn't currently accessible from + // LoopAccessInfo (due to code dependency and not being able to reliably get + // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection + // of strides in LoopAccessInfo::analyzeLoop() and vectorize without + // versioning when the vectorization is forced, unlike hasOptSize. So revert + // back to the old way and vectorize with versioning when forced. See D81345.) 
+ if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass) && + Hints.getForce() != LoopVectorizeHints::FK_Enabled)) return CM_ScalarEpilogueNotAllowedOptSize; - // 2) If set, obey the directives - if (PreferPredicateOverEpilogue.getNumOccurrences()) { - switch (PreferPredicateOverEpilogue) { - case PreferPredicateTy::ScalarEpilogue: - return CM_ScalarEpilogueAllowed; - case PreferPredicateTy::PredicateElseScalarEpilogue: - return CM_ScalarEpilogueNotNeededUsePredicate; - case PreferPredicateTy::PredicateOrDontVectorize: - return CM_ScalarEpilogueNotAllowedUsePredicate; - }; - } - - // 3) If set, obey the hints - switch (Hints.getPredicate()) { - case LoopVectorizeHints::FK_Enabled: - return CM_ScalarEpilogueNotNeededUsePredicate; - case LoopVectorizeHints::FK_Disabled: + // 2) If set, obey the directives + if (PreferPredicateOverEpilogue.getNumOccurrences()) { + switch (PreferPredicateOverEpilogue) { + case PreferPredicateTy::ScalarEpilogue: + return CM_ScalarEpilogueAllowed; + case PreferPredicateTy::PredicateElseScalarEpilogue: + return CM_ScalarEpilogueNotNeededUsePredicate; + case PreferPredicateTy::PredicateOrDontVectorize: + return CM_ScalarEpilogueNotAllowedUsePredicate; + }; + } + + // 3) If set, obey the hints + switch (Hints.getPredicate()) { + case LoopVectorizeHints::FK_Enabled: + return CM_ScalarEpilogueNotNeededUsePredicate; + case LoopVectorizeHints::FK_Disabled: return CM_ScalarEpilogueAllowed; - }; + }; - // 4) if the TTI hook indicates this is profitable, request predication. - if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, - LVL.getLAI())) + // 4) if the TTI hook indicates this is profitable, request predication. + if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI())) return CM_ScalarEpilogueNotNeededUsePredicate; return CM_ScalarEpilogueAllowed; } -void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, - unsigned Part) { - set(Def, V, Part); - ILV->setVectorValue(IRDef, Part, V); -} - +void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, + unsigned Part) { + set(Def, V, Part); + ILV->setVectorValue(IRDef, Part, V); +} + // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -9228,7 +9228,7 @@ static bool processLoopInVPlanNativePath( OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { - if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { + if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); return false; } @@ -9247,7 +9247,7 @@ static bool processLoopInVPlanNativePath( LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); // Get user vectorization factor. - ElementCount UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. 
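
[Editorial note] getScalarEpilogueLowering above is a strict precedence ladder: size constraints first (with the PGSO carve-out for forced vectorization that the comment describes), then the -prefer-predicate-over-epilogue directive, then the loop hint, and only then the target hook. A condensed restatement with the inputs flattened into plain values; the enumerator names follow the source, everything else is an invented stand-in:

#include <cassert>

enum ScalarEpilogueLowering {
  CM_ScalarEpilogueAllowed,
  CM_ScalarEpilogueNotAllowedOptSize,
  CM_ScalarEpilogueNotNeededUsePredicate,
  CM_ScalarEpilogueNotAllowedUsePredicate,
};

enum class Directive { Unset, ScalarEpilogue, PredicateElseScalarEpilogue,
                       PredicateOrDontVectorize };
enum class Hint { Undefined, Enabled, Disabled };

static ScalarEpilogueLowering
chooseScalarEpilogue(bool HasOptSizeAttr, bool PGSOSaysOptimizeForSize,
                     bool VectorizationForced, Directive D, Hint H,
                     bool TTIPrefersPredication) {
  // 1) OptSize wins outright; profile-guided size optimization is ignored when
  //    vectorization was forced (the D81345 situation noted above).
  if (HasOptSizeAttr || (PGSOSaysOptimizeForSize && !VectorizationForced))
    return CM_ScalarEpilogueNotAllowedOptSize;
  // 2) Explicit command-line directive.
  switch (D) {
  case Directive::ScalarEpilogue:             return CM_ScalarEpilogueAllowed;
  case Directive::PredicateElseScalarEpilogue:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case Directive::PredicateOrDontVectorize:
    return CM_ScalarEpilogueNotAllowedUsePredicate;
  case Directive::Unset: break;
  }
  // 3) Loop hint.
  if (H == Hint::Enabled)  return CM_ScalarEpilogueNotNeededUsePredicate;
  if (H == Hint::Disabled) return CM_ScalarEpilogueAllowed;
  // 4) Target preference, otherwise the default.
  return TTIPrefersPredication ? CM_ScalarEpilogueNotNeededUsePredicate
                               : CM_ScalarEpilogueAllowed;
}

int main() {
  assert(chooseScalarEpilogue(true, false, true, Directive::Unset, Hint::Enabled,
                              false) == CM_ScalarEpilogueNotAllowedOptSize);
  assert(chooseScalarEpilogue(false, false, false, Directive::Unset,
                              Hint::Undefined, true) ==
         CM_ScalarEpilogueNotNeededUsePredicate);
  return 0;
}
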
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); @@ -9262,7 +9262,7 @@ static bool processLoopInVPlanNativePath( LVP.setBestPlan(VF.Width, 1); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI); + &CM, BFI, PSI); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -9281,7 +9281,7 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) !EnableLoopVectorization) {} bool LoopVectorizePass::processLoop(Loop *L) { - assert((EnableVPlanNativePath || L->isInnermost()) && + assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. Only process inner loops."); #ifndef NDEBUG @@ -9326,7 +9326,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, - &Requirements, &Hints, DB, AC, BFI, PSI); + &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -9343,11 +9343,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { // even evaluating whether vectorization is profitable. Since we cannot modify // the incoming IR, we need to build VPlan upfront in the vectorization // pipeline. - if (!L->isInnermost()) + if (!L->isInnermost()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, ORE, BFI, PSI, Hints); - assert(L->isInnermost() && "Inner loop expected."); + assert(L->isInnermost() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. @@ -9412,7 +9412,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); // Get user vectorization factor and interleave count. - ElementCount UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. @@ -9437,7 +9437,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (VF.Width.isScalar()) { + if (VF.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( "VectorizationNotBeneficial", @@ -9526,8 +9526,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, - BFI, PSI); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, + BFI, PSI); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -9539,51 +9539,51 @@ bool LoopVectorizePass::processLoop(Loop *L) { } else { // If we decided that it is *legal* to vectorize the loop, then do it. - // Consider vectorizing the epilogue too if it's profitable. - VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); - if (EpilogueVF.Width.isVector()) { - - // The first pass vectorizes the main loop and creates a scalar epilogue - // to be vectorized by executing the plan (potentially with a different - // factor) again shortly afterwards. 
- EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, - EpilogueVF.Width.getKnownMinValue(), 1); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, - &LVL, &CM, BFI, PSI); - - LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); - LVP.executePlan(MainILV, DT); - ++LoopsVectorized; - - simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); - formLCSSARecursively(*L, *DT, LI, SE); - - // Second pass vectorizes the epilogue and adjusts the control flow - // edges from the first pass. - LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); - EPI.MainLoopVF = EPI.EpilogueVF; - EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &LVL, &CM, BFI, PSI); - LVP.executePlan(EpilogILV, DT); - ++LoopsEpilogueVectorized; - - if (!MainILV.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; - } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI); - LVP.executePlan(LB, DT); - ++LoopsVectorized; - - // Add metadata to disable runtime unrolling a scalar loop when there are - // no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; - } - + // Consider vectorizing the epilogue too if it's profitable. + VectorizationFactor EpilogueVF = + CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + if (EpilogueVF.Width.isVector()) { + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, + EpilogueVF.Width.getKnownMinValue(), 1); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, + &LVL, &CM, BFI, PSI); + + LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); + LVP.executePlan(MainILV, DT); + ++LoopsVectorized; + + simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); + formLCSSARecursively(*L, *DT, LI, SE); + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); + EPI.MainLoopVF = EPI.EpilogueVF; + EPI.MainLoopUF = EPI.EpilogueUF; + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, + ORE, EPI, &LVL, &CM, BFI, PSI); + LVP.executePlan(EpilogILV, DT); + ++LoopsEpilogueVectorized; + + if (!MainILV.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM, BFI, PSI); + LVP.executePlan(LB, DT); + ++LoopsVectorized; + + // Add metadata to disable runtime unrolling a scalar loop when there are + // no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + if (!LB.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } + // Report the vectorization decision. 
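// [Editor's sketch] The epilogue-vectorization branch above runs two passes:
// the main loop is vectorized at (main VF x IC), and the leftover iterations
// are vectorized again at a smaller epilogue VF, leaving at most
// EpilogueVF - 1 iterations for the scalar remainder. The arithmetic below is
// a self-contained illustration of how a trip count gets split; it is not the
// LLVM implementation.
#include <cstdio>

struct TripSplit {
  unsigned long MainIters;     // executed by the main vector loop
  unsigned long EpilogueIters; // executed by the epilogue vector loop
  unsigned long ScalarIters;   // left for the scalar remainder
};

TripSplit splitTripCount(unsigned long TC, unsigned MainVF, unsigned IC,
                         unsigned EpilogueVF) {
  const unsigned long MainStep = (unsigned long)MainVF * IC;
  TripSplit S;
  S.MainIters = (TC / MainStep) * MainStep;
  const unsigned long Rem = TC - S.MainIters;
  S.EpilogueIters = (Rem / EpilogueVF) * EpilogueVF;
  S.ScalarIters = Rem - S.EpilogueIters;
  return S;
}

int main() {
  // E.g. TC=1000, main VF=8 with IC=2 (step 16), epilogue VF=4: 992
  // iterations in the main vector loop, 8 in the epilogue vector loop,
  // 0 in the scalar remainder.
  TripSplit S = splitTripCount(1000, 8, 2, 4);
  std::printf("%lu %lu %lu\n", S.MainIters, S.EpilogueIters, S.ScalarIters);
  return 0;
}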
ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), @@ -9696,8 +9696,8 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, - TLI, TTI, nullptr, MSSA}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, MSSA}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0b63019791..7cc322d4b6 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,16 +26,16 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" @@ -80,7 +80,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" -#include "llvm/Support/InstructionCost.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -128,10 +128,10 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); -static cl::opt<unsigned> -MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, - cl::desc("Maximum SLP vectorization factor (0=unlimited)")); - +static cl::opt<unsigned> +MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, + cl::desc("Maximum SLP vectorization factor (0=unlimited)")); + static cl::opt<int> MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, cl::desc("Maximum depth of the lookup for consecutive stores.")); @@ -206,12 +206,12 @@ static bool allSameBlock(ArrayRef<Value *> VL) { if (!I0) return false; BasicBlock *BB = I0->getParent(); - for (int I = 1, E = VL.size(); I < E; I++) { - auto *II = dyn_cast<Instruction>(VL[I]); - if (!II) + for (int I = 1, E = VL.size(); I < E; I++) { + auto *II = dyn_cast<Instruction>(VL[I]); + if (!II) return false; - if (BB != II->getParent()) + if (BB != II->getParent()) return false; } return true; @@ -236,16 +236,16 @@ static bool isSplat(ArrayRef<Value *> VL) { return true; } -/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. +/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. 
static bool isCommutative(Instruction *I) { - if (auto *Cmp = dyn_cast<CmpInst>(I)) - return Cmp->isCommutative(); - if (auto *BO = dyn_cast<BinaryOperator>(I)) - return BO->isCommutative(); - // TODO: This should check for generic Instruction::isCommutative(), but - // we need to confirm that the caller code correctly handles Intrinsics - // for example (does not have 2 operands). - return false; + if (auto *Cmp = dyn_cast<CmpInst>(I)) + return Cmp->isCommutative(); + if (auto *BO = dyn_cast<BinaryOperator>(I)) + return BO->isCommutative(); + // TODO: This should check for generic Instruction::isCommutative(), but + // we need to confirm that the caller code correctly handles Intrinsics + // for example (does not have 2 operands). + return false; } /// Checks if the vector of instructions can be represented as a shuffle, like: @@ -257,7 +257,7 @@ static bool isCommutative(Instruction *I) { /// %x3x3 = mul i8 %x3, %x3 /// %y1y1 = mul i8 %y1, %y1 /// %y2y2 = mul i8 %y2, %y2 -/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 +/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 @@ -272,13 +272,13 @@ static bool isCommutative(Instruction *I) { /// %x3 = extractelement <4 x i8> %x, i32 3 /// %y1 = extractelement <4 x i8> %y, i32 1 /// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 +/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 /// %5 = mul <4 x i8> %4, %4 /// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 +/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 /// %7 = extractelement <4 x i8> %5, i32 1 /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 /// %8 = extractelement <4 x i8> %5, i32 2 @@ -292,8 +292,8 @@ static bool isCommutative(Instruction *I) { static Optional<TargetTransformInfo::ShuffleKind> isShuffle(ArrayRef<Value *> VL) { auto *EI0 = cast<ExtractElementInst>(VL[0]); - unsigned Size = - cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); + unsigned Size = + cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); Value *Vec1 = nullptr; Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; @@ -302,7 +302,7 @@ isShuffle(ArrayRef<Value *> VL) { auto *EI = cast<ExtractElementInst>(VL[I]); auto *Vec = EI->getVectorOperand(); // All vector operands must have the same number of vector elements. - if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) + if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) return None; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) @@ -311,7 +311,7 @@ isShuffle(ArrayRef<Value *> VL) { if (Idx->getValue().uge(Size)) continue; unsigned IntIdx = Idx->getValue().getZExtValue(); - // We can extractelement from undef or poison vector. + // We can extractelement from undef or poison vector. if (isa<UndefValue>(Vec)) continue; // For correct shuffling we have to have at most 2 different vector operands @@ -508,7 +508,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } /// \returns the AA location that is being access by the instruction. 
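// [Editor's sketch] isShuffle above classifies a bundle of extractelement
// instructions: if every lane reads from one of at most two equally sized
// source vectors, the bundle can be emitted as a single shuffle; it is a
// "select"-style shuffle when lane i reads element i of either source, and a
// permute otherwise. The plain-data model below (Lane, classifyShuffle) is an
// illustration only and uses its own names, not TTI's ShuffleKind values.
#include <optional>
#include <vector>

enum class SketchShuffleKind { SingleSource, Select, Permute };

struct Lane {
  int SourceVec;  // 0 or 1: which of the two source vectors is read
  unsigned Index; // element index read from that source
};

std::optional<SketchShuffleKind>
classifyShuffle(const std::vector<Lane> &Lanes, unsigned VecWidth) {
  bool UsesSecondSource = false;
  bool IsSelect = true;
  for (unsigned I = 0, E = Lanes.size(); I < E; ++I) {
    if (Lanes[I].Index >= VecWidth || Lanes[I].SourceVec > 1)
      return std::nullopt; // not representable as a two-source shuffle
    UsesSecondSource |= (Lanes[I].SourceVec == 1);
    IsSelect &= (Lanes[I].Index == I); // "select" keeps each lane in place
  }
  if (!UsesSecondSource)
    return SketchShuffleKind::SingleSource;
  return IsSelect ? SketchShuffleKind::Select : SketchShuffleKind::Permute;
}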
-static MemoryLocation getLocation(Instruction *I, AAResults *AA) { +static MemoryLocation getLocation(Instruction *I, AAResults *AA) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -529,15 +529,15 @@ static bool isSimple(Instruction *I) { namespace llvm { -static void inversePermutation(ArrayRef<unsigned> Indices, - SmallVectorImpl<int> &Mask) { - Mask.clear(); - const unsigned E = Indices.size(); - Mask.resize(E, E + 1); - for (unsigned I = 0; I < E; ++I) - Mask[Indices[I]] = I; -} - +static void inversePermutation(ArrayRef<unsigned> Indices, + SmallVectorImpl<int> &Mask) { + Mask.clear(); + const unsigned E = Indices.size(); + Mask.resize(E, E + 1); + for (unsigned I = 0; I < E; ++I) + Mask[Indices[I]] = I; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -552,10 +552,10 @@ public: using StoreList = SmallVector<StoreInst *, 8>; using ExtraValueToDebugLocsMap = MapVector<Value *, SmallVector<Instruction *, 2>>; - using OrdersType = SmallVector<unsigned, 4>; + using OrdersType = SmallVector<unsigned, 4>; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, - TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, + TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), @@ -589,11 +589,11 @@ public: /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - InstructionCost getSpillCost() const; + InstructionCost getSpillCost() const; /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - InstructionCost getTreeCost(); + InstructionCost getTreeCost(); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -630,14 +630,14 @@ public: /// \returns The best order of instructions for vectorization. Optional<ArrayRef<unsigned>> bestOrder() const { - assert(llvm::all_of( - NumOpsWantToKeepOrder, - [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { - return D.getFirst().size() == - VectorizableTree[0]->Scalars.size(); - }) && - "All orders must have the same size as number of instructions in " - "tree node."); + assert(llvm::all_of( + NumOpsWantToKeepOrder, + [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { + return D.getFirst().size() == + VectorizableTree[0]->Scalars.size(); + }) && + "All orders must have the same size as number of instructions in " + "tree node."); auto I = std::max_element( NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, @@ -651,81 +651,81 @@ public: return makeArrayRef(I->getFirst()); } - /// Builds the correct order for root instructions. - /// If some leaves have the same instructions to be vectorized, we may - /// incorrectly evaluate the best order for the root node (it is built for the - /// vector of instructions without repeated instructions and, thus, has less - /// elements than the root node). This function builds the correct order for - /// the root node. - /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves - /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first - /// leaf, it will be shrink to \<a, b\>. 
If instructions in this leaf should - /// be reordered, the best order will be \<1, 0\>. We need to extend this - /// order for the root node. For the root node this order should look like - /// \<3, 0, 1, 2\>. This function extends the order for the reused - /// instructions. - void findRootOrder(OrdersType &Order) { - // If the leaf has the same number of instructions to vectorize as the root - // - order must be set already. - unsigned RootSize = VectorizableTree[0]->Scalars.size(); - if (Order.size() == RootSize) - return; - SmallVector<unsigned, 4> RealOrder(Order.size()); - std::swap(Order, RealOrder); - SmallVector<int, 4> Mask; - inversePermutation(RealOrder, Mask); - Order.assign(Mask.begin(), Mask.end()); - // The leaf has less number of instructions - need to find the true order of - // the root. - // Scan the nodes starting from the leaf back to the root. - const TreeEntry *PNode = VectorizableTree.back().get(); - SmallVector<const TreeEntry *, 4> Nodes(1, PNode); - SmallPtrSet<const TreeEntry *, 4> Visited; - while (!Nodes.empty() && Order.size() != RootSize) { - const TreeEntry *PNode = Nodes.pop_back_val(); - if (!Visited.insert(PNode).second) - continue; - const TreeEntry &Node = *PNode; - for (const EdgeInfo &EI : Node.UserTreeIndices) - if (EI.UserTE) - Nodes.push_back(EI.UserTE); - if (Node.ReuseShuffleIndices.empty()) - continue; - // Build the order for the parent node. - OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); - SmallVector<unsigned, 4> OrderCounter(Order.size(), 0); - // The algorithm of the order extension is: - // 1. Calculate the number of the same instructions for the order. - // 2. Calculate the index of the new order: total number of instructions - // with order less than the order of the current instruction + reuse - // number of the current instruction. - // 3. The new order is just the index of the instruction in the original - // vector of the instructions. - for (unsigned I : Node.ReuseShuffleIndices) - ++OrderCounter[Order[I]]; - SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0); - for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { - unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; - unsigned OrderIdx = Order[ReusedIdx]; - unsigned NewIdx = 0; - for (unsigned J = 0; J < OrderIdx; ++J) - NewIdx += OrderCounter[J]; - NewIdx += CurrentCounter[OrderIdx]; - ++CurrentCounter[OrderIdx]; - assert(NewOrder[NewIdx] == RootSize && - "The order index should not be written already."); - NewOrder[NewIdx] = I; - } - std::swap(Order, NewOrder); - } - assert(Order.size() == RootSize && - "Root node is expected or the size of the order must be the same as " - "the number of elements in the root node."); - assert(llvm::all_of(Order, - [RootSize](unsigned Val) { return Val != RootSize; }) && - "All indices must be initialized"); - } - + /// Builds the correct order for root instructions. + /// If some leaves have the same instructions to be vectorized, we may + /// incorrectly evaluate the best order for the root node (it is built for the + /// vector of instructions without repeated instructions and, thus, has less + /// elements than the root node). This function builds the correct order for + /// the root node. + /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves + /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first + /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should + /// be reordered, the best order will be \<1, 0\>. 
We need to extend this + /// order for the root node. For the root node this order should look like + /// \<3, 0, 1, 2\>. This function extends the order for the reused + /// instructions. + void findRootOrder(OrdersType &Order) { + // If the leaf has the same number of instructions to vectorize as the root + // - order must be set already. + unsigned RootSize = VectorizableTree[0]->Scalars.size(); + if (Order.size() == RootSize) + return; + SmallVector<unsigned, 4> RealOrder(Order.size()); + std::swap(Order, RealOrder); + SmallVector<int, 4> Mask; + inversePermutation(RealOrder, Mask); + Order.assign(Mask.begin(), Mask.end()); + // The leaf has less number of instructions - need to find the true order of + // the root. + // Scan the nodes starting from the leaf back to the root. + const TreeEntry *PNode = VectorizableTree.back().get(); + SmallVector<const TreeEntry *, 4> Nodes(1, PNode); + SmallPtrSet<const TreeEntry *, 4> Visited; + while (!Nodes.empty() && Order.size() != RootSize) { + const TreeEntry *PNode = Nodes.pop_back_val(); + if (!Visited.insert(PNode).second) + continue; + const TreeEntry &Node = *PNode; + for (const EdgeInfo &EI : Node.UserTreeIndices) + if (EI.UserTE) + Nodes.push_back(EI.UserTE); + if (Node.ReuseShuffleIndices.empty()) + continue; + // Build the order for the parent node. + OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); + SmallVector<unsigned, 4> OrderCounter(Order.size(), 0); + // The algorithm of the order extension is: + // 1. Calculate the number of the same instructions for the order. + // 2. Calculate the index of the new order: total number of instructions + // with order less than the order of the current instruction + reuse + // number of the current instruction. + // 3. The new order is just the index of the instruction in the original + // vector of the instructions. + for (unsigned I : Node.ReuseShuffleIndices) + ++OrderCounter[Order[I]]; + SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0); + for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { + unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; + unsigned OrderIdx = Order[ReusedIdx]; + unsigned NewIdx = 0; + for (unsigned J = 0; J < OrderIdx; ++J) + NewIdx += OrderCounter[J]; + NewIdx += CurrentCounter[OrderIdx]; + ++CurrentCounter[OrderIdx]; + assert(NewOrder[NewIdx] == RootSize && + "The order index should not be written already."); + NewOrder[NewIdx] = I; + } + std::swap(Order, NewOrder); + } + assert(Order.size() == RootSize && + "Root node is expected or the size of the order must be the same as " + "the number of elements in the root node."); + assert(llvm::all_of(Order, + [RootSize](unsigned Val) { return Val != RootSize; }) && + "All indices must be initialized"); + } + /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded @@ -747,12 +747,12 @@ public: return MinVecRegSize; } - unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { - unsigned MaxVF = MaxVFOption.getNumOccurrences() ? - MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); - return MaxVF ? MaxVF : UINT_MAX; - } - + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { + unsigned MaxVF = MaxVFOption.getNumOccurrences() ? + MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); + return MaxVF ? MaxVF : UINT_MAX; + } + /// Check if homogeneous aggregate is isomorphic to some VectorType. 
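// [Editor's sketch] findRootOrder above widens an order computed for a
// deduplicated leaf (e.g. <1, 0> over two unique scalars) to the size of the
// root node: it counts, per order slot, how many root lanes reuse that slot,
// then assigns consecutive positions in encounter order. extendOrderToRoot
// below is a standalone restatement of that counting scheme, not the LLVM
// code itself.
#include <vector>

std::vector<unsigned>
extendOrderToRoot(const std::vector<unsigned> &Order,
                  const std::vector<unsigned> &ReuseIndices) {
  const unsigned RootSize = ReuseIndices.size();
  std::vector<unsigned> NewOrder(RootSize, RootSize);
  // 1) How many root lanes map to each order slot of the deduplicated leaf.
  std::vector<unsigned> OrderCounter(Order.size(), 0);
  for (unsigned ReusedIdx : ReuseIndices)
    ++OrderCounter[Order[ReusedIdx]];
  // 2)+3) Place each root lane after all lanes of smaller order slots, in
  // the order the reuses are encountered.
  std::vector<unsigned> CurrentCounter(Order.size(), 0);
  for (unsigned I = 0; I < RootSize; ++I) {
    unsigned OrderIdx = Order[ReuseIndices[I]];
    unsigned NewIdx = CurrentCounter[OrderIdx]++;
    for (unsigned J = 0; J < OrderIdx; ++J)
      NewIdx += OrderCounter[J];
    NewOrder[NewIdx] = I;
  }
  return NewOrder;
}

// For the documented example the deduplicated leaf order is <1, 0> and the
// root lanes reuse the unique scalars as <0, 0, 0, 1>; the extended order
// comes out as <3, 0, 1, 2>, matching the comment above.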
/// Accepts homogeneous multidimensional aggregate of scalars/vectors like /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, @@ -772,7 +772,7 @@ public: /// effectively impossible for the backend to undo. /// TODO: If load combining is allowed in the IR optimizer, this analysis /// may not be necessary. - bool isLoadCombineReductionCandidate(RecurKind RdxKind) const; + bool isLoadCombineReductionCandidate(RecurKind RdxKind) const; /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values /// can be load combined in the backend. Load combining may not be allowed in @@ -987,14 +987,14 @@ public: std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}}; for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { Value *V = Values[Idx].first; - if (isa<Constant>(V)) { - // Since this is a function pass, it doesn't make semantic sense to - // walk the users of a subclass of Constant. The users could be in - // another function, or even another module that happens to be in - // the same LLVMContext. - continue; - } - + if (isa<Constant>(V)) { + // Since this is a function pass, it doesn't make semantic sense to + // walk the users of a subclass of Constant. The users could be in + // another function, or even another module that happens to be in + // the same LLVMContext. + continue; + } + // Calculate the absolute lane, using the minimum relative lane of LHS // and RHS as base and Idx as the offset. int Ln = std::min(LHS.second, RHS.second) + Idx; @@ -1503,7 +1503,7 @@ private: bool areAllUsersVectorized(Instruction *I) const; /// \returns the cost of the vectorizable entry. - InstructionCost getEntryCost(TreeEntry *E); + InstructionCost getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, @@ -1525,21 +1525,21 @@ private: /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - InstructionCost - getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const; + InstructionCost + getGatherCost(FixedVectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - InstructionCost getGatherCost(ArrayRef<Value *> VL) const; + InstructionCost getGatherCost(ArrayRef<Value *> VL) const; /// Set the Builder insert point to one after the last instruction in /// the bundle void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. - Value *gather(ArrayRef<Value *> VL); + Value *gather(ArrayRef<Value *> VL); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. @@ -1573,17 +1573,17 @@ private: /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; - /// Do we need to gather this sequence or vectorize it - /// (either with vector instruction or with scatter/gather - /// intrinsics for store/load)? - enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; + /// Do we need to gather this sequence or vectorize it + /// (either with vector instruction or with scatter/gather + /// intrinsics for store/load)? 
+ enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; EntryState State; /// Does this sequence require some shuffling? SmallVector<int, 4> ReuseShuffleIndices; /// Does this entry require reordering? - SmallVector<unsigned, 4> ReorderIndices; + SmallVector<unsigned, 4> ReorderIndices; /// Points back to the VectorizableTree. /// @@ -1724,9 +1724,9 @@ private: case Vectorize: dbgs() << "Vectorize\n"; break; - case ScatterVectorize: - dbgs() << "ScatterVectorize\n"; - break; + case ScatterVectorize: + dbgs() << "ScatterVectorize\n"; + break; case NeedToGather: dbgs() << "NeedToGather\n"; break; @@ -1748,7 +1748,7 @@ private: dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) - dbgs() << "Empty"; + dbgs() << "Empty"; else for (unsigned ReuseIdx : ReuseShuffleIndices) dbgs() << ReuseIdx << ", "; @@ -1765,55 +1765,55 @@ private: #endif }; -#ifndef NDEBUG - void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost, - InstructionCost VecCost, - InstructionCost ScalarCost) const { - dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); - dbgs() << "SLP: Costs:\n"; - dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; - dbgs() << "SLP: VectorCost = " << VecCost << "\n"; - dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; - dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " << - ReuseShuffleCost + VecCost - ScalarCost << "\n"; - } -#endif - +#ifndef NDEBUG + void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost, + InstructionCost VecCost, + InstructionCost ScalarCost) const { + dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); + dbgs() << "SLP: Costs:\n"; + dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; + dbgs() << "SLP: VectorCost = " << VecCost << "\n"; + dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; + dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " << + ReuseShuffleCost + VecCost - ScalarCost << "\n"; + } +#endif + /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef<unsigned> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { - TreeEntry::EntryState EntryState = - Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; - return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, - ReuseShuffleIndices, ReorderIndices); - } - - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, - TreeEntry::EntryState EntryState, - Optional<ScheduleData *> Bundle, - const InstructionsState &S, - const EdgeInfo &UserTreeIdx, - ArrayRef<unsigned> ReuseShuffleIndices = None, - ArrayRef<unsigned> ReorderIndices = None) { - assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || - (Bundle && EntryState != TreeEntry::NeedToGather)) && - "Need to vectorize gather entry?"); + TreeEntry::EntryState EntryState = + Bundle ? 
TreeEntry::Vectorize : TreeEntry::NeedToGather; + return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, + ReuseShuffleIndices, ReorderIndices); + } + + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, + TreeEntry::EntryState EntryState, + Optional<ScheduleData *> Bundle, + const InstructionsState &S, + const EdgeInfo &UserTreeIdx, + ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<unsigned> ReorderIndices = None) { + assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || + (Bundle && EntryState != TreeEntry::NeedToGather)) && + "Need to vectorize gather entry?"); VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->State = EntryState; + Last->State = EntryState; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); - Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); + Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); Last->setOperations(S); - if (Last->State != TreeEntry::NeedToGather) { - for (Value *V : VL) { - assert(!getTreeEntry(V) && "Scalar already in tree!"); - ScalarToTreeEntry[V] = Last; + if (Last->State != TreeEntry::NeedToGather) { + for (Value *V : VL) { + assert(!getTreeEntry(V) && "Scalar already in tree!"); + ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. unsigned Lane = 0; @@ -1849,10 +1849,10 @@ private: } #endif - TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } + TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } const TreeEntry *getTreeEntry(Value *V) const { - return ScalarToTreeEntry.lookup(V); + return ScalarToTreeEntry.lookup(V); } /// Maps a specific scalar to its tree entry. @@ -2374,7 +2374,7 @@ private: ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; - AAResults *AA; + AAResults *AA; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; @@ -2473,9 +2473,9 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } for (auto V : Entry->Scalars) { OS << *V; - if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { - return EU.Scalar == V; - })) + if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { + return EU.Scalar == V; + })) OS << " <extract>"; OS << "\n"; } @@ -2507,17 +2507,17 @@ BoUpSLP::~BoUpSLP() { "trying to erase instruction with users."); Pair.getFirst()->eraseFromParent(); } -#ifdef EXPENSIVE_CHECKS - // If we could guarantee that this call is not extremely slow, we could - // remove the ifdef limitation (see PR47712). +#ifdef EXPENSIVE_CHECKS + // If we could guarantee that this call is not extremely slow, we could + // remove the ifdef limitation (see PR47712). assert(!verifyFunction(*F, &dbgs())); -#endif +#endif } void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { for (auto *V : AV) { if (auto *I = dyn_cast<Instruction>(V)) - eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); + eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); }; } @@ -2742,11 +2742,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto *PH = cast<PHINode>(VL0); // Check for terminator values (e.g. invoke). 
- for (Value *V : VL) - for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { + for (Value *V : VL) + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { Instruction *Term = dyn_cast<Instruction>( - cast<PHINode>(V)->getIncomingValueForBlock( - PH->getIncomingBlock(I))); + cast<PHINode>(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); @@ -2763,13 +2763,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Keeps the reordered operands to avoid code duplication. SmallVector<ValueList, 2> OperandsVec; - for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { ValueList Operands; // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock( - PH->getIncomingBlock(I))); - TE->setOperand(I, Operands); + for (Value *V : VL) + Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); + TE->setOperand(I, Operands); OperandsVec.push_back(Operands); } for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) @@ -2803,9 +2803,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; + ReuseShuffleIndicies, CurrentOrder); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -2884,21 +2884,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Need to reorder. TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } - // Vectorizing non-consecutive loads with `llvm.masked.gather`. - TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(PointerOps, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); - return; + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + return; } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); @@ -3033,8 +3033,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
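// [Editor's sketch] The load-bundle handling above distinguishes three
// outcomes: consecutive pointers become one wide load, consecutive-but-
// jumbled pointers become a wide load plus a reordering shuffle, and other
// bundles are now emitted as a masked gather (TreeEntry::ScatterVectorize)
// rather than falling back to gathering scalars. classifyLoadBundle below
// restates that decision over raw byte addresses; it is not the SLP code and
// ignores the additional legality checks the real pass performs.
#include <algorithm>
#include <cstdint>
#include <vector>

enum class LoadBundleKind { WideLoad, WideLoadThenShuffle, MaskedGather };

LoadBundleKind classifyLoadBundle(const std::vector<uint64_t> &Addrs,
                                  uint64_t EltSize) {
  std::vector<uint64_t> Sorted(Addrs);
  std::sort(Sorted.begin(), Sorted.end());
  // Consecutive means the sorted addresses advance by exactly one element.
  for (size_t I = 1; I < Sorted.size(); ++I)
    if (Sorted[I] - Sorted[I - 1] != EltSize)
      return LoadBundleKind::MaskedGather;
  // Already in program order? Then a single wide load suffices; otherwise a
  // reordering shuffle is needed on top of the wide load.
  return std::is_sorted(Addrs.begin(), Addrs.end())
             ? LoadBundleKind::WideLoad
             : LoadBundleKind::WideLoadThenShuffle;
}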
- for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -3102,16 +3102,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); - // Avoid types that are padded when being allocated as scalars, while - // being packed together in a vector (such as i1). - if (DL->getTypeSizeInBits(ScalarTy) != - DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); - return; - } + // Avoid types that are padded when being allocated as scalars, while + // being packed together in a vector (such as i1). + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); + return; + } // Make sure all stores in the bundle are simple - we can't vectorize // atomic or volatile stores. SmallVector<Value *, 4> PointerOps(VL.size()); @@ -3163,12 +3163,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } else { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3187,7 +3187,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); VFShape Shape = VFShape::get( - *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), + *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); @@ -3324,7 +3324,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { N *= AT->getNumElements(); EltTy = AT->getElementType(); } else { - auto *VT = cast<FixedVectorType>(EltTy); + auto *VT = cast<FixedVectorType>(EltTy); N *= VT->getNumElements(); EltTy = VT->getElementType(); } @@ -3362,7 +3362,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) return false; } else { - NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); + NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); } if (NElts != VL.size()) @@ -3406,26 +3406,26 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, } bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { - return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) { + return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) { return ScalarToTreeEntry.count(U) > 0; }); } -static std::pair<InstructionCost, InstructionCost> -getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, - TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { +static 
std::pair<InstructionCost, InstructionCost> +getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, + TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. - IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount()); - auto IntrinsicCost = + IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount()); + auto IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); - auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), - false /*HasGlobalPred*/); + auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( + VecTy->getNumElements())), + false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); - auto LibCost = IntrinsicCost; + auto LibCost = IntrinsicCost; if (!CI->isNoBuiltin() && VecFunc) { // Calculate the cost of the vector library call. SmallVector<Type *, 4> VecTys; @@ -3440,7 +3440,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } -InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { +InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef<Value*> VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); @@ -3459,7 +3459,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - InstructionCost ReuseShuffleCost = 0; + InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); @@ -3475,8 +3475,8 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { allSameType(VL) && allSameBlock(VL)) { Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { - InstructionCost Cost = - TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); + InstructionCost Cost = + TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); for (auto *V : VL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this @@ -3495,9 +3495,9 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } - assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && - "Unhandled state"); + assert((E->State == TreeEntry::Vectorize || + E->State == TreeEntry::ScatterVectorize) && + "Unhandled state"); assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = @@ -3536,37 +3536,37 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - InstructionCost DeadCost = ReuseShuffleCost; + InstructionCost DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. DeadCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - for (unsigned I = 0, E = VL.size(); I < E; ++I) { - Instruction *EI = cast<Instruction>(VL[I]); + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + Instruction *EI = cast<Instruction>(VL[I]); // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. 
- if (areAllUsersVectorized(EI)) { + if (areAllUsersVectorized(EI)) { // Take credit for instruction that will become dead. - if (EI->hasOneUse()) { - Instruction *Ext = EI->user_back(); + if (EI->hasOneUse()) { + Instruction *Ext = EI->user_back(); if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && all_of(Ext->users(), [](User *U) { return isa<GetElementPtrInst>(U); })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. DeadCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, I); + Ext->getOpcode(), Ext->getType(), VecTy, I); // Add back the cost of s|zext which is subtracted separately. DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); + Ext->getOpcode(), Ext->getType(), EI->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); continue; } } DeadCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } return DeadCost; @@ -3584,78 +3584,78 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - InstructionCost ScalarEltCost = - TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, - TTI::getCastContextHint(VL0), CostKind, VL0); + InstructionCost ScalarEltCost = + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, + TTI::getCastContextHint(VL0), CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. - InstructionCost ScalarCost = VL.size() * ScalarEltCost; + InstructionCost ScalarCost = VL.size() * ScalarEltCost; auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); - InstructionCost VecCost = 0; + InstructionCost VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = - ReuseShuffleCost + - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); + VecCost = + ReuseShuffleCost + + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, + TTI::getCastContextHint(VL0), CostKind, VL0); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return VecCost - ScalarCost; } case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - InstructionCost ScalarEltCost = - TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), - CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); + InstructionCost ScalarEltCost = + TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), + CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - - // Check if all entries in VL are either compares or selects with compares - // as condition that have the same predicates. 
- CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; - bool First = true; - for (auto *V : VL) { - CmpInst::Predicate CurrentPred; - auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); - if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && - !match(V, MatchCmp)) || - (!First && VecPred != CurrentPred)) { - VecPred = CmpInst::BAD_ICMP_PREDICATE; - break; - } - First = false; - VecPred = CurrentPred; - } - - InstructionCost VecCost = TTI->getCmpSelInstrCost( - E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); - // Check if it is possible and profitable to use min/max for selects in - // VL. - // - auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); - if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { - IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, - {VecTy, VecTy}); - InstructionCost IntrinsicCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be dead - // and we can adjust the cost by removing their cost. - if (IntrinsicAndUse.second) - IntrinsicCost -= - TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); - VecCost = std::min(VecCost, IntrinsicCost); - } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; + + // Check if all entries in VL are either compares or selects with compares + // as condition that have the same predicates. + CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; + bool First = true; + for (auto *V : VL) { + CmpInst::Predicate CurrentPred; + auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); + if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && + !match(V, MatchCmp)) || + (!First && VecPred != CurrentPred)) { + VecPred = CmpInst::BAD_ICMP_PREDICATE; + break; + } + First = false; + VecPred = CurrentPred; + } + + InstructionCost VecCost = TTI->getCmpSelInstrCost( + E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); + // Check if it is possible and profitable to use min/max for selects in + // VL. + // + auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); + if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { + IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, + {VecTy, VecTy}); + InstructionCost IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + // If the selects are the only uses of the compares, they will be dead + // and we can adjust the cost by removing their cost. 
+ if (IntrinsicAndUse.second) + IntrinsicCost -= + TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + VecCost = std::min(VecCost, IntrinsicCost); + } + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -3715,17 +3715,17 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { } SmallVector<const Value *, 4> Operands(VL0->operand_values()); - InstructionCost ScalarEltCost = - TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, - Op2VK, Op1VP, Op2VP, Operands, VL0); + InstructionCost ScalarEltCost = + TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, + Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecCost = - TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, - Op2VK, Op1VP, Op2VP, Operands, VL0); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecCost = + TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, + Op2VK, Op1VP, Op2VP, Operands, VL0); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3734,42 +3734,42 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; - InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( - Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); + InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( + Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecCost = TTI->getArithmeticInstrCost( - Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecCost = TTI->getArithmeticInstrCost( + Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. 
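// [Editor's sketch] Each per-opcode case of getEntryCost, above and below,
// follows the same shape: cost one scalar element, multiply by the number of
// scalars to get ScalarCost, cost the single wide vector operation (plus any
// reuse/reorder shuffles), and return the signed delta
// ReuseShuffleCost + VecCost - ScalarCost, where a negative result means the
// vector form is cheaper. The toy numbers below only illustrate that formula;
// they are not taken from any real cost model.
#include <cstdio>

long long entryCostDelta(long long ScalarEltCost, unsigned NumScalars,
                         long long VecOpCost, long long ReuseShuffleCost) {
  long long ScalarCost = ScalarEltCost * (long long)NumScalars;
  return ReuseShuffleCost + VecOpCost - ScalarCost;
}

int main() {
  // Four scalar loads at cost 1 each vs. one vector load at cost 1 and no
  // extra shuffles: the delta is -3, i.e. vectorization looks profitable.
  std::printf("%lld\n", entryCostDelta(/*ScalarEltCost=*/1, /*NumScalars=*/4,
                                       /*VecOpCost=*/1,
                                       /*ReuseShuffleCost=*/0));
  return 0;
}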
Align alignment = cast<LoadInst>(VL0)->getAlign(); - InstructionCost ScalarEltCost = TTI->getMemoryOpCost( - Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0); + InstructionCost ScalarEltCost = TTI->getMemoryOpCost( + Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecLdCost; - if (E->State == TreeEntry::Vectorize) { - VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, - CostKind, VL0); - } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); - VecLdCost = TTI->getGatherScatterOpCost( - Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), - /*VariableMask=*/false, alignment, CostKind, VL0); - } + InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecLdCost; + if (E->State == TreeEntry::Vectorize) { + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + CostKind, VL0); + } else { + assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); + VecLdCost = TTI->getGatherScatterOpCost( + Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), + /*VariableMask=*/false, alignment, CostKind, VL0); + } if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); return ReuseShuffleCost + VecLdCost - ScalarLdCost; } case Instruction::Store: { @@ -3778,19 +3778,19 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { auto *SI = cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); Align Alignment = SI->getAlign(); - InstructionCost ScalarEltCost = TTI->getMemoryOpCost( - Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); + InstructionCost ScalarEltCost = TTI->getMemoryOpCost( + Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecStCost = TTI->getMemoryOpCost( - Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); + InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecStCost = TTI->getMemoryOpCost( + Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); if (IsReorder) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecStCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3798,17 +3798,17 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. 
- IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1); - InstructionCost ScalarEltCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1); + InstructionCost ScalarEltCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); - InstructionCost VecCallCost = - std::min(VecCallCosts.first, VecCallCosts.second); + InstructionCost VecCallCost = + std::min(VecCallCosts.first, VecCallCosts.second); LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" @@ -3823,7 +3823,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - InstructionCost ScalarCost = 0; + InstructionCost ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { Instruction *I = cast<Instruction>(VL[Idx]); @@ -3841,7 +3841,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. - InstructionCost VecCost = 0; + InstructionCost VecCost = 0; if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, @@ -3852,12 +3852,12 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size()); auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, - TTI::CastContextHint::None, CostKind); + TTI::CastContextHint::None, CostKind); VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, - TTI::CastContextHint::None, CostKind); + TTI::CastContextHint::None, CostKind); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } default: @@ -3895,13 +3895,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI) { // Look past the root to find a source value. Arbitrarily follow the // path through operand 0 of any 'or'. Also, peek through optional - // shift-left-by-multiple-of-8-bits. + // shift-left-by-multiple-of-8-bits. Value *ZextLoad = Root; - const APInt *ShAmtC; + const APInt *ShAmtC; while (!isa<ConstantExpr>(ZextLoad) && (match(ZextLoad, m_Or(m_Value(), m_Value())) || - (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && - ShAmtC->urem(8) == 0))) + (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && + ShAmtC->urem(8) == 0))) ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); // Check if the input is an extended load of the required or/shift expression. 
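// [Editor's sketch] isLoadCombineCandidateImpl above walks up through 'or'
// and shift-left-by-a-multiple-of-8 operations looking for a zero-extended
// load: that is exactly the shape of manual byte-assembly code, which a
// backend with load combining can fold into one wide load. The function
// below is an ordinary C++ example of such a pattern (little-endian byte
// order); it is not SLP code, just the kind of IR the check recognizes.
#include <cstdint>

uint32_t loadLE32(const uint8_t *P) {
  // Four byte loads, zero-extended, shifted by multiples of 8, and or'd
  // together: a load-combining backend turns this into a single 32-bit load.
  return (uint32_t)P[0] | ((uint32_t)P[1] << 8) | ((uint32_t)P[2] << 16) |
         ((uint32_t)P[3] << 24);
}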
@@ -3925,8 +3925,8 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, return true; } -bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { - if (RdxKind != RecurKind::Or) +bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { + if (RdxKind != RecurKind::Or) return false; unsigned NumElts = VectorizableTree[0]->Scalars.size(); @@ -3967,35 +3967,35 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { return true; } -InstructionCost BoUpSLP::getSpillCost() const { +InstructionCost BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); - InstructionCost Cost = 0; + InstructionCost Cost = 0; SmallPtrSet<Instruction*, 4> LiveValues; Instruction *PrevInst = nullptr; - // The entries in VectorizableTree are not necessarily ordered by their - // position in basic blocks. Collect them and order them by dominance so later - // instructions are guaranteed to be visited first. For instructions in - // different basic blocks, we only scan to the beginning of the block, so - // their order does not matter, as long as all instructions in a basic block - // are grouped together. Using dominance ensures a deterministic order. - SmallVector<Instruction *, 16> OrderedScalars; + // The entries in VectorizableTree are not necessarily ordered by their + // position in basic blocks. Collect them and order them by dominance so later + // instructions are guaranteed to be visited first. For instructions in + // different basic blocks, we only scan to the beginning of the block, so + // their order does not matter, as long as all instructions in a basic block + // are grouped together. Using dominance ensures a deterministic order. 
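// [Editor's sketch] The comment above explains why getSpillCost first orders
// the tree's scalars by dominance, so that its bottom-up walk visits later
// instructions first and stays deterministic; the llvm::stable_sort that
// implements it follows just below. In the standalone sketch here an integer
// program position stands in for the dominance relation; only the sorting
// idea is illustrated, not the LLVM data structures.
#include <algorithm>
#include <vector>

struct SketchInst {
  int Position; // stand-in for "comes earlier in the function"
};

void orderForBottomUpWalk(std::vector<SketchInst> &Scalars) {
  // "B dominates A" becomes "B comes before A": sort descending by position,
  // keeping the original relative order of ties (stable).
  std::stable_sort(Scalars.begin(), Scalars.end(),
                   [](const SketchInst &A, const SketchInst &B) {
                     return B.Position < A.Position;
                   });
}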
+ SmallVector<Instruction *, 16> OrderedScalars; for (const auto &TEPtr : VectorizableTree) { Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); if (!Inst) continue; - OrderedScalars.push_back(Inst); - } - llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) { - return DT->dominates(B, A); - }); + OrderedScalars.push_back(Inst); + } + llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) { + return DT->dominates(B, A); + }); - for (Instruction *Inst : OrderedScalars) { + for (Instruction *Inst : OrderedScalars) { if (!PrevInst) { PrevInst = Inst; continue; @@ -4049,8 +4049,8 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } -InstructionCost BoUpSLP::getTreeCost() { - InstructionCost Cost = 0; +InstructionCost BoUpSLP::getTreeCost() { + InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -4080,16 +4080,16 @@ InstructionCost BoUpSLP::getTreeCost() { })) continue; - InstructionCost C = getEntryCost(&TE); - Cost += C; + InstructionCost C = getEntryCost(&TE); + Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); + << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); } SmallPtrSet<Value *, 16> ExtractCostCalculated; - InstructionCost ExtractCost = 0; + InstructionCost ExtractCost = 0; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!ExtractCostCalculated.insert(EU.Scalar).second) @@ -4119,13 +4119,13 @@ InstructionCost BoUpSLP::getTreeCost() { } } - InstructionCost SpillCost = getSpillCost(); + InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; -#ifndef NDEBUG - SmallString<256> Str; +#ifndef NDEBUG + SmallString<256> Str; { - raw_svector_ostream OS(Str); + raw_svector_ostream OS(Str); OS << "SLP: Spill Cost = " << SpillCost << ".\n" << "SLP: Extract Cost = " << ExtractCost << ".\n" << "SLP: Total Cost = " << Cost << ".\n"; @@ -4133,28 +4133,28 @@ InstructionCost BoUpSLP::getTreeCost() { LLVM_DEBUG(dbgs() << Str); if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); -#endif +#endif return Cost; } -InstructionCost -BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const { +InstructionCost +BoUpSLP::getGatherCost(FixedVectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices) const { unsigned NumElts = Ty->getNumElements(); APInt DemandedElts = APInt::getNullValue(NumElts); - for (unsigned I = 0; I < NumElts; ++I) - if (!ShuffledIndices.count(I)) - DemandedElts.setBit(I); - InstructionCost Cost = - TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, - /*Extract*/ false); + for (unsigned I = 0; I < NumElts; ++I) + if (!ShuffledIndices.count(I)) + DemandedElts.setBit(I); + InstructionCost Cost = + TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, + /*Extract*/ false); if (!ShuffledIndices.empty()) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } -InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { +InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -4196,10 +4196,10 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { // should be in this block. 
auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { - auto *I = cast<Instruction>(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { + auto *I = cast<Instruction>(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -4252,30 +4252,30 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::gather(ArrayRef<Value *> VL) { - Value *Val0 = - isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; - FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); - Value *Vec = PoisonValue::get(VecTy); - unsigned InsIndex = 0; - for (Value *Val : VL) { - Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++)); - auto *InsElt = dyn_cast<InsertElementInst>(Vec); - if (!InsElt) - continue; - GatherSeq.insert(InsElt); - CSEBlocks.insert(InsElt->getParent()); - // Add to our 'need-to-extract' list. - if (TreeEntry *Entry = getTreeEntry(Val)) { - // Find which lane we need to extract. - unsigned FoundLane = std::distance(Entry->Scalars.begin(), - find(Entry->Scalars, Val)); - assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane"); - if (!Entry->ReuseShuffleIndices.empty()) { - FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), - find(Entry->ReuseShuffleIndices, FoundLane)); - } - ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane)); +Value *BoUpSLP::gather(ArrayRef<Value *> VL) { + Value *Val0 = + isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; + FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); + Value *Vec = PoisonValue::get(VecTy); + unsigned InsIndex = 0; + for (Value *Val : VL) { + Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++)); + auto *InsElt = dyn_cast<InsertElementInst>(Vec); + if (!InsElt) + continue; + GatherSeq.insert(InsElt); + CSEBlocks.insert(InsElt->getParent()); + // Add to our 'need-to-extract' list. + if (TreeEntry *Entry = getTreeEntry(Val)) { + // Find which lane we need to extract. 
+ unsigned FoundLane = std::distance(Entry->Scalars.begin(), + find(Entry->Scalars, Val)); + assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane"); + if (!Entry->ReuseShuffleIndices.empty()) { + FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), + find(Entry->ReuseShuffleIndices, FoundLane)); + } + ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane)); } } @@ -4299,7 +4299,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { for (int Idx : E->ReuseShuffleIndices) if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UniqueIdxs); + V = Builder.CreateShuffleVector(V, UniqueIdxs); } } return V; @@ -4327,15 +4327,15 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { VL = UniqueValues; } - Value *Vec = gather(VL); + Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { - Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle"); - if (auto *I = dyn_cast<Instruction>(Vec)) { + Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); } } - return Vec; + return Vec; } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { @@ -4349,28 +4349,28 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); - Value *Vec = gather(E->Scalars); + Value *Vec = gather(E->Scalars); if (NeedToShuffleReuses) { - Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast<Instruction>(Vec)) { + Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); } } - E->VectorizedValue = Vec; - return Vec; + E->VectorizedValue = Vec; + return Vec; } - assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && - "Unhandled state"); + assert((E->State == TreeEntry::Vectorize || + E->State == TreeEntry::ScatterVectorize) && + "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); - Instruction *VL0 = E->getMainOp(); - Type *ScalarTy = VL0->getType(); - if (auto *Store = dyn_cast<StoreInst>(VL0)) - ScalarTy = Store->getValueOperand()->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); + Instruction *VL0 = E->getMainOp(); + Type *ScalarTy = VL0->getType(); + if (auto *Store = dyn_cast<StoreInst>(VL0)) + ScalarTy = Store->getValueOperand()->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { auto *PH = cast<PHINode>(VL0); @@ -4378,9 +4378,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; // PHINodes may have multiple entries from the same block. 
We want to @@ -4413,33 +4413,33 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); + V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. if (E->ReorderIndices.empty()) Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - auto *LI = cast<LoadInst>(E->getSingleOperand(0)); + auto *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); - auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); + auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); if (!E->ReorderIndices.empty()) { SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); - NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle"); + NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. - NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices, - "shuffle"); + NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices, + "shuffle"); } E->VectorizedValue = NewV; return NewV; @@ -4467,9 +4467,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4489,9 +4489,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4509,9 +4509,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateSelect(Cond, True, False); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4532,9 +4532,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4575,9 +4575,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, 
E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4592,40 +4592,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); - Instruction *NewLI; + Instruction *NewLI; unsigned AS = LI->getPointerAddressSpace(); - Value *PO = LI->getPointerOperand(); - if (E->State == TreeEntry::Vectorize) { - - Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); - - // The pointer operand uses an in-tree scalar so we add the new BitCast - // to ExternalUses list to make sure that an extract will be generated - // in the future. - if (getTreeEntry(PO)) - ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0); - - NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); - } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); - Value *VecPtr = vectorizeTree(E->getOperand(0)); - // Use the minimum alignment of the gathered loads. - Align CommonAlignment = LI->getAlign(); - for (Value *V : E->Scalars) - CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); - NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment); - } - Value *V = propagateMetadata(NewLI, E->Scalars); - + Value *PO = LI->getPointerOperand(); + if (E->State == TreeEntry::Vectorize) { + + Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); + + // The pointer operand uses an in-tree scalar so we add the new BitCast + // to ExternalUses list to make sure that an extract will be generated + // in the future. + if (getTreeEntry(PO)) + ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0); + + NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); + } else { + assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); + Value *VecPtr = vectorizeTree(E->getOperand(0)); + // Use the minimum alignment of the gathered loads. + Align CommonAlignment = LI->getAlign(); + for (Value *V : E->Scalars) + CommonAlignment = + commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment); + } + Value *V = propagateMetadata(NewLI, E->Scalars); + if (IsReorder) { SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); - V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); + V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. 
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; @@ -4643,7 +4643,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (IsReorder) { SmallVector<int, 4> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); - VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf"); + VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf"); } Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( @@ -4658,9 +4658,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); Value *V = propagateMetadata(ST, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4697,9 +4697,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4739,10 +4739,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Function *CF; if (!UseIntrinsic) { - VFShape Shape = - VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), - false /*HasGlobalPred*/); + VFShape Shape = + VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( + VecTy->getNumElements())), + false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())}; @@ -4760,9 +4760,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4827,9 +4827,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4894,8 +4894,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(E->State != TreeEntry::NeedToGather && - "Extracting from a gather list"); + assert(E->State != TreeEntry::NeedToGather && + "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -5053,8 +5053,8 @@ void BoUpSLP::optimizeGatherSequence() { // instructions into different buckets based on the insert lane. 
SmallVector<Instruction *, 16> Visited; for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { - assert(*I && - (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && + assert(*I && + (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"); BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: @@ -5164,7 +5164,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // cancelScheduling). while (!Bundle->isReady() && !ReadyInsts.empty()) { - ScheduleData *pickedSD = ReadyInsts.pop_back_val(); + ScheduleData *pickedSD = ReadyInsts.pop_back_val(); if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { schedule(pickedSD, ReadyInsts); @@ -5308,9 +5308,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, if (I->mayReadOrWriteMemory() && (!isa<IntrinsicInst>(I) || - (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect && - cast<IntrinsicInst>(I)->getIntrinsicID() != - Intrinsic::pseudoprobe))) { + (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect && + cast<IntrinsicInst>(I)->getIntrinsicID() != + Intrinsic::pseudoprobe))) { // Update the linked list of memory accessing instructions. if (CurrentLoadStore) { CurrentLoadStore->NextLoadStore = SD; @@ -5337,7 +5337,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, WorkList.push_back(SD); while (!WorkList.empty()) { - ScheduleData *SD = WorkList.pop_back_val(); + ScheduleData *SD = WorkList.pop_back_val(); ScheduleData *BundleMember = SD; while (BundleMember) { @@ -5534,15 +5534,15 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { } unsigned BoUpSLP::getVectorElementSize(Value *V) { - // If V is a store, just return the width of the stored value (or value - // truncated just before storing) without traversing the expression tree. - // This is the common case. - if (auto *Store = dyn_cast<StoreInst>(V)) { - if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) - return DL->getTypeSizeInBits(Trunc->getSrcTy()); - else - return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); - } + // If V is a store, just return the width of the stored value (or value + // truncated just before storing) without traversing the expression tree. + // This is the common case. 
+ if (auto *Store = dyn_cast<StoreInst>(V)) { + if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) + return DL->getTypeSizeInBits(Trunc->getSrcTy()); + else + return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + } auto E = InstrElementSize.find(V); if (E != InstrElementSize.end()) @@ -5891,7 +5891,7 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AAResults *AA_, + TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { @@ -5991,11 +5991,11 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(); - LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); using namespace ore; @@ -6068,7 +6068,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, // If a vector register can't hold 1 element, we are done. unsigned MaxVecRegSize = R.getMaxVecRegSize(); - unsigned EltSize = R.getVectorElementSize(Operands[0]); + unsigned EltSize = R.getVectorElementSize(Operands[0]); if (MaxVecRegSize % EltSize != 0) continue; @@ -6119,7 +6119,7 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { continue; if (!isValidElementType(SI->getValueOperand()->getType())) continue; - Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); + Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); } // Ignore getelementptr instructions that have more than one index, a @@ -6183,7 +6183,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); - MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); + MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) @@ -6195,7 +6195,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool Changed = false; bool CandidateFound = false; - InstructionCost MinCost = SLPCostThreshold.getValue(); + InstructionCost MinCost = SLPCostThreshold.getValue(); bool CompensateUseCost = !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) { @@ -6251,7 +6251,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, continue; R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(); CandidateFound = true; if (CompensateUseCost) { // TODO: Use TTI's getScalarizationOverhead for sequence of inserts @@ -6261,7 +6261,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // part should also switch to same interface. 
// For example, the following case is projected code after SLP: // %4 = extractelement <4 x i64> %3, i32 0 - // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0 + // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0 // %5 = extractelement <4 x i64> %3, i32 1 // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1 // %6 = extractelement <4 x i64> %3, i32 2 @@ -6281,7 +6281,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // Switching to the TTI interface might help a bit. // Alternative solution could be pattern-match to detect a no-op or // shuffle. - InstructionCost UserCost = 0; + InstructionCost UserCost = 0; for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]); if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) @@ -6376,16 +6376,16 @@ namespace { /// Model horizontal reductions. /// -/// A horizontal reduction is a tree of reduction instructions that has values -/// that can be put into a vector as its leaves. For example: +/// A horizontal reduction is a tree of reduction instructions that has values +/// that can be put into a vector as its leaves. For example: /// /// mul mul mul mul /// \ / \ / /// + + /// \ / /// + -/// This tree has "mul" as its leaf values and "+" as its reduction -/// instructions. A reduction can feed into a store or a binary operation +/// This tree has "mul" as its leaf values and "+" as its reduction +/// instructions. A reduction can feed into a store or a binary operation /// feeding a phi. /// ... /// \ / @@ -6403,345 +6403,345 @@ namespace { class HorizontalReduction { using ReductionOpsType = SmallVector<Value *, 16>; using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; - ReductionOpsListType ReductionOps; + ReductionOpsListType ReductionOps; SmallVector<Value *, 32> ReducedVals; // Use map vector to make stable output. MapVector<Instruction *, Value *> ExtraArgs; - WeakTrackingVH ReductionRoot; - /// The type of reduction operation. - RecurKind RdxKind; - - /// Checks if instruction is associative and can be vectorized. - static bool isVectorizable(RecurKind Kind, Instruction *I) { - if (Kind == RecurKind::None) - return false; - if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind)) - return true; - - if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { - // FP min/max are associative except for NaN and -0.0. We do not - // have to rule out -0.0 here because the intrinsic semantics do not - // specify a fixed result for it. - return I->getFastMathFlags().noNaNs(); - } - - return I->isAssociative(); - } - - /// Checks if the ParentStackElem.first should be marked as a reduction - /// operation with an extra argument or as extra argument itself. - void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, - Value *ExtraArg) { - if (ExtraArgs.count(ParentStackElem.first)) { - ExtraArgs[ParentStackElem.first] = nullptr; - // We ran into something like: - // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. - // The whole ParentStackElem.first should be considered as an extra value - // in this case. - // Do not perform analysis of remaining operands of ParentStackElem.first - // instruction, this whole instruction is an extra argument. - RecurKind ParentRdxKind = getRdxKind(ParentStackElem.first); - ParentStackElem.second = getNumberOfOperands(ParentRdxKind); - } else { - // We ran into something like: - // ParentStackElem.first += ... + ExtraArg + ... 
- ExtraArgs[ParentStackElem.first] = ExtraArg; - } - } - - /// Creates reduction operation with the current opcode. - static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, - Value *RHS, const Twine &Name) { - unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); - switch (Kind) { - case RecurKind::Add: - case RecurKind::Mul: - case RecurKind::Or: - case RecurKind::And: - case RecurKind::Xor: - case RecurKind::FAdd: - case RecurKind::FMul: - return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, - Name); - case RecurKind::FMax: - return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); - case RecurKind::FMin: - return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); - - case RecurKind::SMax: { - Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - case RecurKind::SMin: { - Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - case RecurKind::UMax: { - Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - case RecurKind::UMin: { - Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - default: - llvm_unreachable("Unknown reduction operation."); - } - } - - /// Creates reduction operation with the current opcode with the IR flags - /// from \p ReductionOps. - static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, - const ReductionOpsListType &ReductionOps) { - Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); - if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { - if (auto *Sel = dyn_cast<SelectInst>(Op)) - propagateIRFlags(Sel->getCondition(), ReductionOps[0]); - propagateIRFlags(Op, ReductionOps[1]); - return Op; - } - propagateIRFlags(Op, ReductionOps[0]); - return Op; - } - /// Creates reduction operation with the current opcode with the IR flags - /// from \p I. 
- static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, Instruction *I) { - Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); - if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { - if (auto *Sel = dyn_cast<SelectInst>(Op)) { - propagateIRFlags(Sel->getCondition(), - cast<SelectInst>(I)->getCondition()); - } - } - propagateIRFlags(Op, I); - return Op; - } - - static RecurKind getRdxKind(Instruction *I) { - assert(I && "Expected instruction for reduction matching"); - TargetTransformInfo::ReductionFlags RdxFlags; - if (match(I, m_Add(m_Value(), m_Value()))) - return RecurKind::Add; - if (match(I, m_Mul(m_Value(), m_Value()))) - return RecurKind::Mul; - if (match(I, m_And(m_Value(), m_Value()))) - return RecurKind::And; - if (match(I, m_Or(m_Value(), m_Value()))) - return RecurKind::Or; - if (match(I, m_Xor(m_Value(), m_Value()))) - return RecurKind::Xor; - if (match(I, m_FAdd(m_Value(), m_Value()))) - return RecurKind::FAdd; - if (match(I, m_FMul(m_Value(), m_Value()))) - return RecurKind::FMul; - - if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) - return RecurKind::FMax; - if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) - return RecurKind::FMin; - - if (match(I, m_SMax(m_Value(), m_Value()))) - return RecurKind::SMax; - if (match(I, m_SMin(m_Value(), m_Value()))) - return RecurKind::SMin; - if (match(I, m_UMax(m_Value(), m_Value()))) - return RecurKind::UMax; - if (match(I, m_UMin(m_Value(), m_Value()))) - return RecurKind::UMin; - - if (auto *Select = dyn_cast<SelectInst>(I)) { - // Try harder: look for min/max pattern based on instructions producing - // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). - // During the intermediate stages of SLP, it's very common to have - // pattern like this (since optimizeGatherSequence is run only once - // at the end): - // %1 = extractelement <2 x i32> %a, i32 0 - // %2 = extractelement <2 x i32> %a, i32 1 - // %cond = icmp sgt i32 %1, %2 - // %3 = extractelement <2 x i32> %a, i32 0 - // %4 = extractelement <2 x i32> %a, i32 1 - // %select = select i1 %cond, i32 %3, i32 %4 - CmpInst::Predicate Pred; - Instruction *L1; - Instruction *L2; - - Value *LHS = Select->getTrueValue(); - Value *RHS = Select->getFalseValue(); - Value *Cond = Select->getCondition(); - - // TODO: Support inverse predicates. 
- if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { - if (!isa<ExtractElementInst>(RHS) || - !L2->isIdenticalTo(cast<Instruction>(RHS))) - return RecurKind::None; - } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { - if (!isa<ExtractElementInst>(LHS) || - !L1->isIdenticalTo(cast<Instruction>(LHS))) - return RecurKind::None; - } else { - if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) - return RecurKind::None; - if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || - !L1->isIdenticalTo(cast<Instruction>(LHS)) || - !L2->isIdenticalTo(cast<Instruction>(RHS))) - return RecurKind::None; - } - - TargetTransformInfo::ReductionFlags RdxFlags; - switch (Pred) { - default: - return RecurKind::None; - case CmpInst::ICMP_SGT: - case CmpInst::ICMP_SGE: - return RecurKind::SMax; - case CmpInst::ICMP_SLT: - case CmpInst::ICMP_SLE: - return RecurKind::SMin; - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_UGE: - return RecurKind::UMax; - case CmpInst::ICMP_ULT: - case CmpInst::ICMP_ULE: - return RecurKind::UMin; - } - } - return RecurKind::None; - } - - /// Return true if this operation is a cmp+select idiom. - static bool isCmpSel(RecurKind Kind) { - return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind); - } - - /// Get the index of the first operand. - static unsigned getFirstOperandIndex(RecurKind Kind) { - // We allow calling this before 'Kind' is set, so handle that specially. - if (Kind == RecurKind::None) - return 0; - return isCmpSel(Kind) ? 1 : 0; - } - - /// Total number of operands in the reduction operation. - static unsigned getNumberOfOperands(RecurKind Kind) { - return isCmpSel(Kind) ? 3 : 2; - } - - /// Checks if the instruction is in basic block \p BB. - /// For a min/max reduction check that both compare and select are in \p BB. - static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB, - bool IsRedOp) { - if (IsRedOp && isCmpSel(Kind)) { - auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); - return I->getParent() == BB && Cmp && Cmp->getParent() == BB; - } - return I->getParent() == BB; - } - - /// Expected number of uses for reduction operations/reduced values. - static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I, - bool IsReductionOp) { - // SelectInst must be used twice while the condition op must have single - // use only. - if (isCmpSel(Kind)) - return I->hasNUses(2) && - (!IsReductionOp || - cast<SelectInst>(I)->getCondition()->hasOneUse()); - - // Arithmetic reduction operation must be used once only. - return I->hasOneUse(); - } - - /// Initializes the list of reduction operations. - void initReductionOps(RecurKind Kind) { - if (isCmpSel(Kind)) - ReductionOps.assign(2, ReductionOpsType()); - else - ReductionOps.assign(1, ReductionOpsType()); - } - - /// Add all reduction operations for the reduction instruction \p I. 
- void addReductionOps(RecurKind Kind, Instruction *I) { - assert(Kind != RecurKind::None && "Expected reduction operation."); - if (isCmpSel(Kind)) { - ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); - ReductionOps[1].emplace_back(I); - } else { - ReductionOps[0].emplace_back(I); - } - } - - static Value *getLHS(RecurKind Kind, Instruction *I) { - if (Kind == RecurKind::None) - return nullptr; - return I->getOperand(getFirstOperandIndex(Kind)); - } - static Value *getRHS(RecurKind Kind, Instruction *I) { - if (Kind == RecurKind::None) - return nullptr; - return I->getOperand(getFirstOperandIndex(Kind) + 1); - } - + WeakTrackingVH ReductionRoot; + /// The type of reduction operation. + RecurKind RdxKind; + + /// Checks if instruction is associative and can be vectorized. + static bool isVectorizable(RecurKind Kind, Instruction *I) { + if (Kind == RecurKind::None) + return false; + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind)) + return true; + + if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { + // FP min/max are associative except for NaN and -0.0. We do not + // have to rule out -0.0 here because the intrinsic semantics do not + // specify a fixed result for it. + return I->getFastMathFlags().noNaNs(); + } + + return I->isAssociative(); + } + + /// Checks if the ParentStackElem.first should be marked as a reduction + /// operation with an extra argument or as extra argument itself. + void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, + Value *ExtraArg) { + if (ExtraArgs.count(ParentStackElem.first)) { + ExtraArgs[ParentStackElem.first] = nullptr; + // We ran into something like: + // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. + // The whole ParentStackElem.first should be considered as an extra value + // in this case. + // Do not perform analysis of remaining operands of ParentStackElem.first + // instruction, this whole instruction is an extra argument. + RecurKind ParentRdxKind = getRdxKind(ParentStackElem.first); + ParentStackElem.second = getNumberOfOperands(ParentRdxKind); + } else { + // We ran into something like: + // ParentStackElem.first += ... + ExtraArg + ... + ExtraArgs[ParentStackElem.first] = ExtraArg; + } + } + + /// Creates reduction operation with the current opcode. 
+ static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, + Value *RHS, const Twine &Name) { + unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); + switch (Kind) { + case RecurKind::Add: + case RecurKind::Mul: + case RecurKind::Or: + case RecurKind::And: + case RecurKind::Xor: + case RecurKind::FAdd: + case RecurKind::FMul: + return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, + Name); + case RecurKind::FMax: + return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); + case RecurKind::FMin: + return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); + + case RecurKind::SMax: { + Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + case RecurKind::SMin: { + Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + case RecurKind::UMax: { + Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + case RecurKind::UMin: { + Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + default: + llvm_unreachable("Unknown reduction operation."); + } + } + + /// Creates reduction operation with the current opcode with the IR flags + /// from \p ReductionOps. + static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, + Value *RHS, const Twine &Name, + const ReductionOpsListType &ReductionOps) { + Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { + if (auto *Sel = dyn_cast<SelectInst>(Op)) + propagateIRFlags(Sel->getCondition(), ReductionOps[0]); + propagateIRFlags(Op, ReductionOps[1]); + return Op; + } + propagateIRFlags(Op, ReductionOps[0]); + return Op; + } + /// Creates reduction operation with the current opcode with the IR flags + /// from \p I. 
+ static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, + Value *RHS, const Twine &Name, Instruction *I) { + Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { + if (auto *Sel = dyn_cast<SelectInst>(Op)) { + propagateIRFlags(Sel->getCondition(), + cast<SelectInst>(I)->getCondition()); + } + } + propagateIRFlags(Op, I); + return Op; + } + + static RecurKind getRdxKind(Instruction *I) { + assert(I && "Expected instruction for reduction matching"); + TargetTransformInfo::ReductionFlags RdxFlags; + if (match(I, m_Add(m_Value(), m_Value()))) + return RecurKind::Add; + if (match(I, m_Mul(m_Value(), m_Value()))) + return RecurKind::Mul; + if (match(I, m_And(m_Value(), m_Value()))) + return RecurKind::And; + if (match(I, m_Or(m_Value(), m_Value()))) + return RecurKind::Or; + if (match(I, m_Xor(m_Value(), m_Value()))) + return RecurKind::Xor; + if (match(I, m_FAdd(m_Value(), m_Value()))) + return RecurKind::FAdd; + if (match(I, m_FMul(m_Value(), m_Value()))) + return RecurKind::FMul; + + if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) + return RecurKind::FMax; + if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) + return RecurKind::FMin; + + if (match(I, m_SMax(m_Value(), m_Value()))) + return RecurKind::SMax; + if (match(I, m_SMin(m_Value(), m_Value()))) + return RecurKind::SMin; + if (match(I, m_UMax(m_Value(), m_Value()))) + return RecurKind::UMax; + if (match(I, m_UMin(m_Value(), m_Value()))) + return RecurKind::UMin; + + if (auto *Select = dyn_cast<SelectInst>(I)) { + // Try harder: look for min/max pattern based on instructions producing + // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). + // During the intermediate stages of SLP, it's very common to have + // pattern like this (since optimizeGatherSequence is run only once + // at the end): + // %1 = extractelement <2 x i32> %a, i32 0 + // %2 = extractelement <2 x i32> %a, i32 1 + // %cond = icmp sgt i32 %1, %2 + // %3 = extractelement <2 x i32> %a, i32 0 + // %4 = extractelement <2 x i32> %a, i32 1 + // %select = select i1 %cond, i32 %3, i32 %4 + CmpInst::Predicate Pred; + Instruction *L1; + Instruction *L2; + + Value *LHS = Select->getTrueValue(); + Value *RHS = Select->getFalseValue(); + Value *Cond = Select->getCondition(); + + // TODO: Support inverse predicates. 
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { + if (!isa<ExtractElementInst>(RHS) || + !L2->isIdenticalTo(cast<Instruction>(RHS))) + return RecurKind::None; + } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { + if (!isa<ExtractElementInst>(LHS) || + !L1->isIdenticalTo(cast<Instruction>(LHS))) + return RecurKind::None; + } else { + if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) + return RecurKind::None; + if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || + !L1->isIdenticalTo(cast<Instruction>(LHS)) || + !L2->isIdenticalTo(cast<Instruction>(RHS))) + return RecurKind::None; + } + + TargetTransformInfo::ReductionFlags RdxFlags; + switch (Pred) { + default: + return RecurKind::None; + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SGE: + return RecurKind::SMax; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SLE: + return RecurKind::SMin; + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + return RecurKind::UMax; + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + return RecurKind::UMin; + } + } + return RecurKind::None; + } + + /// Return true if this operation is a cmp+select idiom. + static bool isCmpSel(RecurKind Kind) { + return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind); + } + + /// Get the index of the first operand. + static unsigned getFirstOperandIndex(RecurKind Kind) { + // We allow calling this before 'Kind' is set, so handle that specially. + if (Kind == RecurKind::None) + return 0; + return isCmpSel(Kind) ? 1 : 0; + } + + /// Total number of operands in the reduction operation. + static unsigned getNumberOfOperands(RecurKind Kind) { + return isCmpSel(Kind) ? 3 : 2; + } + + /// Checks if the instruction is in basic block \p BB. + /// For a min/max reduction check that both compare and select are in \p BB. + static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB, + bool IsRedOp) { + if (IsRedOp && isCmpSel(Kind)) { + auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); + return I->getParent() == BB && Cmp && Cmp->getParent() == BB; + } + return I->getParent() == BB; + } + + /// Expected number of uses for reduction operations/reduced values. + static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I, + bool IsReductionOp) { + // SelectInst must be used twice while the condition op must have single + // use only. + if (isCmpSel(Kind)) + return I->hasNUses(2) && + (!IsReductionOp || + cast<SelectInst>(I)->getCondition()->hasOneUse()); + + // Arithmetic reduction operation must be used once only. + return I->hasOneUse(); + } + + /// Initializes the list of reduction operations. + void initReductionOps(RecurKind Kind) { + if (isCmpSel(Kind)) + ReductionOps.assign(2, ReductionOpsType()); + else + ReductionOps.assign(1, ReductionOpsType()); + } + + /// Add all reduction operations for the reduction instruction \p I. 
+ void addReductionOps(RecurKind Kind, Instruction *I) { + assert(Kind != RecurKind::None && "Expected reduction operation."); + if (isCmpSel(Kind)) { + ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); + ReductionOps[1].emplace_back(I); + } else { + ReductionOps[0].emplace_back(I); + } + } + + static Value *getLHS(RecurKind Kind, Instruction *I) { + if (Kind == RecurKind::None) + return nullptr; + return I->getOperand(getFirstOperandIndex(Kind)); + } + static Value *getRHS(RecurKind Kind, Instruction *I) { + if (Kind == RecurKind::None) + return nullptr; + return I->getOperand(getFirstOperandIndex(Kind) + 1); + } + public: HorizontalReduction() = default; /// Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, Instruction *B) { assert((!Phi || is_contained(Phi->operands(), B)) && - "Phi needs to use the binary operator"); + "Phi needs to use the binary operator"); - RdxKind = getRdxKind(B); + RdxKind = getRdxKind(B); // We could have a initial reductions that is not an add. // r *= v1 + v2 + v3 + v4 // In such a case start looking for a tree rooted in the first '+'. if (Phi) { - if (getLHS(RdxKind, B) == Phi) { + if (getLHS(RdxKind, B) == Phi) { Phi = nullptr; - B = dyn_cast<Instruction>(getRHS(RdxKind, B)); - if (!B) - return false; - RdxKind = getRdxKind(B); - } else if (getRHS(RdxKind, B) == Phi) { + B = dyn_cast<Instruction>(getRHS(RdxKind, B)); + if (!B) + return false; + RdxKind = getRdxKind(B); + } else if (getRHS(RdxKind, B) == Phi) { Phi = nullptr; - B = dyn_cast<Instruction>(getLHS(RdxKind, B)); - if (!B) - return false; - RdxKind = getRdxKind(B); + B = dyn_cast<Instruction>(getLHS(RdxKind, B)); + if (!B) + return false; + RdxKind = getRdxKind(B); } } - if (!isVectorizable(RdxKind, B)) + if (!isVectorizable(RdxKind, B)) return false; - // Analyze "regular" integer/FP types for reductions - no target-specific - // types or pointers. + // Analyze "regular" integer/FP types for reductions - no target-specific + // types or pointers. Type *Ty = B->getType(); - if (!isValidElementType(Ty) || Ty->isPointerTy()) + if (!isValidElementType(Ty) || Ty->isPointerTy()) return false; ReductionRoot = B; - // The opcode for leaf values that we perform a reduction on. - // For example: load(x) + load(y) + load(z) + fptoui(w) - // The leaf opcode for 'w' does not match, so we don't include it as a - // potential candidate for the reduction. - unsigned LeafOpcode = 0; - + // The opcode for leaf values that we perform a reduction on. + // For example: load(x) + load(y) + load(z) + fptoui(w) + // The leaf opcode for 'w' does not match, so we don't include it as a + // potential candidate for the reduction. + unsigned LeafOpcode = 0; + // Post order traverse the reduction tree starting at B. We only handle true // trees containing only binary operators. SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; - Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind))); - initReductionOps(RdxKind); + Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind))); + initReductionOps(RdxKind); while (!Stack.empty()) { Instruction *TreeN = Stack.back().first; - unsigned EdgeToVisit = Stack.back().second++; - const RecurKind TreeRdxKind = getRdxKind(TreeN); - bool IsReducedValue = TreeRdxKind != RdxKind; + unsigned EdgeToVisit = Stack.back().second++; + const RecurKind TreeRdxKind = getRdxKind(TreeN); + bool IsReducedValue = TreeRdxKind != RdxKind; - // Postorder visit. 
- if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) { + // Postorder visit. + if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) { if (IsReducedValue) ReducedVals.push_back(TreeN); else { @@ -6759,7 +6759,7 @@ public: markExtraArg(Stack[Stack.size() - 2], TreeN); ExtraArgs.erase(TreeN); } else - addReductionOps(RdxKind, TreeN); + addReductionOps(RdxKind, TreeN); } // Retract. Stack.pop_back(); @@ -6767,72 +6767,72 @@ public: } // Visit left or right. - Value *EdgeVal = TreeN->getOperand(EdgeToVisit); - auto *I = dyn_cast<Instruction>(EdgeVal); - if (!I) { - // Edge value is not a reduction instruction or a leaf instruction. - // (It may be a constant, function argument, or something else.) - markExtraArg(Stack.back(), EdgeVal); - continue; - } - RecurKind EdgeRdxKind = getRdxKind(I); - // Continue analysis if the next operand is a reduction operation or - // (possibly) a leaf value. If the leaf value opcode is not set, - // the first met operation != reduction operation is considered as the - // leaf opcode. - // Only handle trees in the current basic block. - // Each tree node needs to have minimal number of users except for the - // ultimate reduction. - const bool IsRdxInst = EdgeRdxKind == RdxKind; - if (I != Phi && I != B && - hasSameParent(RdxKind, I, B->getParent(), IsRdxInst) && - hasRequiredNumberOfUses(RdxKind, I, IsRdxInst) && - (!LeafOpcode || LeafOpcode == I->getOpcode() || IsRdxInst)) { - if (IsRdxInst) { - // We need to be able to reassociate the reduction operations. - if (!isVectorizable(EdgeRdxKind, I)) { + Value *EdgeVal = TreeN->getOperand(EdgeToVisit); + auto *I = dyn_cast<Instruction>(EdgeVal); + if (!I) { + // Edge value is not a reduction instruction or a leaf instruction. + // (It may be a constant, function argument, or something else.) + markExtraArg(Stack.back(), EdgeVal); + continue; + } + RecurKind EdgeRdxKind = getRdxKind(I); + // Continue analysis if the next operand is a reduction operation or + // (possibly) a leaf value. If the leaf value opcode is not set, + // the first met operation != reduction operation is considered as the + // leaf opcode. + // Only handle trees in the current basic block. + // Each tree node needs to have minimal number of users except for the + // ultimate reduction. + const bool IsRdxInst = EdgeRdxKind == RdxKind; + if (I != Phi && I != B && + hasSameParent(RdxKind, I, B->getParent(), IsRdxInst) && + hasRequiredNumberOfUses(RdxKind, I, IsRdxInst) && + (!LeafOpcode || LeafOpcode == I->getOpcode() || IsRdxInst)) { + if (IsRdxInst) { + // We need to be able to reassociate the reduction operations. + if (!isVectorizable(EdgeRdxKind, I)) { // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } - } else if (!LeafOpcode) { - LeafOpcode = I->getOpcode(); + } else if (!LeafOpcode) { + LeafOpcode = I->getOpcode(); } - Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind))); - continue; + Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind))); + continue; } - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), I); + // I is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), I); } return true; } - /// Attempt to vectorize the tree found by matchAssociativeReduction. + /// Attempt to vectorize the tree found by matchAssociativeReduction. 
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { - // If there are a sufficient number of reduction values, reduce - // to a nearby power-of-2. We can safely generate oversized + // If there are a sufficient number of reduction values, reduce + // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); if (NumReducedVals < 4) return false; - // Intersect the fast-math-flags from all reduction operations. - FastMathFlags RdxFMF; - RdxFMF.set(); - for (ReductionOpsType &RdxOp : ReductionOps) { - for (Value *RdxVal : RdxOp) { - if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) - RdxFMF &= FPMO->getFastMathFlags(); - } - } + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (ReductionOpsType &RdxOp : ReductionOps) { + for (Value *RdxVal : RdxOp) { + if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) + RdxFMF &= FPMO->getFastMathFlags(); + } + } IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); - Builder.setFastMathFlags(RdxFMF); + Builder.setFastMathFlags(RdxFMF); BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; - // The same extra argument may be used several times, so log each attempt + // The same extra argument may be used several times, so log each attempt // to use it. - for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { + for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } @@ -6852,48 +6852,48 @@ public: // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; SmallVector<Value *, 16> IgnoreList; - for (ReductionOpsType &RdxOp : ReductionOps) - IgnoreList.append(RdxOp.begin(), RdxOp.end()); - - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - if (NumReducedVals > ReduxWidth) { - // In the loop below, we are building a tree based on a window of - // 'ReduxWidth' values. - // If the operands of those values have common traits (compare predicate, - // constant operand, etc), then we want to group those together to - // minimize the cost of the reduction. - - // TODO: This should be extended to count common operands for - // compares and binops. - - // Step 1: Count the number of times each compare predicate occurs. - SmallDenseMap<unsigned, unsigned> PredCountMap; - for (Value *RdxVal : ReducedVals) { - CmpInst::Predicate Pred; - if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) - ++PredCountMap[Pred]; - } - // Step 2: Sort the values so the most common predicates come first. - stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { - CmpInst::Predicate PredA, PredB; - if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && - match(B, m_Cmp(PredB, m_Value(), m_Value()))) { - return PredCountMap[PredA] > PredCountMap[PredB]; - } - return false; - }); - } - - Value *VectorizedTree = nullptr; - unsigned i = 0; + for (ReductionOpsType &RdxOp : ReductionOps) + IgnoreList.append(RdxOp.begin(), RdxOp.end()); + + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + if (NumReducedVals > ReduxWidth) { + // In the loop below, we are building a tree based on a window of + // 'ReduxWidth' values. + // If the operands of those values have common traits (compare predicate, + // constant operand, etc), then we want to group those together to + // minimize the cost of the reduction. 
+ + // TODO: This should be extended to count common operands for + // compares and binops. + + // Step 1: Count the number of times each compare predicate occurs. + SmallDenseMap<unsigned, unsigned> PredCountMap; + for (Value *RdxVal : ReducedVals) { + CmpInst::Predicate Pred; + if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) + ++PredCountMap[Pred]; + } + // Step 2: Sort the values so the most common predicates come first. + stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { + CmpInst::Predicate PredA, PredB; + if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && + match(B, m_Cmp(PredB, m_Value(), m_Value()))) { + return PredCountMap[PredA] > PredCountMap[PredB]; + } + return false; + }); + } + + Value *VectorizedTree = nullptr; + unsigned i = 0; while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { - ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); + ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); Optional<ArrayRef<unsigned>> Order = V.bestOrder(); - if (Order) { - assert(Order->size() == VL.size() && - "Order size must be the same as number of vectorized " - "instructions."); + if (Order) { + assert(Order->size() == VL.size() && + "Order size must be the same as number of vectorized " + "instructions."); // TODO: reorder tree nodes without tree rebuilding. SmallVector<Value *, 4> ReorderedOps(VL.size()); llvm::transform(*Order, ReorderedOps.begin(), @@ -6902,66 +6902,66 @@ public: } if (V.isTreeTinyAndNotFullyVectorizable()) break; - if (V.isLoadCombineReductionCandidate(RdxKind)) + if (V.isLoadCombineReductionCandidate(RdxKind)) break; V.computeMinimumValueSizes(); // Estimate cost. - InstructionCost TreeCost = V.getTreeCost(); - InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth); - InstructionCost Cost = TreeCost + ReductionCost; - if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return false; - } + InstructionCost TreeCost = V.getTreeCost(); + InstructionCost ReductionCost = + getReductionCost(TTI, ReducedVals[i], ReduxWidth); + InstructionCost Cost = TreeCost + ReductionCost; + if (!Cost.isValid()) { + LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + return false; + } if (Cost >= -SLPCostThreshold) { - V.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", - cast<Instruction>(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " << ore::NV("Cost", Cost) - << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); - }); - break; + V.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", + cast<Instruction>(VL[0])) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + break; } LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". 
(HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", - cast<Instruction>(VL[0])) - << "Vectorized horizontal reduction with cost " - << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize()); + return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", + cast<Instruction>(VL[0])) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize()); }); // Vectorize a tree. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); - // Emit a reduction. If the root is a select (min/max idiom), the insert + // Emit a reduction. If the root is a select (min/max idiom), the insert // point is the compare condition of that select. Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); - if (isCmpSel(RdxKind)) + if (isCmpSel(RdxKind)) Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); else Builder.SetInsertPoint(RdxRootInst); Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - - if (!VectorizedTree) { - // Initialize the final value in the reduction. - VectorizedTree = ReducedSubTree; - } else { - // Update the final value in the reduction. + + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = ReducedSubTree; + } else { + // Update the final value in the reduction. Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - ReducedSubTree, "op.rdx", ReductionOps); - } + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + ReducedSubTree, "op.rdx", ReductionOps); + } i += ReduxWidth; ReduxWidth = PowerOf2Floor(NumReducedVals - i); } @@ -6971,15 +6971,15 @@ public: for (; i < NumReducedVals; ++i) { auto *I = cast<Instruction>(ReducedVals[i]); Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = - createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); + VectorizedTree = + createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); } for (auto &Pair : ExternallyUsedValues) { // Add each externally used value to the final reduction. for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - Pair.first, "op.extra", I); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + Pair.first, "op.extra", I); } } @@ -6987,7 +6987,7 @@ public: // select, we also have to RAUW for the compare instruction feeding the // reduction root. That's because the original compare may have extra uses // besides the final select of the reduction. - if (isCmpSel(RdxKind)) { + if (isCmpSel(RdxKind)) { if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) { Instruction *ScalarCmp = getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot)); @@ -7003,68 +7003,68 @@ public: return VectorizedTree != nullptr; } - unsigned numReductionValues() const { return ReducedVals.size(); } + unsigned numReductionValues() const { return ReducedVals.size(); } private: /// Calculate the cost of a reduction. 
- InstructionCost getReductionCost(TargetTransformInfo *TTI, - Value *FirstReducedVal, - unsigned ReduxWidth) { + InstructionCost getReductionCost(TargetTransformInfo *TTI, + Value *FirstReducedVal, + unsigned ReduxWidth) { Type *ScalarTy = FirstReducedVal->getType(); - FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); - InstructionCost VectorCost, ScalarCost; - switch (RdxKind) { - case RecurKind::Add: - case RecurKind::Mul: - case RecurKind::Or: - case RecurKind::And: - case RecurKind::Xor: - case RecurKind::FAdd: - case RecurKind::FMul: { - unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, - /*IsPairwiseForm=*/false); - ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); + FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); + InstructionCost VectorCost, ScalarCost; + switch (RdxKind) { + case RecurKind::Add: + case RecurKind::Mul: + case RecurKind::Or: + case RecurKind::And: + case RecurKind::Xor: + case RecurKind::FAdd: + case RecurKind::FMul: { + unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); + VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, + /*IsPairwiseForm=*/false); + ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); break; - } - case RecurKind::FMax: - case RecurKind::FMin: { - auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); - VectorCost = - TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*pairwise=*/false, /*unsigned=*/false); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy)); + } + case RecurKind::FMax: + case RecurKind::FMin: { + auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); + VectorCost = + TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*pairwise=*/false, /*unsigned=*/false); + ScalarCost = + TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy)); break; } - case RecurKind::SMax: - case RecurKind::SMin: - case RecurKind::UMax: - case RecurKind::UMin: { - auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); - bool IsUnsigned = - RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; - VectorCost = - TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*IsPairwiseForm=*/false, IsUnsigned); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + + case RecurKind::SMax: + case RecurKind::SMin: + case RecurKind::UMax: + case RecurKind::UMin: { + auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); + bool IsUnsigned = + RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; + VectorCost = + TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*IsPairwiseForm=*/false, IsUnsigned); + ScalarCost = + TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, CmpInst::makeCmpResultType(ScalarTy)); break; - } - default: + } + default: llvm_unreachable("Expected arithmetic or min/max reduction operation"); } - // Scalar cost is repeated for N-1 elements. - ScalarCost *= (ReduxWidth - 1); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost + // Scalar cost is repeated for N-1 elements. 
+ ScalarCost *= (ReduxWidth - 1); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost << " for reduction that starts with " << *FirstReducedVal - << " (It is a splitting reduction)\n"); - return VectorCost - ScalarCost; + << " (It is a splitting reduction)\n"); + return VectorCost - ScalarCost; } /// Emit a horizontal reduction of the vectorized value. @@ -7074,142 +7074,142 @@ private: assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, - ReductionOps.back()); - } -}; - -} // end anonymous namespace - -static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { - if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) - return cast<FixedVectorType>(IE->getType())->getNumElements(); - - unsigned AggregateSize = 1; - auto *IV = cast<InsertValueInst>(InsertInst); - Type *CurrentType = IV->getType(); - do { - if (auto *ST = dyn_cast<StructType>(CurrentType)) { - for (auto *Elt : ST->elements()) - if (Elt != ST->getElementType(0)) // check homogeneity - return None; - AggregateSize *= ST->getNumElements(); - CurrentType = ST->getElementType(0); - } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { - AggregateSize *= AT->getNumElements(); - CurrentType = AT->getElementType(); - } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) { - AggregateSize *= VT->getNumElements(); - return AggregateSize; - } else if (CurrentType->isSingleValueType()) { - return AggregateSize; - } else { - return None; - } - } while (true); -} - -static Optional<unsigned> getOperandIndex(Instruction *InsertInst, - unsigned OperandOffset) { - unsigned OperandIndex = OperandOffset; - if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { - if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { - auto *VT = cast<FixedVectorType>(IE->getType()); - OperandIndex *= VT->getNumElements(); - OperandIndex += CI->getZExtValue(); - return OperandIndex; - } - return None; - } - - auto *IV = cast<InsertValueInst>(InsertInst); - Type *CurrentType = IV->getType(); - for (unsigned int Index : IV->indices()) { - if (auto *ST = dyn_cast<StructType>(CurrentType)) { - OperandIndex *= ST->getNumElements(); - CurrentType = ST->getElementType(Index); - } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { - OperandIndex *= AT->getNumElements(); - CurrentType = AT->getElementType(); - } else { - return None; - } - OperandIndex += Index; - } - return OperandIndex; -} - -static bool findBuildAggregate_rec(Instruction *LastInsertInst, - TargetTransformInfo *TTI, - SmallVectorImpl<Value *> &BuildVectorOpds, - SmallVectorImpl<Value *> &InsertElts, - unsigned OperandOffset) { - do { - Value *InsertedOperand = LastInsertInst->getOperand(1); - Optional<unsigned> OperandIndex = - getOperandIndex(LastInsertInst, OperandOffset); - if (!OperandIndex) - return false; - if (isa<InsertElementInst>(InsertedOperand) || - isa<InsertValueInst>(InsertedOperand)) { - if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, - BuildVectorOpds, InsertElts, *OperandIndex)) - return false; - } else { - BuildVectorOpds[*OperandIndex] = InsertedOperand; - InsertElts[*OperandIndex] = LastInsertInst; - } - if (isa<UndefValue>(LastInsertInst->getOperand(0))) - return true; - LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); - } while (LastInsertInst != nullptr && - (isa<InsertValueInst>(LastInsertInst) || - isa<InsertElementInst>(LastInsertInst)) && - LastInsertInst->hasOneUse()); 
- return false; -} - + return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, + ReductionOps.back()); + } +}; + +} // end anonymous namespace + +static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { + if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) + return cast<FixedVectorType>(IE->getType())->getNumElements(); + + unsigned AggregateSize = 1; + auto *IV = cast<InsertValueInst>(InsertInst); + Type *CurrentType = IV->getType(); + do { + if (auto *ST = dyn_cast<StructType>(CurrentType)) { + for (auto *Elt : ST->elements()) + if (Elt != ST->getElementType(0)) // check homogeneity + return None; + AggregateSize *= ST->getNumElements(); + CurrentType = ST->getElementType(0); + } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { + AggregateSize *= AT->getNumElements(); + CurrentType = AT->getElementType(); + } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) { + AggregateSize *= VT->getNumElements(); + return AggregateSize; + } else if (CurrentType->isSingleValueType()) { + return AggregateSize; + } else { + return None; + } + } while (true); +} + +static Optional<unsigned> getOperandIndex(Instruction *InsertInst, + unsigned OperandOffset) { + unsigned OperandIndex = OperandOffset; + if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + auto *VT = cast<FixedVectorType>(IE->getType()); + OperandIndex *= VT->getNumElements(); + OperandIndex += CI->getZExtValue(); + return OperandIndex; + } + return None; + } + + auto *IV = cast<InsertValueInst>(InsertInst); + Type *CurrentType = IV->getType(); + for (unsigned int Index : IV->indices()) { + if (auto *ST = dyn_cast<StructType>(CurrentType)) { + OperandIndex *= ST->getNumElements(); + CurrentType = ST->getElementType(Index); + } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { + OperandIndex *= AT->getNumElements(); + CurrentType = AT->getElementType(); + } else { + return None; + } + OperandIndex += Index; + } + return OperandIndex; +} + +static bool findBuildAggregate_rec(Instruction *LastInsertInst, + TargetTransformInfo *TTI, + SmallVectorImpl<Value *> &BuildVectorOpds, + SmallVectorImpl<Value *> &InsertElts, + unsigned OperandOffset) { + do { + Value *InsertedOperand = LastInsertInst->getOperand(1); + Optional<unsigned> OperandIndex = + getOperandIndex(LastInsertInst, OperandOffset); + if (!OperandIndex) + return false; + if (isa<InsertElementInst>(InsertedOperand) || + isa<InsertValueInst>(InsertedOperand)) { + if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, + BuildVectorOpds, InsertElts, *OperandIndex)) + return false; + } else { + BuildVectorOpds[*OperandIndex] = InsertedOperand; + InsertElts[*OperandIndex] = LastInsertInst; + } + if (isa<UndefValue>(LastInsertInst->getOperand(0))) + return true; + LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); + } while (LastInsertInst != nullptr && + (isa<InsertValueInst>(LastInsertInst) || + isa<InsertElementInst>(LastInsertInst)) && + LastInsertInst->hasOneUse()); + return false; +} + /// Recognize construction of vectors like -/// %ra = insertelement <4 x float> poison, float %s0, i32 0 +/// %ra = insertelement <4 x float> poison, float %s0, i32 0 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 /// starting from the last insertelement or insertvalue instruction. 
/// -/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, +/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, /// {{float, float}, {float, float}}, [2 x {float, float}] and so on. /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. /// /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. -static bool findBuildAggregate(Instruction *LastInsertInst, - TargetTransformInfo *TTI, +static bool findBuildAggregate(Instruction *LastInsertInst, + TargetTransformInfo *TTI, SmallVectorImpl<Value *> &BuildVectorOpds, SmallVectorImpl<Value *> &InsertElts) { - + assert((isa<InsertElementInst>(LastInsertInst) || isa<InsertValueInst>(LastInsertInst)) && "Expected insertelement or insertvalue instruction!"); - - assert((BuildVectorOpds.empty() && InsertElts.empty()) && - "Expected empty result vectors!"); - - Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); - if (!AggregateSize) - return false; - BuildVectorOpds.resize(*AggregateSize); - InsertElts.resize(*AggregateSize); - - if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, - 0)) { - llvm::erase_value(BuildVectorOpds, nullptr); - llvm::erase_value(InsertElts, nullptr); - if (BuildVectorOpds.size() >= 2) - return true; - } - - return false; + + assert((BuildVectorOpds.empty() && InsertElts.empty()) && + "Expected empty result vectors!"); + + Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); + if (!AggregateSize) + return false; + BuildVectorOpds.resize(*AggregateSize); + InsertElts.resize(*AggregateSize); + + if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, + 0)) { + llvm::erase_value(BuildVectorOpds, nullptr); + llvm::erase_value(InsertElts, nullptr); + if (BuildVectorOpds.size() >= 2) + return true; + } + + return false; } static bool PhiTypeSorterFunc(Value *V, Value *V2) { @@ -7267,16 +7267,16 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, return nullptr; } -static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { - if (match(I, m_BinOp(m_Value(V0), m_Value(V1)))) - return true; - if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1)))) - return true; - if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)))) - return true; - return false; -} - +static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { + if (match(I, m_BinOp(m_Value(V0), m_Value(V1)))) + return true; + if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1)))) + return true; + if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)))) + return true; + return false; +} + /// Attempt to reduce a horizontal reduction. 
/// If it is legal to match a horizontal reduction feeding the phi node \a P /// with reduction operators \a Root (or one of its operands) in a basic block @@ -7316,10 +7316,10 @@ static bool tryToVectorizeHorReductionOrInstOperands( Instruction *Inst; unsigned Level; std::tie(Inst, Level) = Stack.pop_back_val(); - Value *B0, *B1; - bool IsBinop = matchRdxBop(Inst, B0, B1); - bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); - if (IsBinop || IsSelect) { + Value *B0, *B1; + bool IsBinop = matchRdxBop(Inst, B0, B1); + bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); + if (IsBinop || IsSelect) { HorizontalReduction HorRdx; if (HorRdx.matchAssociativeReduction(P, Inst)) { if (HorRdx.tryToReduce(R, TTI)) { @@ -7330,10 +7330,10 @@ static bool tryToVectorizeHorReductionOrInstOperands( continue; } } - if (P && IsBinop) { - Inst = dyn_cast<Instruction>(B0); + if (P && IsBinop) { + Inst = dyn_cast<Instruction>(B0); if (Inst == P) - Inst = dyn_cast<Instruction>(B1); + Inst = dyn_cast<Instruction>(B1); if (!Inst) { // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. @@ -7366,7 +7366,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI) { - auto *I = dyn_cast_or_null<Instruction>(V); + auto *I = dyn_cast_or_null<Instruction>(V); if (!I) return false; @@ -7388,7 +7388,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<Value *, 16> BuildVectorInsts; - if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); @@ -7475,7 +7475,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Look for the next elements with the same type. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; while (SameTypeIt != E && - (*SameTypeIt)->getType() == (*IncIt)->getType()) { + (*SameTypeIt)->getType() == (*IncIt)->getType()) { VisitedInstrs.insert(*SameTypeIt); ++SameTypeIt; } @@ -7507,17 +7507,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { SmallVector<Instruction *, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - // Skip instructions with scalable type. The num of elements is unknown at - // compile-time for scalable type. - if (isa<ScalableVectorType>(it->getType())) - continue; - + // Skip instructions with scalable type. The num of elements is unknown at + // compile-time for scalable type. + if (isa<ScalableVectorType>(it->getType())) + continue; + // Skip instructions marked for the deletion. if (R.isDeleted(&*it)) continue; // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { - if (it->use_empty() && KeyNodes.contains(&*it) && + if (it->use_empty() && KeyNodes.contains(&*it) && vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. @@ -7534,29 +7534,29 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Try to vectorize reductions that use PHINodes. 
if (PHINode *P = dyn_cast<PHINode>(it)) { // Check that the PHI is a reduction PHI. - if (P->getNumIncomingValues() == 2) { - // Try to match and vectorize a horizontal reduction. - if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, - TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } - } - // Try to vectorize the incoming values of the PHI, to catch reductions - // that feed into PHIs. - for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) { - // Skip if the incoming block is the current BB for now. Also, bypass - // unreachable IR for efficiency and to avoid crashing. - // TODO: Collect the skipped incoming values and try to vectorize them - // after processing BB. - if (BB == P->getIncomingBlock(I) || - !DT->isReachableFromEntry(P->getIncomingBlock(I))) - continue; - - Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), - P->getIncomingBlock(I), R, TTI); + if (P->getNumIncomingValues() == 2) { + // Try to match and vectorize a horizontal reduction. + if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, + TTI)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + continue; + } + } + // Try to vectorize the incoming values of the PHI, to catch reductions + // that feed into PHIs. + for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) { + // Skip if the incoming block is the current BB for now. Also, bypass + // unreachable IR for efficiency and to avoid crashing. + // TODO: Collect the skipped incoming values and try to vectorize them + // after processing BB. + if (BB == P->getIncomingBlock(I) || + !DT->isReachableFromEntry(P->getIncomingBlock(I))) + continue; + + Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), + P->getIncomingBlock(I), R, TTI); } continue; } @@ -7620,7 +7620,7 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { unsigned MaxElts = MaxVecRegSize / EltSize; for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { auto Len = std::min<unsigned>(BE - BI, MaxElts); - ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len); + ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. Note that we use a // SetVector here to preserve program order. If the index computations diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h index 8737016760..dd33853d34 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -61,19 +61,19 @@ class VPRecipeBuilder { /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a /// recipe that takes an additional VPInstruction for the mask. - VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan); + VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. - VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi, - VPlan &Plan) const; + VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi, + VPlan &Plan) const; /// Optimize the special case where the operand of \p I is a constant integer /// induction variable. 
VPWidenIntOrFpInductionRecipe * - tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, - VPlan &Plan) const; + tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, + VPlan &Plan) const; /// Handle non-loop phi nodes. Currently all such phi nodes are turned into /// a sequence of select instructions as the vectorizer currently performs diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp index b26399e0ae..e65b4ea4a7 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp @@ -20,10 +20,10 @@ #include "VPlanDominatorTree.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" -#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -58,69 +58,69 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { return OS; } -VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) - : SubclassID(SC), UnderlyingVal(UV), Def(Def) { - if (Def) - Def->addDefinedValue(this); -} - -VPValue::~VPValue() { - assert(Users.empty() && "trying to delete a VPValue with remaining users"); - if (Def) - Def->removeDefinedValue(this); -} - +VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) + : SubclassID(SC), UnderlyingVal(UV), Def(Def) { + if (Def) + Def->addDefinedValue(this); +} + +VPValue::~VPValue() { + assert(Users.empty() && "trying to delete a VPValue with remaining users"); + if (Def) + Def->removeDefinedValue(this); +} + void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { - if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def)) - R->print(OS, "", SlotTracker); + if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def)) + R->print(OS, "", SlotTracker); else printAsOperand(OS, SlotTracker); } -void VPValue::dump() const { - const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def); - VPSlotTracker SlotTracker( - (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); - print(dbgs(), SlotTracker); - dbgs() << "\n"; -} - -void VPDef::dump() const { - const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this); - VPSlotTracker SlotTracker( - (Instr && Instr->getParent()) ? 
Instr->getParent()->getPlan() : nullptr); - print(dbgs(), "", SlotTracker); - dbgs() << "\n"; -} - -VPUser *VPRecipeBase::toVPUser() { - if (auto *U = dyn_cast<VPInstruction>(this)) - return U; - if (auto *U = dyn_cast<VPWidenRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPWidenCallRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPWidenSelectRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPWidenGEPRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPBlendRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPInterleaveRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPReplicateRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPBranchOnMaskRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPWidenMemoryInstructionRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPReductionRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPPredInstPHIRecipe>(this)) - return U; - return nullptr; -} - +void VPValue::dump() const { + const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def); + VPSlotTracker SlotTracker( + (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); + print(dbgs(), SlotTracker); + dbgs() << "\n"; +} + +void VPDef::dump() const { + const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this); + VPSlotTracker SlotTracker( + (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); + print(dbgs(), "", SlotTracker); + dbgs() << "\n"; +} + +VPUser *VPRecipeBase::toVPUser() { + if (auto *U = dyn_cast<VPInstruction>(this)) + return U; + if (auto *U = dyn_cast<VPWidenRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenCallRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenSelectRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenGEPRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPBlendRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPInterleaveRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPReplicateRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPBranchOnMaskRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenMemoryInstructionRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPReductionRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPPredInstPHIRecipe>(this)) + return U; + return nullptr; +} + // Get the top-most entry block of \p Start. This is the entry block of the // containing VPlan. 
This function is templated to support both const and non-const blocks template <typename T> static T *getPlanEntry(T *Start) { @@ -200,43 +200,43 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { } void VPBlockBase::deleteCFG(VPBlockBase *Entry) { - SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry)); + SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry)); for (VPBlockBase *Block : Blocks) delete Block; } -VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { - iterator It = begin(); - while (It != end() && (isa<VPWidenPHIRecipe>(&*It) || - isa<VPWidenIntOrFpInductionRecipe>(&*It) || - isa<VPPredInstPHIRecipe>(&*It) || - isa<VPWidenCanonicalIVRecipe>(&*It))) - It++; - return It; -} - -Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { - if (!Def->getDef() && OrigLoop->isLoopInvariant(Def->getLiveInIRValue())) - return Def->getLiveInIRValue(); - - if (hasScalarValue(Def, Instance)) - return Data.PerPartScalars[Def][Instance.Part][Instance.Lane]; - - if (hasVectorValue(Def, Instance.Part)) { - assert(Data.PerPartOutput.count(Def)); - auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; - if (!VecPart->getType()->isVectorTy()) { - assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar"); - return VecPart; - } - // TODO: Cache created scalar values. - return Builder.CreateExtractElement(VecPart, - Builder.getInt32(Instance.Lane)); - } - return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); -} - +VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { + iterator It = begin(); + while (It != end() && (isa<VPWidenPHIRecipe>(&*It) || + isa<VPWidenIntOrFpInductionRecipe>(&*It) || + isa<VPPredInstPHIRecipe>(&*It) || + isa<VPWidenCanonicalIVRecipe>(&*It))) + It++; + return It; +} + +Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { + if (!Def->getDef() && OrigLoop->isLoopInvariant(Def->getLiveInIRValue())) + return Def->getLiveInIRValue(); + + if (hasScalarValue(Def, Instance)) + return Data.PerPartScalars[Def][Instance.Part][Instance.Lane]; + + if (hasVectorValue(Def, Instance.Part)) { + assert(Data.PerPartOutput.count(Def)); + auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; + if (!VecPart->getType()->isVectorTy()) { + assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar"); + return VecPart; + } + // TODO: Cache created scalar values. + return Builder.CreateExtractElement(VecPart, + Builder.getInt32(Instance.Lane)); + } + return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); +} + BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. @@ -354,24 +354,24 @@ void VPBasicBlock::execute(VPTransformState *State) { LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); } -void VPBasicBlock::dropAllReferences(VPValue *NewValue) { - for (VPRecipeBase &R : Recipes) { - for (auto *Def : R.definedValues()) - Def->replaceAllUsesWith(NewValue); - - if (auto *User = R.toVPUser()) - for (unsigned I = 0, E = User->getNumOperands(); I != E; I++) - User->setOperand(I, NewValue); - } -} - -void VPRegionBlock::dropAllReferences(VPValue *NewValue) { - for (VPBlockBase *Block : depth_first(Entry)) - // Drop all references in VPBasicBlocks and replace all uses with - // DummyValue. 
- Block->dropAllReferences(NewValue); -} - +void VPBasicBlock::dropAllReferences(VPValue *NewValue) { + for (VPRecipeBase &R : Recipes) { + for (auto *Def : R.definedValues()) + Def->replaceAllUsesWith(NewValue); + + if (auto *User = R.toVPUser()) + for (unsigned I = 0, E = User->getNumOperands(); I != E; I++) + User->setOperand(I, NewValue); + } +} + +void VPRegionBlock::dropAllReferences(VPValue *NewValue) { + for (VPBlockBase *Block : depth_first(Entry)) + // Drop all references in VPBasicBlocks and replace all uses with + // DummyValue. + Block->dropAllReferences(NewValue); +} + void VPRegionBlock::execute(VPTransformState *State) { ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); @@ -405,9 +405,9 @@ void VPRegionBlock::execute(VPTransformState *State) { for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { State->Instance->Part = Part; - assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); - for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; - ++Lane) { + assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; + ++Lane) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. for (VPBlockBase *Block : RPOT) { @@ -453,14 +453,14 @@ void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { insertAfter(InsertPos); } -void VPRecipeBase::moveBefore(VPBasicBlock &BB, - iplist<VPRecipeBase>::iterator I) { - assert(I == BB.end() || I->getParent() == &BB); - removeFromParent(); - Parent = &BB; - BB.getRecipeList().insert(I, this); -} - +void VPRecipeBase::moveBefore(VPBasicBlock &BB, + iplist<VPRecipeBase>::iterator I) { + assert(I == BB.end() || I->getParent() == &BB); + removeFromParent(); + Parent = &BB; + BB.getRecipeList().insert(I, this); +} + void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; @@ -498,14 +498,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, case VPInstruction::ActiveLaneMask: { // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), {Part, 0}); - // Get the original loop tripcount. - Value *ScalarTC = State.TripCount; + // Get the original loop tripcount. 
+ Value *ScalarTC = State.TripCount; auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); Instruction *Call = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); + Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, + {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); State.set(this, Call, Part); break; } @@ -520,14 +520,14 @@ void VPInstruction::execute(VPTransformState &State) { generateInstruction(State, Part); } -void VPInstruction::dump() const { - VPSlotTracker SlotTracker(getParent()->getPlan()); - print(dbgs(), "", SlotTracker); -} - +void VPInstruction::dump() const { + VPSlotTracker SlotTracker(getParent()->getPlan()); + print(dbgs(), "", SlotTracker); +} + void VPInstruction::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "EMIT "; + O << "EMIT "; if (hasResult()) { printAsOperand(O, SlotTracker); @@ -573,7 +573,7 @@ void VPlan::execute(VPTransformState *State) { "trip.count.minus.1"); auto VF = State->VF; Value *VTCMO = - VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); + VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) State->set(BackedgeTakenCount, VTCMO, Part); } @@ -778,7 +778,7 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { // Dump the block predicate. const VPValue *Pred = BasicBlock->getPredicate(); if (Pred) { - OS << " +\n" << Indent << " \"BlockPredicate: \""; + OS << " +\n" << Indent << " \"BlockPredicate: \""; if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) { PredI->printAsOperand(OS, SlotTracker); OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) @@ -788,7 +788,7 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { } for (const VPRecipeBase &Recipe : *BasicBlock) { - OS << " +\n" << Indent << "\""; + OS << " +\n" << Indent << "\""; Recipe.print(OS, Indent, SlotTracker); OS << "\\l\""; } @@ -827,7 +827,7 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { dumpEdges(Region); } -void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { +void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { std::string IngredientString; raw_string_ostream RSO(IngredientString); if (auto *Inst = dyn_cast<Instruction>(V)) { @@ -850,45 +850,45 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-CALL "; - - auto *CI = cast<CallInst>(getUnderlyingInstr()); - if (CI->getType()->isVoidTy()) - O << "void "; - else { - printAsOperand(O, SlotTracker); - O << " = "; - } - - O << "call @" << CI->getCalledFunction()->getName() << "("; - printOperands(O, SlotTracker); - O << ")"; + O << "WIDEN-CALL "; + + auto *CI = cast<CallInst>(getUnderlyingInstr()); + if (CI->getType()->isVoidTy()) + O << "void "; + else { + printAsOperand(O, SlotTracker); + O << " = "; + } + + O << "call @" << CI->getCalledFunction()->getName() << "("; + printOperands(O, SlotTracker); + O << ")"; } void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-SELECT "; - printAsOperand(O, SlotTracker); - O << " = select "; - 
getOperand(0)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(1)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(2)->printAsOperand(O, SlotTracker); - O << (InvariantCond ? " (condition is loop invariant)" : ""); + O << "WIDEN-SELECT "; + printAsOperand(O, SlotTracker); + O << " = select "; + getOperand(0)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(1)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(2)->printAsOperand(O, SlotTracker); + O << (InvariantCond ? " (condition is loop invariant)" : ""); } void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN "; - printAsOperand(O, SlotTracker); - O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; - printOperands(O, SlotTracker); + O << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; + printOperands(O, SlotTracker); } void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-INDUCTION"; + O << "WIDEN-INDUCTION"; if (Trunc) { O << "\\l\""; O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; @@ -899,26 +899,26 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-GEP "; + O << "WIDEN-GEP "; O << (IsPtrLoopInvariant ? "Inv" : "Var"); size_t IndicesNumber = IsIndexLoopInvariant.size(); for (size_t I = 0; I < IndicesNumber; ++I) O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; - - O << " "; - printAsOperand(O, SlotTracker); - O << " = getelementptr "; - printOperands(O, SlotTracker); + + O << " "; + printAsOperand(O, SlotTracker); + O << " = getelementptr "; + printOperands(O, SlotTracker); } void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-PHI " << VPlanIngredient(Phi); + O << "WIDEN-PHI " << VPlanIngredient(Phi); } void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "BLEND "; + O << "BLEND "; Phi->printAsOperand(O, false); O << " ="; if (getNumIncomingValues() == 1) { @@ -936,75 +936,75 @@ void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, } } -void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << "REDUCE "; - printAsOperand(O, SlotTracker); - O << " = "; - getChainOp()->printAsOperand(O, SlotTracker); - O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) - << " ("; - getVecOp()->printAsOperand(O, SlotTracker); - if (getCondOp()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} - +void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) + << " ("; + getVecOp()->printAsOperand(O, SlotTracker); + if (getCondOp()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; +} + void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << (IsUniform ? 
"CLONE " : "REPLICATE "); - - if (!getUnderlyingInstr()->getType()->isVoidTy()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; - printOperands(O, SlotTracker); - + O << (IsUniform ? "CLONE " : "REPLICATE "); + + if (!getUnderlyingInstr()->getType()->isVoidTy()) { + printAsOperand(O, SlotTracker); + O << " = "; + } + O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; + printOperands(O, SlotTracker); + if (AlsoPack) O << " (S->V)"; } void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "PHI-PREDICATED-INSTRUCTION "; - printOperands(O, SlotTracker); + O << "PHI-PREDICATED-INSTRUCTION "; + printOperands(O, SlotTracker); } void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN "; - - if (!isStore()) { - getVPValue()->printAsOperand(O, SlotTracker); - O << " = "; + O << "WIDEN "; + + if (!isStore()) { + getVPValue()->printAsOperand(O, SlotTracker); + O << " = "; } - O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; - - printOperands(O, SlotTracker); + O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; + + printOperands(O, SlotTracker); } void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - ElementCount VF = State.VF; - assert(!VF.isScalable() && "the code following assumes non scalables ECs"); - Value *VStart = VF.isScalar() + ElementCount VF = State.VF; + assert(!VF.isScalable() && "the code following assumes non scalables ECs"); + Value *VStart = VF.isScalar() ? CanonicalIV - : Builder.CreateVectorSplat(VF.getKnownMinValue(), - CanonicalIV, "broadcast"); + : Builder.CreateVectorSplat(VF.getKnownMinValue(), + CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { SmallVector<Constant *, 8> Indices; - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - Indices.push_back( - ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + Indices.push_back( + ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); // If VF == 1, there is only one iteration in the loop above, thus the // element pushed back into Indices is ConstantInt::get(STy, Part) - Constant *VStep = - VF.isScalar() ? Indices.back() : ConstantVector::get(Indices); + Constant *VStep = + VF.isScalar() ? Indices.back() : ConstantVector::get(Indices); // Add the consecutive indices to the vector value. 
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); State.set(getVPValue(), CanonicalVectorIV, Part); @@ -1013,7 +1013,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "EMIT "; + O << "EMIT "; getVPValue()->printAsOperand(O, SlotTracker); O << " = WIDEN-CANONICAL-INDUCTION"; } @@ -1021,18 +1021,18 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { - for (unsigned J = 0; J < getNumUsers();) { - VPUser *User = Users[J]; - unsigned NumUsers = getNumUsers(); + for (unsigned J = 0; J < getNumUsers();) { + VPUser *User = Users[J]; + unsigned NumUsers = getNumUsers(); for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) if (User->getOperand(I) == this) User->setOperand(I, New); - // If a user got removed after updating the current user, the next user to - // update will be moved to the current position, so we only need to - // increment the index if the number of users did not change. - if (NumUsers == getNumUsers()) - J++; - } + // If a user got removed after updating the current user, the next user to + // update will be moved to the current position, so we only need to + // increment the index if the number of users did not change. + if (NumUsers == getNumUsers()) + J++; + } } void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { @@ -1050,12 +1050,12 @@ void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { OS << "vp<%" << Tracker.getSlot(this) << ">"; } -void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { - interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { - Op->printAsOperand(O, SlotTracker); - }); -} - +void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { + interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { + Op->printAsOperand(O, SlotTracker); + }); +} + void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, InterleavedAccessInfo &IAI) { @@ -1122,8 +1122,8 @@ void VPSlotTracker::assignSlots(const VPRegionBlock *Region) { void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) { for (const VPRecipeBase &Recipe : *VPBB) { - for (VPValue *Def : Recipe.definedValues()) - assignSlot(Def); + for (VPValue *Def : Recipe.definedValues()) + assignSlot(Def); } } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h index 2cce127cd4..eec59ef006 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h @@ -53,7 +53,7 @@ class DominatorTree; class InnerLoopVectorizer; class LoopInfo; class raw_ostream; -class RecurrenceDescriptor; +class RecurrenceDescriptor; class Value; class VPBasicBlock; class VPRegionBlock; @@ -65,22 +65,22 @@ class VPlanSlp; /// [1, 9) = {1, 2, 4, 8} struct VFRange { // A power of 2. - const ElementCount Start; + const ElementCount Start; // Need not be a power of 2. If End <= Start range is empty. 
- ElementCount End; - - bool isEmpty() const { - return End.getKnownMinValue() <= Start.getKnownMinValue(); - } - - VFRange(const ElementCount &Start, const ElementCount &End) - : Start(Start), End(End) { - assert(Start.isScalable() == End.isScalable() && - "Both Start and End should have the same scalable flag"); - assert(isPowerOf2_32(Start.getKnownMinValue()) && - "Expected Start to be a power of 2"); - } + ElementCount End; + + bool isEmpty() const { + return End.getKnownMinValue() <= Start.getKnownMinValue(); + } + + VFRange(const ElementCount &Start, const ElementCount &End) + : Start(Start), End(End) { + assert(Start.isScalable() == End.isScalable() && + "Both Start and End should have the same scalable flag"); + assert(isPowerOf2_32(Start.getKnownMinValue()) && + "Expected Start to be a power of 2"); + } }; using VPlanPtr = std::unique_ptr<VPlan>; @@ -125,7 +125,7 @@ private: /// The vectorization factor. Each entry in the scalar map contains UF x VF /// scalar values. - ElementCount VF; + ElementCount VF; /// The vector and scalar map storage. We use std::map and not DenseMap /// because insertions to DenseMap invalidate its iterators. @@ -136,7 +136,7 @@ private: public: /// Construct an empty map with the given unroll and vectorization factors. - VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} + VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} /// \return True if the map has any vector entry for \p Key. bool hasAnyVectorValue(Value *Key) const { @@ -161,14 +161,14 @@ public: /// \return True if the map has a scalar entry for \p Key and \p Instance. bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF.getKnownMinValue() && - "Queried Scalar Lane is too large."); - + assert(Instance.Lane < VF.getKnownMinValue() && + "Queried Scalar Lane is too large."); + if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF.getKnownMinValue() && + assert(Entry[Instance.Part].size() == VF.getKnownMinValue() && "ScalarParts has wrong dimensions."); return Entry[Instance.Part][Instance.Lane] != nullptr; } @@ -207,7 +207,7 @@ public: // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF.getKnownMinValue(), nullptr); + Entry[Part].resize(VF.getKnownMinValue(), nullptr); ScalarMapStorage[Key] = Entry; } ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; @@ -246,15 +246,15 @@ struct VPCallback { /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. 
struct VPTransformState { - VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI, - DominatorTree *DT, IRBuilder<> &Builder, - VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, - VPCallback &Callback) - : VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT), - Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} + VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI, + DominatorTree *DT, IRBuilder<> &Builder, + VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, + VPCallback &Callback) + : VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT), + Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. - ElementCount VF; + ElementCount VF; unsigned UF; /// Hold the indices to generate specific scalar instructions. Null indicates @@ -269,9 +269,9 @@ struct VPTransformState { typedef SmallVector<Value *, 2> PerPartValuesTy; DenseMap<VPValue *, PerPartValuesTy> PerPartOutput; - - using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>; - DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars; + + using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>; + DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars; } Data; /// Get the generated Value for a given VPValue and a given Part. Note that @@ -288,23 +288,23 @@ struct VPTransformState { } /// Get the generated Value for a given VPValue and given Part and Lane. - Value *get(VPValue *Def, const VPIteration &Instance); - - bool hasVectorValue(VPValue *Def, unsigned Part) { - auto I = Data.PerPartOutput.find(Def); - return I != Data.PerPartOutput.end() && Part < I->second.size() && - I->second[Part]; - } - - bool hasScalarValue(VPValue *Def, VPIteration Instance) { - auto I = Data.PerPartScalars.find(Def); - if (I == Data.PerPartScalars.end()) - return false; - return Instance.Part < I->second.size() && - Instance.Lane < I->second[Instance.Part].size() && - I->second[Instance.Part][Instance.Lane]; - } - + Value *get(VPValue *Def, const VPIteration &Instance); + + bool hasVectorValue(VPValue *Def, unsigned Part) { + auto I = Data.PerPartOutput.find(Def); + return I != Data.PerPartOutput.end() && Part < I->second.size() && + I->second[Part]; + } + + bool hasScalarValue(VPValue *Def, VPIteration Instance) { + auto I = Data.PerPartScalars.find(Def); + if (I == Data.PerPartScalars.end()) + return false; + return Instance.Part < I->second.size() && + Instance.Lane < I->second[Instance.Part].size() && + I->second[Instance.Part][Instance.Lane]; + } + /// Set the generated Value for a given VPValue and a given Part. 
void set(VPValue *Def, Value *V, unsigned Part) { if (!Data.PerPartOutput.count(Def)) { @@ -313,19 +313,19 @@ struct VPTransformState { } Data.PerPartOutput[Def][Part] = V; } - void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part); - - void set(VPValue *Def, Value *V, const VPIteration &Instance) { - auto Iter = Data.PerPartScalars.insert({Def, {}}); - auto &PerPartVec = Iter.first->second; - while (PerPartVec.size() <= Instance.Part) - PerPartVec.emplace_back(); - auto &Scalars = PerPartVec[Instance.Part]; - while (Scalars.size() <= Instance.Lane) - Scalars.push_back(nullptr); - Scalars[Instance.Lane] = V; - } - + void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part); + + void set(VPValue *Def, Value *V, const VPIteration &Instance) { + auto Iter = Data.PerPartScalars.insert({Def, {}}); + auto &PerPartVec = Iter.first->second; + while (PerPartVec.size() <= Instance.Part) + PerPartVec.emplace_back(); + auto &Scalars = PerPartVec[Instance.Part]; + while (Scalars.size() <= Instance.Lane) + Scalars.push_back(nullptr); + Scalars[Instance.Lane] = V; + } + /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. struct CFGState { @@ -351,9 +351,9 @@ struct VPTransformState { CFGState() = default; } CFG; - /// Hold a pointer to the original loop. - Loop *OrigLoop; - + /// Hold a pointer to the original loop. + Loop *OrigLoop; + /// Hold a pointer to LoopInfo to register new basic blocks in the loop. LoopInfo *LI; @@ -427,14 +427,14 @@ class VPBlockBase { /// Remove \p Predecessor from the predecessors of this block. void removePredecessor(VPBlockBase *Predecessor) { - auto Pos = find(Predecessors, Predecessor); + auto Pos = find(Predecessors, Predecessor); assert(Pos && "Predecessor does not exist"); Predecessors.erase(Pos); } /// Remove \p Successor from the successors of this block. void removeSuccessor(VPBlockBase *Successor) { - auto Pos = find(Successors, Successor); + auto Pos = find(Successors, Successor); assert(Pos && "Successor does not exist"); Successors.erase(Pos); } @@ -627,19 +627,19 @@ public: // hoisted into a VPBlockBase. return true; } - - /// Replace all operands of VPUsers in the block with \p NewValue and also - /// replaces all uses of VPValues defined in the block with NewValue. - virtual void dropAllReferences(VPValue *NewValue) = 0; + + /// Replace all operands of VPUsers in the block with \p NewValue and also + /// replaces all uses of VPValues defined in the block with NewValue. + virtual void dropAllReferences(VPValue *NewValue) = 0; }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR -/// instructions. VPRecipeBase owns the the VPValues it defines through VPDef -/// and is responsible for deleting its defined values. Single-value -/// VPRecipeBases that also inherit from VPValue must make sure to inherit from -/// VPRecipeBase before VPValue. -class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, - public VPDef { +/// instructions. VPRecipeBase owns the the VPValues it defines through VPDef +/// and is responsible for deleting its defined values. Single-value +/// VPRecipeBases that also inherit from VPValue must make sure to inherit from +/// VPRecipeBase before VPValue. 
+class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, + public VPDef { friend VPBasicBlock; friend class VPBlockUtils; @@ -648,7 +648,7 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, VPBasicBlock *Parent = nullptr; public: - VPRecipeBase(const unsigned char SC) : VPDef(SC) {} + VPRecipeBase(const unsigned char SC) : VPDef(SC) {} virtual ~VPRecipeBase() = default; /// \return the VPBasicBlock which this VPRecipe belongs to. @@ -671,11 +671,11 @@ public: /// the VPBasicBlock that MovePos lives in, right after MovePos. void moveAfter(VPRecipeBase *MovePos); - /// Unlink this recipe and insert into BB before I. - /// - /// \pre I is a valid iterator into BB. - void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I); - + /// Unlink this recipe and insert into BB before I. + /// + /// \pre I is a valid iterator into BB. + void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I); + /// This method unlinks 'this' from the containing basic block, but does not /// delete it. void removeFromParent(); @@ -684,46 +684,46 @@ public: /// /// \returns an iterator pointing to the element after the erased one iplist<VPRecipeBase>::iterator eraseFromParent(); - - /// Returns a pointer to a VPUser, if the recipe inherits from VPUser or - /// nullptr otherwise. - VPUser *toVPUser(); - - /// Returns the underlying instruction, if the recipe is a VPValue or nullptr - /// otherwise. - Instruction *getUnderlyingInstr() { - return cast<Instruction>(getVPValue()->getUnderlyingValue()); - } - const Instruction *getUnderlyingInstr() const { - return cast<Instruction>(getVPValue()->getUnderlyingValue()); - } - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - // All VPDefs are also VPRecipeBases. - return true; - } + + /// Returns a pointer to a VPUser, if the recipe inherits from VPUser or + /// nullptr otherwise. + VPUser *toVPUser(); + + /// Returns the underlying instruction, if the recipe is a VPValue or nullptr + /// otherwise. + Instruction *getUnderlyingInstr() { + return cast<Instruction>(getVPValue()->getUnderlyingValue()); + } + const Instruction *getUnderlyingInstr() const { + return cast<Instruction>(getVPValue()->getUnderlyingValue()); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + // All VPDefs are also VPRecipeBases. 
+ return true; + } }; -inline bool VPUser::classof(const VPDef *Def) { - return Def->getVPDefID() == VPRecipeBase::VPInstructionSC || - Def->getVPDefID() == VPRecipeBase::VPWidenSC || - Def->getVPDefID() == VPRecipeBase::VPWidenCallSC || - Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC || - Def->getVPDefID() == VPRecipeBase::VPWidenGEPSC || - Def->getVPDefID() == VPRecipeBase::VPBlendSC || - Def->getVPDefID() == VPRecipeBase::VPInterleaveSC || - Def->getVPDefID() == VPRecipeBase::VPReplicateSC || - Def->getVPDefID() == VPRecipeBase::VPReductionSC || - Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC || - Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; -} - +inline bool VPUser::classof(const VPDef *Def) { + return Def->getVPDefID() == VPRecipeBase::VPInstructionSC || + Def->getVPDefID() == VPRecipeBase::VPWidenSC || + Def->getVPDefID() == VPRecipeBase::VPWidenCallSC || + Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC || + Def->getVPDefID() == VPRecipeBase::VPWidenGEPSC || + Def->getVPDefID() == VPRecipeBase::VPBlendSC || + Def->getVPDefID() == VPRecipeBase::VPInterleaveSC || + Def->getVPDefID() == VPRecipeBase::VPReplicateSC || + Def->getVPDefID() == VPRecipeBase::VPReductionSC || + Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC || + Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; +} + /// This is a concrete Recipe that models a single VPlan-level instruction. /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. -class VPInstruction : public VPRecipeBase, public VPUser, public VPValue { +class VPInstruction : public VPRecipeBase, public VPUser, public VPValue { friend class VPlanSlp; public: @@ -749,22 +749,22 @@ protected: public: VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} - - VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { - for (auto *I : Operands) - addOperand(I->getVPValue()); - } - + : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands), + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} + + VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) + : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}), + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { + for (auto *I : Operands) + addOperand(I->getVPValue()); + } + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVInstructionSC; + return V->getVPValueID() == VPValue::VPVInstructionSC; } VPInstruction *clone() const { @@ -773,8 +773,8 @@ public: } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *R) { - return R->getVPDefID() == VPRecipeBase::VPInstructionSC; + static inline bool classof(const VPDef *R) { + return R->getVPDefID() == VPRecipeBase::VPInstructionSC; } unsigned getOpcode() const { return Opcode; } @@ -784,12 +784,12 @@ public: /// provided. 
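For readers following the VPInstruction API in the hunk above, a minimal usage sketch built only from the constructors and accessors shown there; makeVPAdd is a hypothetical helper, not part of the patch:

  static VPInstruction *makeVPAdd(VPValue *A, VPValue *B) {
    // The initializer_list constructor shown above records A and B as
    // operands, so the new VPInstruction is a VPUser of both and is itself a
    // VPValue that later recipes can use as an operand.
    auto *Add = new VPInstruction(Instruction::Add, {A, B});
    assert(Add->getOpcode() == Instruction::Add && "opcode is stored verbatim");
    return Add;
  }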
void execute(VPTransformState &State) override; - /// Print the VPInstruction to \p O. + /// Print the VPInstruction to \p O. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; - /// Print the VPInstruction to dbgs() (for debugging). - void dump() const; + /// Print the VPInstruction to dbgs() (for debugging). + void dump() const; /// Return true if this instruction may modify memory. bool mayWriteToMemory() const { @@ -823,22 +823,22 @@ public: /// VPWidenRecipe is a recipe for producing a copy of vector type its /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. -class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser { +class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser { public: template <typename IterT> VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) - : VPRecipeBase(VPRecipeBase::VPWidenSC), - VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {} + : VPRecipeBase(VPRecipeBase::VPWidenSC), + VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {} ~VPWidenRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenSC; - } - static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVWidenSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenSC; } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVWidenSC; + } /// Produce widened copies of all Ingredients. void execute(VPTransformState &State) override; @@ -849,19 +849,19 @@ public: }; /// A recipe for widening Call instructions. -class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue { public: template <typename IterT> VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments) - : VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments), - VPValue(VPValue::VPVWidenCallSC, &I, this) {} + : VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments), + VPValue(VPValue::VPVWidenCallSC, &I, this) {} ~VPWidenCallRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenCallSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenCallSC; } /// Produce a widened version of the call instruction. @@ -873,7 +873,7 @@ public: }; /// A recipe for widening select instructions. -class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue { /// Is the condition of the select loop invariant? 
bool InvariantCond; @@ -882,15 +882,15 @@ public: template <typename IterT> VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands, bool InvariantCond) - : VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands), - VPValue(VPValue::VPVWidenSelectSC, &I, this), + : VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands), + VPValue(VPValue::VPVWidenSelectSC, &I, this), InvariantCond(InvariantCond) {} ~VPWidenSelectRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenSelectSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenSelectSC; } /// Produce a widened version of the select instruction. @@ -902,24 +902,24 @@ public: }; /// A recipe for handling GEP instructions. -class VPWidenGEPRecipe : public VPRecipeBase, - public VPUser, - public VPValue { +class VPWidenGEPRecipe : public VPRecipeBase, + public VPUser, + public VPValue { bool IsPtrLoopInvariant; SmallBitVector IsIndexLoopInvariant; public: template <typename IterT> - VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands) - : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), - VPValue(VPWidenGEPSC, GEP, this), - IsIndexLoopInvariant(GEP->getNumIndices(), false) {} - - template <typename IterT> + VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands) + : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), + VPValue(VPWidenGEPSC, GEP, this), + IsIndexLoopInvariant(GEP->getNumIndices(), false) {} + + template <typename IterT> VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands, Loop *OrigLoop) - : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), - VPValue(VPValue::VPVWidenGEPSC, GEP, this), + : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), + VPValue(VPValue::VPVWidenGEPSC, GEP, this), IsIndexLoopInvariant(GEP->getNumIndices(), false) { IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); for (auto Index : enumerate(GEP->indices())) @@ -929,8 +929,8 @@ public: ~VPWidenGEPRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenGEPSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenGEPSC; } /// Generate the gep nodes. @@ -943,25 +943,25 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. -class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser { PHINode *IV; TruncInst *Trunc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, - TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV), - Trunc(Trunc) { - if (Trunc) - new VPValue(Trunc, this); - else - new VPValue(IV, this); - } + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + TruncInst *Trunc = nullptr) + : VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV), + Trunc(Trunc) { + if (Trunc) + new VPValue(Trunc, this); + else + new VPValue(IV, this); + } ~VPWidenIntOrFpInductionRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. 
- static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC; } /// Generate the vectorized and scalarized versions of the phi node as @@ -971,38 +971,38 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; - - /// Returns the start value of the induction. - VPValue *getStartValue() { return getOperand(0); } + + /// Returns the start value of the induction. + VPValue *getStartValue() { return getOperand(0); } }; /// A recipe for handling all phi nodes except for integer and FP inductions. -/// For reduction PHIs, RdxDesc must point to the corresponding recurrence -/// descriptor and the start value is the first operand of the recipe. -class VPWidenPHIRecipe : public VPRecipeBase, public VPUser { +/// For reduction PHIs, RdxDesc must point to the corresponding recurrence +/// descriptor and the start value is the first operand of the recipe. +class VPWidenPHIRecipe : public VPRecipeBase, public VPUser { PHINode *Phi; - /// Descriptor for a reduction PHI. - RecurrenceDescriptor *RdxDesc = nullptr; - + /// Descriptor for a reduction PHI. + RecurrenceDescriptor *RdxDesc = nullptr; + public: - /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p - /// RdxDesc. - VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start) - : VPWidenPHIRecipe(Phi) { - this->RdxDesc = &RdxDesc; - addOperand(&Start); - } - - /// Create a VPWidenPHIRecipe for \p Phi - VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) { - new VPValue(Phi, this); - } + /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p + /// RdxDesc. + VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start) + : VPWidenPHIRecipe(Phi) { + this->RdxDesc = &RdxDesc; + addOperand(&Start); + } + + /// Create a VPWidenPHIRecipe for \p Phi + VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) { + new VPValue(Phi, this); + } ~VPWidenPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenPHISC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenPHISC; } /// Generate the phi/select nodes. @@ -1011,25 +1011,25 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; - - /// Returns the start value of the phi, if it is a reduction. - VPValue *getStartValue() { - return getNumOperands() == 0 ? nullptr : getOperand(0); - } + + /// Returns the start value of the phi, if it is a reduction. + VPValue *getStartValue() { + return getNumOperands() == 0 ? nullptr : getOperand(0); + } }; /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. -class VPBlendRecipe : public VPRecipeBase, public VPUser { +class VPBlendRecipe : public VPRecipeBase, public VPUser { PHINode *Phi; -public: +public: /// The blend operation is a User of the incoming values and of their /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value /// might be incoming with a full mask for which there is no VPValue. 
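The [I0, M0, I1, M1, ...] ordering documented here pairs each incoming value with the mask guarding it; a small sketch of walking that layout with the accessors defined just below (inspectBlend is a hypothetical helper, assuming a blend built as described above):

  static void inspectBlend(const VPBlendRecipe &Blend) {
    for (unsigned I = 0, E = Blend.getNumIncomingValues(); I != E; ++I) {
      VPValue *Incoming = Blend.getIncomingValue(I); // operand 2*I
      // A lone incoming value has a full mask with no VPValue operand.
      VPValue *Mask =
          Blend.getNumOperands() == 1 ? nullptr : Blend.getMask(I); // 2*I + 1
      (void)Incoming;
      (void)Mask;
    }
  }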
VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands) - : VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) { - new VPValue(Phi, this); + : VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) { + new VPValue(Phi, this); assert(Operands.size() > 0 && ((Operands.size() == 1) || (Operands.size() % 2 == 0)) && "Expected either a single incoming value or a positive even number " @@ -1037,19 +1037,19 @@ public: } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPBlendSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPBlendSC; } /// Return the number of incoming values, taking into account that a single /// incoming value has no mask. - unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; } + unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; } /// Return incoming value number \p Idx. - VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); } + VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); } /// Return mask number \p Idx. - VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); } + VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); } /// Generate the phi/select nodes. void execute(VPTransformState &State) override; @@ -1060,60 +1060,60 @@ public: }; /// VPInterleaveRecipe is a recipe for transforming an interleave group of load -/// or stores into one wide load/store and shuffles. The first operand of a -/// VPInterleave recipe is the address, followed by the stored values, followed -/// by an optional mask. -class VPInterleaveRecipe : public VPRecipeBase, public VPUser { +/// or stores into one wide load/store and shuffles. The first operand of a +/// VPInterleave recipe is the address, followed by the stored values, followed +/// by an optional mask. +class VPInterleaveRecipe : public VPRecipeBase, public VPUser { const InterleaveGroup<Instruction> *IG; - bool HasMask = false; - + bool HasMask = false; + public: VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, - ArrayRef<VPValue *> StoredValues, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) { - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *I = IG->getMember(i)) { - if (I->getType()->isVoidTy()) - continue; - new VPValue(I, this); - } - - for (auto *SV : StoredValues) - addOperand(SV); - if (Mask) { - HasMask = true; - addOperand(Mask); - } + ArrayRef<VPValue *> StoredValues, VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) { + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *I = IG->getMember(i)) { + if (I->getType()->isVoidTy()) + continue; + new VPValue(I, this); + } + + for (auto *SV : StoredValues) + addOperand(SV); + if (Mask) { + HasMask = true; + addOperand(Mask); + } } ~VPInterleaveRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPInterleaveSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPInterleaveSC; } /// Return the address accessed by this recipe. VPValue *getAddr() const { - return getOperand(0); // Address is the 1st, mandatory operand. + return getOperand(0); // Address is the 1st, mandatory operand. } /// Return the mask used by this recipe. 
Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { // Mask is optional and therefore the last, currently 2nd operand. - return HasMask ? getOperand(getNumOperands() - 1) : nullptr; - } - - /// Return the VPValues stored by this interleave group. If it is a load - /// interleave group, return an empty ArrayRef. - ArrayRef<VPValue *> getStoredValues() const { - // The first operand is the address, followed by the stored values, followed - // by an optional mask. - return ArrayRef<VPValue *>(op_begin(), getNumOperands()) - .slice(1, getNumOperands() - (HasMask ? 2 : 1)); - } - + return HasMask ? getOperand(getNumOperands() - 1) : nullptr; + } + + /// Return the VPValues stored by this interleave group. If it is a load + /// interleave group, return an empty ArrayRef. + ArrayRef<VPValue *> getStoredValues() const { + // The first operand is the address, followed by the stored values, followed + // by an optional mask. + return ArrayRef<VPValue *>(op_begin(), getNumOperands()) + .slice(1, getNumOperands() - (HasMask ? 2 : 1)); + } + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; @@ -1124,61 +1124,61 @@ public: const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; } }; -/// A recipe to represent inloop reduction operations, performing a reduction on -/// a vector operand into a scalar value, and adding the result to a chain. -/// The Operands are {ChainOp, VecOp, [Condition]}. -class VPReductionRecipe : public VPRecipeBase, public VPUser, public VPValue { - /// The recurrence decriptor for the reduction in question. - RecurrenceDescriptor *RdxDesc; - /// Fast math flags to use for the resulting reduction operation. - bool NoNaN; - /// Pointer to the TTI, needed to create the target reduction - const TargetTransformInfo *TTI; - -public: - VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, - VPValue *VecOp, VPValue *CondOp, bool NoNaN, - const TargetTransformInfo *TTI) - : VPRecipeBase(VPRecipeBase::VPReductionSC), VPUser({ChainOp, VecOp}), - VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), NoNaN(NoNaN), - TTI(TTI) { - if (CondOp) - addOperand(CondOp); - } - - ~VPReductionRecipe() override = default; - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVReductionSC; - } - - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPReductionSC; - } - - /// Generate the reduction in the loop - void execute(VPTransformState &State) override; - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; - - /// The VPValue of the scalar Chain being accumulated. - VPValue *getChainOp() const { return getOperand(0); } - /// The VPValue of the vector value to be reduced. - VPValue *getVecOp() const { return getOperand(1); } - /// The VPValue of the condition for the block. - VPValue *getCondOp() const { - return getNumOperands() > 2 ? getOperand(2) : nullptr; - } -}; - +/// A recipe to represent inloop reduction operations, performing a reduction on +/// a vector operand into a scalar value, and adding the result to a chain. +/// The Operands are {ChainOp, VecOp, [Condition]}. +class VPReductionRecipe : public VPRecipeBase, public VPUser, public VPValue { + /// The recurrence decriptor for the reduction in question. 
+ RecurrenceDescriptor *RdxDesc; + /// Fast math flags to use for the resulting reduction operation. + bool NoNaN; + /// Pointer to the TTI, needed to create the target reduction + const TargetTransformInfo *TTI; + +public: + VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, + VPValue *VecOp, VPValue *CondOp, bool NoNaN, + const TargetTransformInfo *TTI) + : VPRecipeBase(VPRecipeBase::VPReductionSC), VPUser({ChainOp, VecOp}), + VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), NoNaN(NoNaN), + TTI(TTI) { + if (CondOp) + addOperand(CondOp); + } + + ~VPReductionRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVReductionSC; + } + + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPReductionSC; + } + + /// Generate the reduction in the loop + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + + /// The VPValue of the scalar Chain being accumulated. + VPValue *getChainOp() const { return getOperand(0); } + /// The VPValue of the vector value to be reduced. + VPValue *getVecOp() const { return getOperand(1); } + /// The VPValue of the condition for the block. + VPValue *getCondOp() const { + return getNumOperands() > 2 ? getOperand(2) : nullptr; + } +}; + /// VPReplicateRecipe replicates a given instruction producing multiple scalar /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be /// uniform only one copy, per lane zero, will be generated. -class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue { /// Indicator if only a single replica per lane is needed. bool IsUniform; @@ -1192,9 +1192,9 @@ public: template <typename IterT> VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands, bool IsUniform, bool IsPredicated = false) - : VPRecipeBase(VPReplicateSC), VPUser(Operands), - VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform), - IsPredicated(IsPredicated) { + : VPRecipeBase(VPReplicateSC), VPUser(Operands), + VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform), + IsPredicated(IsPredicated) { // Retain the previous behavior of predicateInstructions(), where an // insert-element of a predicated instruction got hoisted into the // predicated basic block iff it was its only user. This is achieved by @@ -1206,14 +1206,14 @@ public: ~VPReplicateRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPReplicateSC; - } - - static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVReplicateSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPReplicateSC; } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVReplicateSC; + } + /// Generate replicas of the desired Ingredient. Replicas will be generated /// for all parts and lanes unless a specific part and lane are specified in /// the \p State. @@ -1224,21 +1224,21 @@ public: /// Print the recipe. 
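To make the {ChainOp, VecOp, [Condition]} operand layout of VPReductionRecipe concrete, a short sketch using only the accessors shown a few lines above; isConditionalReduction is a hypothetical helper, not part of the patch:

  static bool isConditionalReduction(const VPReductionRecipe &R) {
    assert(R.getChainOp() == R.getOperand(0) && "chain is always operand 0");
    assert(R.getVecOp() == R.getOperand(1) && "vector operand is operand 1");
    // The condition, when present, is appended as a third operand in the
    // constructor above, so getCondOp() is non-null only for masked reductions.
    return R.getCondOp() != nullptr;
  }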
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; - - bool isUniform() const { return IsUniform; } + + bool isUniform() const { return IsUniform; } }; /// A recipe for generating conditional branches on the bits of a mask. -class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser { +class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser { public: VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) { if (BlockInMask) // nullptr means all-one mask. - addOperand(BlockInMask); + addOperand(BlockInMask); } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC; } /// Generate the extraction of the appropriate bit from the block mask and the @@ -1250,7 +1250,7 @@ public: VPSlotTracker &SlotTracker) const override { O << " +\n" << Indent << "\"BRANCH-ON-MASK "; if (VPValue *Mask = getMask()) - Mask->printAsOperand(O, SlotTracker); + Mask->printAsOperand(O, SlotTracker); else O << " All-One"; O << "\\l\""; @@ -1259,9 +1259,9 @@ public: /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - assert(getNumOperands() <= 1 && "should have either 0 or 1 operands"); + assert(getNumOperands() <= 1 && "should have either 0 or 1 operands"); // Mask is optional. - return getNumOperands() == 1 ? getOperand(0) : nullptr; + return getNumOperands() == 1 ? getOperand(0) : nullptr; } }; @@ -1270,20 +1270,20 @@ public: /// order to merge values that are set under such a branch and feed their uses. /// The phi nodes can be scalar or vector depending on the users of the value. /// This recipe works in concert with VPBranchOnMaskRecipe. -class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser { +class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser { public: /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi /// nodes after merging back from a Branch-on-Mask. - VPPredInstPHIRecipe(VPValue *PredV) - : VPRecipeBase(VPPredInstPHISC), VPUser(PredV) { - new VPValue(PredV->getUnderlyingValue(), this); - } + VPPredInstPHIRecipe(VPValue *PredV) + : VPRecipeBase(VPPredInstPHISC), VPUser(PredV) { + new VPValue(PredV->getUnderlyingValue(), this); + } ~VPPredInstPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPPredInstPHISC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPPredInstPHISC; } /// Generates phi nodes for live-outs as needed to retain SSA form. @@ -1300,59 +1300,59 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase, - public VPUser { - Instruction &Ingredient; +class VPWidenMemoryInstructionRecipe : public VPRecipeBase, + public VPUser { + Instruction &Ingredient; void setMask(VPValue *Mask) { if (!Mask) return; - addOperand(Mask); + addOperand(Mask); } bool isMasked() const { - return isStore() ? getNumOperands() == 3 : getNumOperands() == 2; + return isStore() ? 
getNumOperands() == 3 : getNumOperands() == 2; } public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}), - Ingredient(Load) { - new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); + : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}), + Ingredient(Load) { + new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); setMask(Mask); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}), - Ingredient(Store) { + : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}), + Ingredient(Store) { setMask(Mask); } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; } /// Return the address accessed by this recipe. VPValue *getAddr() const { - return getOperand(0); // Address is the 1st, mandatory operand. + return getOperand(0); // Address is the 1st, mandatory operand. } /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { // Mask is optional and therefore the last operand. - return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; + return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; } - /// Returns true if this recipe is a store. - bool isStore() const { return isa<StoreInst>(Ingredient); } - + /// Returns true if this recipe is a store. + bool isStore() const { return isa<StoreInst>(Ingredient); } + /// Return the address accessed by this recipe. VPValue *getStoredValue() const { - assert(isStore() && "Stored value only available for store instructions"); - return getOperand(1); // Stored value is the 2nd, mandatory operand. + assert(isStore() && "Stored value only available for store instructions"); + return getOperand(1); // Stored value is the 2nd, mandatory operand. } /// Generate the wide load/store. @@ -1365,16 +1365,16 @@ public: /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase { -public: - VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) { - new VPValue(nullptr, this); - } +public: + VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) { + new VPValue(nullptr, this); + } ~VPWidenCanonicalIVRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; } /// Generate a canonical vector induction variable of the vector loop, with @@ -1461,11 +1461,11 @@ public: /// this VPBasicBlock, thereby "executing" the VPlan. void execute(struct VPTransformState *State) override; - /// Return the position of the first non-phi node recipe in the block. - iterator getFirstNonPhi(); - - void dropAllReferences(VPValue *NewValue) override; - + /// Return the position of the first non-phi node recipe in the block. 
+ iterator getFirstNonPhi(); + + void dropAllReferences(VPValue *NewValue) override; + private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. @@ -1506,11 +1506,11 @@ public: IsReplicator(IsReplicator) {} ~VPRegionBlock() override { - if (Entry) { - VPValue DummyValue; - Entry->dropAllReferences(&DummyValue); + if (Entry) { + VPValue DummyValue; + Entry->dropAllReferences(&DummyValue); deleteCFG(Entry); - } + } } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1555,8 +1555,8 @@ public: /// The method which generates the output IR instructions that correspond to /// this VPRegionBlock, thereby "executing" the VPlan. void execute(struct VPTransformState *State) override; - - void dropAllReferences(VPValue *NewValue) override; + + void dropAllReferences(VPValue *NewValue) override; }; //===----------------------------------------------------------------------===// @@ -1694,7 +1694,7 @@ class VPlan { VPBlockBase *Entry; /// Holds the VFs applicable to this VPlan. - SmallSetVector<ElementCount, 2> VFs; + SmallSetVector<ElementCount, 2> VFs; /// Holds the name of the VPlan, for printing. std::string Name; @@ -1714,10 +1714,10 @@ class VPlan { /// VPlan. Value2VPValueTy Value2VPValue; - /// Contains all VPValues that been allocated by addVPValue directly and need - /// to be free when the plan's destructor is called. - SmallVector<VPValue *, 16> VPValuesToFree; - + /// Contains all VPValues that been allocated by addVPValue directly and need + /// to be free when the plan's destructor is called. + SmallVector<VPValue *, 16> VPValuesToFree; + /// Holds the VPLoopInfo analysis for this VPlan. VPLoopInfo VPLInfo; @@ -1731,15 +1731,15 @@ public: } ~VPlan() { - if (Entry) { - VPValue DummyValue; - for (VPBlockBase *Block : depth_first(Entry)) - Block->dropAllReferences(&DummyValue); - + if (Entry) { + VPValue DummyValue; + for (VPBlockBase *Block : depth_first(Entry)) + Block->dropAllReferences(&DummyValue); + VPBlockBase::deleteCFG(Entry); - } - for (VPValue *VPV : VPValuesToFree) - delete VPV; + } + for (VPValue *VPV : VPValuesToFree) + delete VPV; if (BackedgeTakenCount) delete BackedgeTakenCount; for (VPValue *Def : VPExternalDefs) @@ -1767,9 +1767,9 @@ public: return BackedgeTakenCount; } - void addVF(ElementCount VF) { VFs.insert(VF); } + void addVF(ElementCount VF) { VFs.insert(VF); } - bool hasVF(ElementCount VF) { return VFs.count(VF); } + bool hasVF(ElementCount VF) { return VFs.count(VF); } const std::string &getName() const { return Name; } @@ -1789,17 +1789,17 @@ public: void addVPValue(Value *V) { assert(V && "Trying to add a null Value to VPlan"); assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); - VPValue *VPV = new VPValue(V); - Value2VPValue[V] = VPV; - VPValuesToFree.push_back(VPV); - } - - void addVPValue(Value *V, VPValue *VPV) { - assert(V && "Trying to add a null Value to VPlan"); - assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); - Value2VPValue[V] = VPV; + VPValue *VPV = new VPValue(V); + Value2VPValue[V] = VPV; + VPValuesToFree.push_back(VPV); } + void addVPValue(Value *V, VPValue *VPV) { + assert(V && "Trying to add a null Value to VPlan"); + assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); + Value2VPValue[V] = VPV; + } + VPValue *getVPValue(Value *V) { assert(V && "Trying to get the VPValue of a null Value"); assert(Value2VPValue.count(V) && "Value does not exist in VPlan"); @@ -1813,8 +1813,8 @@ 
public: return getVPValue(V); } - void removeVPValueFor(Value *V) { Value2VPValue.erase(V); } - + void removeVPValueFor(Value *V) { Value2VPValue.erase(V); } + /// Return the VPLoopInfo analysis for this VPlan. VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } @@ -1892,13 +1892,13 @@ private: void dump(); - static void printAsIngredient(raw_ostream &O, const Value *V); + static void printAsIngredient(raw_ostream &O, const Value *V); }; struct VPlanIngredient { - const Value *V; + const Value *V; - VPlanIngredient(const Value *V) : V(V) {} + VPlanIngredient(const Value *V) : V(V) {} }; inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { @@ -2048,7 +2048,7 @@ public: /// \returns nullptr if doesn't have such group. InterleaveGroup<VPInstruction> * getInterleaveGroup(VPInstruction *Instr) const { - return InterleaveGroupMap.lookup(Instr); + return InterleaveGroupMap.lookup(Instr); } }; @@ -2132,7 +2132,7 @@ class VPlanSlp { public: VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} - ~VPlanSlp() = default; + ~VPlanSlp() = default; /// Tries to build an SLP tree rooted at \p Operands and returns a /// VPInstruction combining \p Operands, if they can be combined. diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp index ac3b3505dc..7da23508b7 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -191,7 +191,7 @@ void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) { // Generate edge predicates and append them to the block predicate. RPO is // necessary since the predecessor blocks' block predicate needs to be set // before the current block's block predicate can be computed. - for (VPBlockBase *Block : RPOT) { + for (VPBlockBase *Block : RPOT) { // TODO: Handle nested regions once we start generating the same. assert(!isa<VPRegionBlock>(Block) && "Nested region not expected"); createOrPropagatePredicates(Block, Region); @@ -208,7 +208,7 @@ void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) { ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry()); VPBlockBase *PrevBlock = nullptr; - for (VPBlockBase *CurrBlock : RPOT) { + for (VPBlockBase *CurrBlock : RPOT) { // TODO: Handle nested regions once we start generating the same. assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected"); diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp index 6f21bf4429..5b8145ff62 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -124,7 +124,7 @@ bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const { for (auto &I : *Parent) { auto *VPI = cast<VPInstruction>(&I); if (VPI->getOpcode() == Instruction::Load && - llvm::is_contained(Operands, VPI)) + llvm::is_contained(Operands, VPI)) LoadsSeen++; if (LoadsSeen == Operands.size()) @@ -161,8 +161,8 @@ static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values, unsigned OperandIndex) { SmallVector<VPValue *, 4> Operands; for (VPValue *V : Values) { - // Currently we only support VPInstructions. - auto *U = cast<VPInstruction>(V); + // Currently we only support VPInstructions. 
+ auto *U = cast<VPInstruction>(V); Operands.push_back(U->getOperand(OperandIndex)); } return Operands; @@ -223,20 +223,20 @@ static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, /// Traverses and compares operands of V1 and V2 to MaxLevel. static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, VPInterleavedAccessInfo &IAI) { - auto *I1 = dyn_cast<VPInstruction>(V1); - auto *I2 = dyn_cast<VPInstruction>(V2); - // Currently we only support VPInstructions. - if (!I1 || !I2) + auto *I1 = dyn_cast<VPInstruction>(V1); + auto *I2 = dyn_cast<VPInstruction>(V2); + // Currently we only support VPInstructions. + if (!I1 || !I2) return 0; if (MaxLevel == 0) - return (unsigned)areConsecutiveOrMatch(I1, I2, IAI); + return (unsigned)areConsecutiveOrMatch(I1, I2, IAI); unsigned Score = 0; - for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I) - for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J) - Score += - getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI); + for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I) + for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J) + Score += + getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI); return Score; } @@ -466,8 +466,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { auto *VPI = new VPInstruction(Opcode, CombinedOperands); VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); - LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " - << *cast<VPInstruction>(Values[0]) << "\n"); + LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " + << *cast<VPInstruction>(Values[0]) << "\n"); addCombined(Values, VPI); return VPI; } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1a54603faf..6773dc5a61 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -48,8 +48,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPInstruction *VPInst = cast<VPInstruction>(Ingredient); Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue()); if (DeadInstructions.count(Inst)) { - VPValue DummyValue; - VPInst->replaceAllUsesWith(&DummyValue); + VPValue DummyValue; + VPInst->replaceAllUsesWith(&DummyValue); Ingredient->eraseFromParent(); continue; } @@ -68,8 +68,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( InductionDescriptor II = Inductions.lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || II.getKind() == InductionDescriptor::IK_FpInduction) { - VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start); + VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start); } else NewRecipe = new VPWidenPHIRecipe(Phi); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { @@ -80,11 +80,11 @@ void VPlanTransforms::VPInstructionsToVPRecipes( new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands())); NewRecipe->insertBefore(Ingredient); - if (NewRecipe->getNumDefinedValues() == 1) - VPInst->replaceAllUsesWith(NewRecipe->getVPValue()); - else - assert(NewRecipe->getNumDefinedValues() == 0 && - "Only recpies with zero or one defined values expected"); + if (NewRecipe->getNumDefinedValues() == 1) + VPInst->replaceAllUsesWith(NewRecipe->getVPValue()); + else 
+ assert(NewRecipe->getNumDefinedValues() == 0 && + "Only recpies with zero or one defined values expected"); Ingredient->eraseFromParent(); } } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h index ed572ca366..b43c8398b6 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h @@ -10,9 +10,9 @@ /// This file contains the declarations of the entities induced by Vectorization /// Plans, e.g. the instructions the VPlan intends to generate if executed. /// VPlan models the following entities: -/// VPValue VPUser VPDef -/// | | -/// VPInstruction +/// VPValue VPUser VPDef +/// | | +/// VPInstruction /// These are documented in docs/VectorizationPlan.rst. /// //===----------------------------------------------------------------------===// @@ -21,9 +21,9 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/TinyPtrVector.h" #include "llvm/ADT/iterator_range.h" namespace llvm { @@ -31,11 +31,11 @@ namespace llvm { // Forward declarations. class raw_ostream; class Value; -class VPDef; +class VPDef; class VPSlotTracker; class VPUser; -class VPRecipeBase; -class VPWidenMemoryInstructionRecipe; +class VPRecipeBase; +class VPWidenMemoryInstructionRecipe; // This is the base class of the VPlan Def/Use graph, used for modeling the data // flow into, within and out of the VPlan. VPValues can stand for live-ins @@ -43,14 +43,14 @@ class VPWidenMemoryInstructionRecipe; // and live-outs which the VPlan will need to fix accordingly. class VPValue { friend class VPBuilder; - friend class VPDef; - friend class VPInstruction; + friend class VPDef; + friend class VPInstruction; friend struct VPlanTransforms; friend class VPBasicBlock; friend class VPInterleavedAccessInfo; friend class VPSlotTracker; - friend class VPRecipeBase; - friend class VPWidenMemoryInstructionRecipe; + friend class VPRecipeBase; + friend class VPWidenMemoryInstructionRecipe; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -60,12 +60,12 @@ protected: // Hold the underlying Value, if any, attached to this VPValue. Value *UnderlyingVal; - /// Pointer to the VPDef that defines this VPValue. If it is nullptr, the - /// VPValue is not defined by any recipe modeled in VPlan. - VPDef *Def; - - VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr); + /// Pointer to the VPDef that defines this VPValue. If it is nullptr, the + /// VPValue is not defined by any recipe modeled in VPlan. + VPDef *Def; + VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr); + // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to // the front-end and back-end of VPlan so that the middle-end is as // independent as possible of the underlying IR. We grant access to the @@ -80,33 +80,33 @@ protected: } public: - /// Return the underlying Value attached to this VPValue. - Value *getUnderlyingValue() { return UnderlyingVal; } - const Value *getUnderlyingValue() const { return UnderlyingVal; } - + /// Return the underlying Value attached to this VPValue. 
+ Value *getUnderlyingValue() { return UnderlyingVal; } + const Value *getUnderlyingValue() const { return UnderlyingVal; } + /// An enumeration for keeping track of the concrete subclass of VPValue that /// are actually instantiated. Values of this enumeration are kept in the /// SubclassID field of the VPValue objects. They are used for concrete /// type identification. - enum { - VPValueSC, - VPVInstructionSC, - VPVMemoryInstructionSC, - VPVReductionSC, - VPVReplicateSC, - VPVWidenSC, - VPVWidenCallSC, - VPVWidenGEPSC, - VPVWidenSelectSC, - }; - - VPValue(Value *UV = nullptr, VPDef *Def = nullptr) - : VPValue(VPValueSC, UV, Def) {} + enum { + VPValueSC, + VPVInstructionSC, + VPVMemoryInstructionSC, + VPVReductionSC, + VPVReplicateSC, + VPVWidenSC, + VPVWidenCallSC, + VPVWidenGEPSC, + VPVWidenSelectSC, + }; + + VPValue(Value *UV = nullptr, VPDef *Def = nullptr) + : VPValue(VPValueSC, UV, Def) {} VPValue(const VPValue &) = delete; VPValue &operator=(const VPValue &) = delete; - virtual ~VPValue(); - + virtual ~VPValue(); + /// \return an ID for the concrete type of this object. /// This is used to implement the classof checks. This should not be used /// for any other purpose, as the values may change as LLVM evolves. @@ -115,28 +115,28 @@ public: void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const; void print(raw_ostream &OS, VPSlotTracker &Tracker) const; - /// Dump the value to stderr (for debugging). - void dump() const; - + /// Dump the value to stderr (for debugging). + void dump() const; + unsigned getNumUsers() const { return Users.size(); } void addUser(VPUser &User) { Users.push_back(&User); } - /// Remove a single \p User from the list of users. - void removeUser(VPUser &User) { - bool Found = false; - // The same user can be added multiple times, e.g. because the same VPValue - // is used twice by the same VPUser. Remove a single one. - erase_if(Users, [&User, &Found](VPUser *Other) { - if (Found) - return false; - if (Other == &User) { - Found = true; - return true; - } - return false; - }); - } - + /// Remove a single \p User from the list of users. + void removeUser(VPUser &User) { + bool Found = false; + // The same user can be added multiple times, e.g. because the same VPValue + // is used twice by the same VPUser. Remove a single one. + erase_if(Users, [&User, &Found](VPUser *Other) { + if (Found) + return false; + if (Other == &User) { + Found = true; + return true; + } + return false; + }); + } + typedef SmallVectorImpl<VPUser *>::iterator user_iterator; typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator; typedef iterator_range<user_iterator> user_range; @@ -164,17 +164,17 @@ public: } void replaceAllUsesWith(VPValue *New); - - VPDef *getDef() { return Def; } - - /// Returns the underlying IR value, if this VPValue is defined outside the - /// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef - /// inside a VPlan. - Value *getLiveInIRValue() { - assert(!getDef() && - "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); - return getUnderlyingValue(); - } + + VPDef *getDef() { return Def; } + + /// Returns the underlying IR value, if this VPValue is defined outside the + /// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef + /// inside a VPlan. 
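A brief note on the Def pointer introduced here: a VPValue with no defining VPDef is a live-in flowing in from the input IR, which is the only case in which getLiveInIRValue() may be queried. A one-line sketch (isLiveIn is a hypothetical helper):

  static bool isLiveIn(VPValue &V) { return V.getDef() == nullptr; }

Note also that removeUser(), shown above, deliberately erases a single occurrence, since one VPUser can reach the same VPValue through several operands.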
+ Value *getLiveInIRValue() { + assert(!getDef() && + "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); + return getUnderlyingValue(); + } }; typedef DenseMap<Value *, VPValue *> Value2VPValueTy; @@ -184,32 +184,32 @@ raw_ostream &operator<<(raw_ostream &OS, const VPValue &V); /// This class augments VPValue with operands which provide the inverse def-use /// edges from VPValue's users to their defs. -class VPUser { +class VPUser { SmallVector<VPValue *, 2> Operands; protected: - /// Print the operands to \p O. - void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const; - -public: - VPUser() {} - VPUser(ArrayRef<VPValue *> Operands) { + /// Print the operands to \p O. + void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const; + +public: + VPUser() {} + VPUser(ArrayRef<VPValue *> Operands) { for (VPValue *Operand : Operands) addOperand(Operand); } VPUser(std::initializer_list<VPValue *> Operands) : VPUser(ArrayRef<VPValue *>(Operands)) {} - template <typename IterT> VPUser(iterator_range<IterT> Operands) { + template <typename IterT> VPUser(iterator_range<IterT> Operands) { for (VPValue *Operand : Operands) addOperand(Operand); } VPUser(const VPUser &) = delete; VPUser &operator=(const VPUser &) = delete; - virtual ~VPUser() { - for (VPValue *Op : operands()) - Op->removeUser(*this); + virtual ~VPUser() { + for (VPValue *Op : operands()) + Op->removeUser(*this); } void addOperand(VPValue *Operand) { @@ -223,11 +223,11 @@ public: return Operands[N]; } - void setOperand(unsigned I, VPValue *New) { - Operands[I]->removeUser(*this); - Operands[I] = New; - New->addUser(*this); - } + void setOperand(unsigned I, VPValue *New) { + Operands[I]->removeUser(*this); + Operands[I] = New; + New->addUser(*this); + } typedef SmallVectorImpl<VPValue *>::iterator operand_iterator; typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator; @@ -242,110 +242,110 @@ public: const_operand_range operands() const { return const_operand_range(op_begin(), op_end()); } - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *Recipe); + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *Recipe); }; - -/// This class augments a recipe with a set of VPValues defined by the recipe. -/// It allows recipes to define zero, one or multiple VPValues. A VPDef owns -/// the VPValues it defines and is responsible for deleting its defined values. -/// Single-value VPDefs that also inherit from VPValue must make sure to inherit -/// from VPDef before VPValue. -class VPDef { - friend class VPValue; - - /// Subclass identifier (for isa/dyn_cast). - const unsigned char SubclassID; - - /// The VPValues defined by this VPDef. - TinyPtrVector<VPValue *> DefinedValues; - - /// Add \p V as a defined value by this VPDef. - void addDefinedValue(VPValue *V) { - assert(V->getDef() == this && - "can only add VPValue already linked with this VPDef"); - DefinedValues.push_back(V); - } - - /// Remove \p V from the values defined by this VPDef. \p V must be a defined - /// value of this VPDef. 
- void removeDefinedValue(VPValue *V) { - assert(V->getDef() == this && - "can only remove VPValue linked with this VPDef"); - assert(is_contained(DefinedValues, V) && - "VPValue to remove must be in DefinedValues"); - erase_value(DefinedValues, V); - V->Def = nullptr; - } - -public: - /// An enumeration for keeping track of the concrete subclass of VPRecipeBase - /// that is actually instantiated. Values of this enumeration are kept in the - /// SubclassID field of the VPRecipeBase objects. They are used for concrete - /// type identification. - using VPRecipeTy = enum { - VPBlendSC, - VPBranchOnMaskSC, - VPInstructionSC, - VPInterleaveSC, - VPPredInstPHISC, - VPReductionSC, - VPReplicateSC, - VPWidenCallSC, - VPWidenCanonicalIVSC, - VPWidenGEPSC, - VPWidenIntOrFpInductionSC, - VPWidenMemoryInstructionSC, - VPWidenPHISC, - VPWidenSC, - VPWidenSelectSC - }; - - VPDef(const unsigned char SC) : SubclassID(SC) {} - - virtual ~VPDef() { - for (VPValue *D : make_early_inc_range(DefinedValues)) { - assert(D->Def == this && - "all defined VPValues should point to the containing VPDef"); - assert(D->getNumUsers() == 0 && - "all defined VPValues should have no more users"); - D->Def = nullptr; - delete D; - } - } - - /// Returns the VPValue with index \p I defined by the VPDef. - VPValue *getVPValue(unsigned I = 0) { - assert(DefinedValues[I] && "defined value must be non-null"); - return DefinedValues[I]; - } - const VPValue *getVPValue(unsigned I = 0) const { - assert(DefinedValues[I] && "defined value must be non-null"); - return DefinedValues[I]; - } - - /// Returns an ArrayRef of the values defined by the VPDef. - ArrayRef<VPValue *> definedValues() { return DefinedValues; } - /// Returns an ArrayRef of the values defined by the VPDef. - ArrayRef<VPValue *> definedValues() const { return DefinedValues; } - - /// Returns the number of values defined by the VPDef. - unsigned getNumDefinedValues() const { return DefinedValues.size(); } - - /// \return an ID for the concrete type of this object. - /// This is used to implement the classof checks. This should not be used - /// for any other purpose, as the values may change as LLVM evolves. - unsigned getVPDefID() const { return SubclassID; } - - /// Dump the VPDef to stderr (for debugging). - void dump() const; - - /// Each concrete VPDef prints itself. - virtual void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const = 0; -}; - + +/// This class augments a recipe with a set of VPValues defined by the recipe. +/// It allows recipes to define zero, one or multiple VPValues. A VPDef owns +/// the VPValues it defines and is responsible for deleting its defined values. +/// Single-value VPDefs that also inherit from VPValue must make sure to inherit +/// from VPDef before VPValue. +class VPDef { + friend class VPValue; + + /// Subclass identifier (for isa/dyn_cast). + const unsigned char SubclassID; + + /// The VPValues defined by this VPDef. + TinyPtrVector<VPValue *> DefinedValues; + + /// Add \p V as a defined value by this VPDef. + void addDefinedValue(VPValue *V) { + assert(V->getDef() == this && + "can only add VPValue already linked with this VPDef"); + DefinedValues.push_back(V); + } + + /// Remove \p V from the values defined by this VPDef. \p V must be a defined + /// value of this VPDef. 
+ void removeDefinedValue(VPValue *V) { + assert(V->getDef() == this && + "can only remove VPValue linked with this VPDef"); + assert(is_contained(DefinedValues, V) && + "VPValue to remove must be in DefinedValues"); + erase_value(DefinedValues, V); + V->Def = nullptr; + } + +public: + /// An enumeration for keeping track of the concrete subclass of VPRecipeBase + /// that is actually instantiated. Values of this enumeration are kept in the + /// SubclassID field of the VPRecipeBase objects. They are used for concrete + /// type identification. + using VPRecipeTy = enum { + VPBlendSC, + VPBranchOnMaskSC, + VPInstructionSC, + VPInterleaveSC, + VPPredInstPHISC, + VPReductionSC, + VPReplicateSC, + VPWidenCallSC, + VPWidenCanonicalIVSC, + VPWidenGEPSC, + VPWidenIntOrFpInductionSC, + VPWidenMemoryInstructionSC, + VPWidenPHISC, + VPWidenSC, + VPWidenSelectSC + }; + + VPDef(const unsigned char SC) : SubclassID(SC) {} + + virtual ~VPDef() { + for (VPValue *D : make_early_inc_range(DefinedValues)) { + assert(D->Def == this && + "all defined VPValues should point to the containing VPDef"); + assert(D->getNumUsers() == 0 && + "all defined VPValues should have no more users"); + D->Def = nullptr; + delete D; + } + } + + /// Returns the VPValue with index \p I defined by the VPDef. + VPValue *getVPValue(unsigned I = 0) { + assert(DefinedValues[I] && "defined value must be non-null"); + return DefinedValues[I]; + } + const VPValue *getVPValue(unsigned I = 0) const { + assert(DefinedValues[I] && "defined value must be non-null"); + return DefinedValues[I]; + } + + /// Returns an ArrayRef of the values defined by the VPDef. + ArrayRef<VPValue *> definedValues() { return DefinedValues; } + /// Returns an ArrayRef of the values defined by the VPDef. + ArrayRef<VPValue *> definedValues() const { return DefinedValues; } + + /// Returns the number of values defined by the VPDef. + unsigned getNumDefinedValues() const { return DefinedValues.size(); } + + /// \return an ID for the concrete type of this object. + /// This is used to implement the classof checks. This should not be used + /// for any other purpose, as the values may change as LLVM evolves. + unsigned getVPDefID() const { return SubclassID; } + + /// Dump the VPDef to stderr (for debugging). + void dump() const; + + /// Each concrete VPDef prints itself. + virtual void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const = 0; +}; + class VPlan; class VPBasicBlock; class VPRegionBlock; @@ -365,7 +365,7 @@ class VPSlotTracker { void assignSlots(const VPlan &Plan); public: - VPSlotTracker(const VPlan *Plan = nullptr) { + VPSlotTracker(const VPlan *Plan = nullptr) { if (Plan) assignSlots(*Plan); } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp index 6eec8d14de..b8abab63df 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -65,7 +65,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { for (const VPBlockBase *Succ : Successors) { // There must be a bi-directional link between block and successor. 
const auto &SuccPreds = Succ->getPredecessors(); - assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link."); + assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link."); (void)SuccPreds; } @@ -84,7 +84,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { // There must be a bi-directional link between block and predecessor. const auto &PredSuccs = Pred->getSuccessors(); - assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link."); + assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link."); (void)PredSuccs; } } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp index 787f146bdd..7b0a72de4e 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" @@ -34,7 +34,7 @@ using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "vector-combine" -STATISTIC(NumVecLoad, "Number of vector loads formed"); +STATISTIC(NumVecLoad, "Number of vector loads formed"); STATISTIC(NumVecCmp, "Number of vector compares formed"); STATISTIC(NumVecBO, "Number of vector binops formed"); STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed"); @@ -67,7 +67,7 @@ private: const TargetTransformInfo &TTI; const DominatorTree &DT; - bool vectorizeLoadInsert(Instruction &I); + bool vectorizeLoadInsert(Instruction &I); ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, ExtractElementInst *Ext1, unsigned PreferredExtractIndex) const; @@ -91,138 +91,138 @@ static void replaceValue(Value &Old, Value &New) { New.takeName(&Old); } -bool VectorCombine::vectorizeLoadInsert(Instruction &I) { - // Match insert into fixed vector of scalar value. - // TODO: Handle non-zero insert index. - auto *Ty = dyn_cast<FixedVectorType>(I.getType()); - Value *Scalar; - if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || - !Scalar->hasOneUse()) - return false; - - // Optionally match an extract from another vector. - Value *X; - bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt())); - if (!HasExtract) - X = Scalar; - - // Match source value as load of scalar or vector. - // Do not vectorize scalar load (widening) if atomic/volatile or under - // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions - // or create data races non-existent in the source. - auto *Load = dyn_cast<LoadInst>(X); - if (!Load || !Load->isSimple() || !Load->hasOneUse() || - Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || - mustSuppressSpeculation(*Load)) - return false; - - const DataLayout &DL = I.getModule()->getDataLayout(); - Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); - assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); - - // If original AS != Load's AS, we can't bitcast the original pointer and have - // to use Load's operand instead. Ideally we would want to strip pointer casts - // without changing AS, but there's no API to do that ATM. 
- unsigned AS = Load->getPointerAddressSpace(); - if (AS != SrcPtr->getType()->getPointerAddressSpace()) - SrcPtr = Load->getPointerOperand(); - - // We are potentially transforming byte-sized (8-bit) memory accesses, so make - // sure we have all of our type-based constraints in place for this target. - Type *ScalarTy = Scalar->getType(); - uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); - unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); - if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || - ScalarSize % 8 != 0) - return false; - - // Check safety of replacing the scalar load with a larger vector load. - // We use minimal alignment (maximum flexibility) because we only care about - // the dereferenceable region. When calculating cost and creating a new op, - // we may use a larger value based on alignment attributes. - unsigned MinVecNumElts = MinVectorSize / ScalarSize; - auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false); - unsigned OffsetEltIndex = 0; - Align Alignment = Load->getAlign(); - if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) { - // It is not safe to load directly from the pointer, but we can still peek - // through gep offsets and check if it safe to load from a base address with - // updated alignment. If it is, we can shuffle the element(s) into place - // after loading. - unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType()); - APInt Offset(OffsetBitWidth, 0); - SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); - - // We want to shuffle the result down from a high element of a vector, so - // the offset must be positive. - if (Offset.isNegative()) - return false; - - // The offset must be a multiple of the scalar element to shuffle cleanly - // in the element's size. - uint64_t ScalarSizeInBytes = ScalarSize / 8; - if (Offset.urem(ScalarSizeInBytes) != 0) - return false; - - // If we load MinVecNumElts, will our target element still be loaded? - OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue(); - if (OffsetEltIndex >= MinVecNumElts) - return false; - - if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) - return false; - - // Update alignment with offset value. Note that the offset could be negated - // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but - // negation does not change the result of the alignment calculation. - Alignment = commonAlignment(Alignment, Offset.getZExtValue()); - } - - // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 - // Use the greater of the alignment on the load or its source pointer. - Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); - Type *LoadTy = Load->getType(); - InstructionCost OldCost = - TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); - APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0); - OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, - /* Insert */ true, HasExtract); - - // New pattern: load VecPtr - InstructionCost NewCost = - TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); - // Optionally, we are shuffling the loaded vector element(s) into place. - if (OffsetEltIndex) - NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy); - - // We can aggressively convert to the vector form because the backend can - // invert this transform if it does not result in a performance win. 
- if (OldCost < NewCost || !NewCost.isValid()) - return false; - - // It is safe and potentially profitable to load a vector directly: - // inselt undef, load Scalar, 0 --> load VecPtr - IRBuilder<> Builder(Load); - Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS)); - Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); - - // Set everything but element 0 to undef to prevent poison from propagating - // from the extra loaded memory. This will also optionally shrink/grow the - // vector from the loaded size to the output size. - // We assume this operation has no cost in codegen if there was no offset. - // Note that we could use freeze to avoid poison problems, but then we might - // still need a shuffle to change the vector size. - unsigned OutputNumElts = Ty->getNumElements(); - SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem); - assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); - Mask[0] = OffsetEltIndex; - VecLd = Builder.CreateShuffleVector(VecLd, Mask); - - replaceValue(I, *VecLd); - ++NumVecLoad; - return true; -} - +bool VectorCombine::vectorizeLoadInsert(Instruction &I) { + // Match insert into fixed vector of scalar value. + // TODO: Handle non-zero insert index. + auto *Ty = dyn_cast<FixedVectorType>(I.getType()); + Value *Scalar; + if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || + !Scalar->hasOneUse()) + return false; + + // Optionally match an extract from another vector. + Value *X; + bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt())); + if (!HasExtract) + X = Scalar; + + // Match source value as load of scalar or vector. + // Do not vectorize scalar load (widening) if atomic/volatile or under + // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions + // or create data races non-existent in the source. + auto *Load = dyn_cast<LoadInst>(X); + if (!Load || !Load->isSimple() || !Load->hasOneUse() || + Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || + mustSuppressSpeculation(*Load)) + return false; + + const DataLayout &DL = I.getModule()->getDataLayout(); + Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); + assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); + + // If original AS != Load's AS, we can't bitcast the original pointer and have + // to use Load's operand instead. Ideally we would want to strip pointer casts + // without changing AS, but there's no API to do that ATM. + unsigned AS = Load->getPointerAddressSpace(); + if (AS != SrcPtr->getType()->getPointerAddressSpace()) + SrcPtr = Load->getPointerOperand(); + + // We are potentially transforming byte-sized (8-bit) memory accesses, so make + // sure we have all of our type-based constraints in place for this target. + Type *ScalarTy = Scalar->getType(); + uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); + unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); + if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || + ScalarSize % 8 != 0) + return false; + + // Check safety of replacing the scalar load with a larger vector load. + // We use minimal alignment (maximum flexibility) because we only care about + // the dereferenceable region. When calculating cost and creating a new op, + // we may use a larger value based on alignment attributes. 
+ unsigned MinVecNumElts = MinVectorSize / ScalarSize; + auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false); + unsigned OffsetEltIndex = 0; + Align Alignment = Load->getAlign(); + if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) { + // It is not safe to load directly from the pointer, but we can still peek + // through gep offsets and check if it safe to load from a base address with + // updated alignment. If it is, we can shuffle the element(s) into place + // after loading. + unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType()); + APInt Offset(OffsetBitWidth, 0); + SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + // We want to shuffle the result down from a high element of a vector, so + // the offset must be positive. + if (Offset.isNegative()) + return false; + + // The offset must be a multiple of the scalar element to shuffle cleanly + // in the element's size. + uint64_t ScalarSizeInBytes = ScalarSize / 8; + if (Offset.urem(ScalarSizeInBytes) != 0) + return false; + + // If we load MinVecNumElts, will our target element still be loaded? + OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue(); + if (OffsetEltIndex >= MinVecNumElts) + return false; + + if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) + return false; + + // Update alignment with offset value. Note that the offset could be negated + // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but + // negation does not change the result of the alignment calculation. + Alignment = commonAlignment(Alignment, Offset.getZExtValue()); + } + + // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 + // Use the greater of the alignment on the load or its source pointer. + Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); + Type *LoadTy = Load->getType(); + InstructionCost OldCost = + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); + APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0); + OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, + /* Insert */ true, HasExtract); + + // New pattern: load VecPtr + InstructionCost NewCost = + TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); + // Optionally, we are shuffling the loaded vector element(s) into place. + if (OffsetEltIndex) + NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy); + + // We can aggressively convert to the vector form because the backend can + // invert this transform if it does not result in a performance win. + if (OldCost < NewCost || !NewCost.isValid()) + return false; + + // It is safe and potentially profitable to load a vector directly: + // inselt undef, load Scalar, 0 --> load VecPtr + IRBuilder<> Builder(Load); + Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS)); + Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); + + // Set everything but element 0 to undef to prevent poison from propagating + // from the extra loaded memory. This will also optionally shrink/grow the + // vector from the loaded size to the output size. + // We assume this operation has no cost in codegen if there was no offset. + // Note that we could use freeze to avoid poison problems, but then we might + // still need a shuffle to change the vector size. 
+ unsigned OutputNumElts = Ty->getNumElements(); + SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem); + assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); + Mask[0] = OffsetEltIndex; + VecLd = Builder.CreateShuffleVector(VecLd, Mask); + + replaceValue(I, *VecLd); + ++NumVecLoad; + return true; +} + /// Determine which, if any, of the inputs should be replaced by a shuffle /// followed by extract from a different index. ExtractElementInst *VectorCombine::getShuffleExtract( @@ -241,15 +241,15 @@ ExtractElementInst *VectorCombine::getShuffleExtract( Type *VecTy = Ext0->getVectorOperand()->getType(); assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); - InstructionCost Cost0 = - TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); - InstructionCost Cost1 = - TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); - - // If both costs are invalid no shuffle is needed - if (!Cost0.isValid() && !Cost1.isValid()) - return nullptr; - + InstructionCost Cost0 = + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + InstructionCost Cost1 = + TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); + + // If both costs are invalid no shuffle is needed + if (!Cost0.isValid() && !Cost1.isValid()) + return nullptr; + // We are extracting from 2 different indexes, so one operand must be shuffled // before performing a vector operation and/or extract. The more expensive // extract will be replaced by a shuffle. @@ -284,7 +284,7 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, "Expected constant extract indexes"); Type *ScalarTy = Ext0->getType(); auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); - InstructionCost ScalarOpCost, VectorOpCost; + InstructionCost ScalarOpCost, VectorOpCost; // Get cost estimates for scalar and vector versions of the operation. bool IsBinOp = Instruction::isBinaryOp(Opcode); @@ -305,9 +305,9 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue(); unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue(); - InstructionCost Extract0Cost = + InstructionCost Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); - InstructionCost Extract1Cost = + InstructionCost Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index); // A more expensive extract will always be replaced by a splat shuffle. @@ -317,11 +317,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // TODO: Evaluate whether that always results in lowest cost. Alternatively, // check the cost of creating a broadcast shuffle and shuffling both // operands to element 0. - InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); + InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); // Extra uses of the extracts mean that we include those costs in the // vector total because those instructions will not be eliminated. - InstructionCost OldCost, NewCost; + InstructionCost OldCost, NewCost; if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) { // Handle a special case. If the 2 extracts are identical, adjust the // formulas to account for that. 
The extra use charge allows for either the @@ -372,7 +372,7 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, auto *VecTy = cast<FixedVectorType>(Vec->getType()); SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem); ShufMask[NewIndex] = OldIndex; - return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); + return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); } /// Given an extract element instruction with constant index operand, shuffle @@ -506,23 +506,23 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask)))))) return false; - // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for - // scalable type is unknown; Second, we cannot reason if the narrowed shuffle - // mask for scalable type is a splat or not. - // 2) Disallow non-vector casts and length-changing shuffles. + // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for + // scalable type is unknown; Second, we cannot reason if the narrowed shuffle + // mask for scalable type is a splat or not. + // 2) Disallow non-vector casts and length-changing shuffles. // TODO: We could allow any shuffle. - auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); - auto *SrcTy = dyn_cast<FixedVectorType>(V->getType()); - if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy) + auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); + auto *SrcTy = dyn_cast<FixedVectorType>(V->getType()); + if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy) return false; // The new shuffle must not cost more than the old shuffle. The bitcast is // moved ahead of the shuffle, so assume that it has the same cost as before. - InstructionCost DestCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy); - InstructionCost SrcCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy); - if (DestCost > SrcCost || !DestCost.isValid()) + InstructionCost DestCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy); + InstructionCost SrcCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy); + if (DestCost > SrcCost || !DestCost.isValid()) return false; unsigned DestNumElts = DestTy->getNumElements(); @@ -545,7 +545,7 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' ++NumShufOfBitcast; Value *CastV = Builder.CreateBitCast(V, DestTy); - Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask); + Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask); replaceValue(I, *Shuf); return true; } @@ -612,7 +612,7 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { "Unexpected types for insert element into binop or cmp"); unsigned Opcode = I.getOpcode(); - InstructionCost ScalarOpCost, VectorOpCost; + InstructionCost ScalarOpCost, VectorOpCost; if (IsCmp) { ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy); VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy); @@ -623,16 +623,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { // Get cost estimate for the insert element. This cost will factor into // both sequences. - InstructionCost InsertCost = + InstructionCost InsertCost = TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index); - InstructionCost OldCost = - (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; - InstructionCost NewCost = ScalarOpCost + InsertCost + - (IsConst0 ? 
0 : !Ins0->hasOneUse() * InsertCost) + - (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); + InstructionCost OldCost = + (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; + InstructionCost NewCost = ScalarOpCost + InsertCost + + (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + + (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); // We want to scalarize unless the vector variant actually has lower cost. - if (OldCost < NewCost || !NewCost.isValid()) + if (OldCost < NewCost || !NewCost.isValid()) return false; // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> @@ -712,8 +712,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { if (!VecTy) return false; - InstructionCost OldCost = - TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + InstructionCost OldCost = + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2; OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); @@ -724,7 +724,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); - InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType()); + InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType()); NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy); NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); @@ -733,7 +733,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { // Aggressively form vector ops if the cost is equal because the transform // may enable further optimization. // Codegen can reverse this transform (scalarize) if it was not profitable. - if (OldCost < NewCost || !NewCost.isValid()) + if (OldCost < NewCost || !NewCost.isValid()) return false; // Create a vector constant from the 2 scalar constants. @@ -758,10 +758,10 @@ bool VectorCombine::run() { if (DisableVectorCombine) return false; - // Don't attempt vectorization if the target does not support vectors. - if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true))) - return false; - + // Don't attempt vectorization if the target does not support vectors. + if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true))) + return false; + bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. 
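
The VectorCombine hunks above all gate their rewrites on the same TargetTransformInfo pattern: compute the cost of the existing IR and of the candidate replacement, then bail out when the replacement is costlier or its cost is invalid, letting ties go to the vector form because codegen can reverse the transform if it is not profitable. Below is a minimal sketch of that idiom, separate from the patch and assuming LLVM 12 headers; the helper names profitableToRewrite and shuffleNoWorseAfterBitcast are hypothetical, while the TTI calls are the ones the hunks themselves use.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Hypothetical helper mirroring the check used by each fold above: reject the
// rewrite if the new sequence costs more than the old one, or if the target
// reports the new cost as invalid (e.g. an illegal vector type).
static bool profitableToRewrite(InstructionCost OldCost,
                                InstructionCost NewCost) {
  if (OldCost < NewCost || !NewCost.isValid())
    return false;
  return true;
}

// Example use in the spirit of foldBitcastShuf: hoist a bitcast ahead of a
// shuffle only when a single-source permute on the destination type is no
// more expensive than the same permute on the source type.
static bool shuffleNoWorseAfterBitcast(const TargetTransformInfo &TTI,
                                       FixedVectorType *DestTy,
                                       FixedVectorType *SrcTy) {
  InstructionCost DestCost =
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy);
  InstructionCost SrcCost =
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy);
  return profitableToRewrite(SrcCost, DestCost);
}
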
@@ -775,7 +775,7 @@ bool VectorCombine::run() { if (isa<DbgInfoIntrinsic>(I)) continue; Builder.SetInsertPoint(&I); - MadeChange |= vectorizeLoadInsert(I); + MadeChange |= vectorizeLoadInsert(I); MadeChange |= foldExtractExtract(I); MadeChange |= foldBitcastShuf(I); MadeChange |= scalarizeBinopOrCmp(I); diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make b/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make index a68c667bde..a3879c3129 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make @@ -12,12 +12,12 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/Analysis - contrib/libs/llvm12/lib/IR - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Transforms/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/Analysis + contrib/libs/llvm12/lib/IR + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Transforms/Utils ) ADDINCL( |