path: root/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
author    shadchin <shadchin@yandex-team.ru>    2022-02-10 16:44:30 +0300
committer Daniil Cherednik <dcherednik@yandex-team.ru>    2022-02-10 16:44:30 +0300
commit    2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch)
tree      012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
parent    6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff)
download  ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp')
-rw-r--r--  contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp | 1352
1 file changed, 676 insertions, 676 deletions
diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 8901934013..e4e4252041 100644
--- a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -20,18 +20,18 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
@@ -50,38 +50,38 @@ static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
-static cl::opt<bool>
- AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
- cl::desc("Enable the generation of WLS loops"));
-
+static cl::opt<bool>
+ AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
+ cl::desc("Enable the generation of WLS loops"));
+
extern cl::opt<TailPredication::Mode> EnableTailPredication;
extern cl::opt<bool> EnableMaskedGatherScatters;
-extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
-
-/// Convert a vector load intrinsic into a simple llvm load instruction.
-/// This is beneficial when the underlying object being addressed comes
-/// from a constant, since we get constant-folding for free.
-static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
- InstCombiner::BuilderTy &Builder) {
- auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
-
- if (!IntrAlign)
- return nullptr;
-
- unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
- ? MemAlign
- : IntrAlign->getLimitedValue();
-
- if (!isPowerOf2_32(Alignment))
- return nullptr;
-
- auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
- PointerType::get(II.getType(), 0));
- return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
-}
-
+extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
+
+/// Convert a vector load intrinsic into a simple llvm load instruction.
+/// This is beneficial when the underlying object being addressed comes
+/// from a constant, since we get constant-folding for free.
+static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
+ InstCombiner::BuilderTy &Builder) {
+ auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
+
+ if (!IntrAlign)
+ return nullptr;
+
+ unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
+ ? MemAlign
+ : IntrAlign->getLimitedValue();
+
+ if (!isPowerOf2_32(Alignment))
+ return nullptr;
+
+ auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
+ PointerType::get(II.getType(), 0));
+ return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
+}
+
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -114,138 +114,138 @@ bool ARMTTIImpl::shouldFavorPostInc() const {
return false;
}
-Optional<Instruction *>
-ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
- using namespace PatternMatch;
- Intrinsic::ID IID = II.getIntrinsicID();
- switch (IID) {
- default:
- break;
- case Intrinsic::arm_neon_vld1: {
- Align MemAlign =
- getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
- &IC.getAssumptionCache(), &IC.getDominatorTree());
- if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- }
-
- case Intrinsic::arm_neon_vld2:
- case Intrinsic::arm_neon_vld3:
- case Intrinsic::arm_neon_vld4:
- case Intrinsic::arm_neon_vld2lane:
- case Intrinsic::arm_neon_vld3lane:
- case Intrinsic::arm_neon_vld4lane:
- case Intrinsic::arm_neon_vst1:
- case Intrinsic::arm_neon_vst2:
- case Intrinsic::arm_neon_vst3:
- case Intrinsic::arm_neon_vst4:
- case Intrinsic::arm_neon_vst2lane:
- case Intrinsic::arm_neon_vst3lane:
- case Intrinsic::arm_neon_vst4lane: {
- Align MemAlign =
- getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
- &IC.getAssumptionCache(), &IC.getDominatorTree());
- unsigned AlignArg = II.getNumArgOperands() - 1;
- Value *AlignArgOp = II.getArgOperand(AlignArg);
- MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
- if (Align && *Align < MemAlign) {
- return IC.replaceOperand(
- II, AlignArg,
- ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
- false));
- }
- break;
- }
-
- case Intrinsic::arm_mve_pred_i2v: {
- Value *Arg = II.getArgOperand(0);
- Value *ArgArg;
- if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
- PatternMatch::m_Value(ArgArg))) &&
- II.getType() == ArgArg->getType()) {
- return IC.replaceInstUsesWith(II, ArgArg);
- }
- Constant *XorMask;
- if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
- PatternMatch::m_Value(ArgArg)),
- PatternMatch::m_Constant(XorMask))) &&
- II.getType() == ArgArg->getType()) {
- if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
- if (CI->getValue().trunc(16).isAllOnesValue()) {
- auto TrueVector = IC.Builder.CreateVectorSplat(
- cast<FixedVectorType>(II.getType())->getNumElements(),
- IC.Builder.getTrue());
- return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
- }
- }
- }
- KnownBits ScalarKnown(32);
- if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
- ScalarKnown, 0)) {
- return &II;
- }
- break;
- }
- case Intrinsic::arm_mve_pred_v2i: {
- Value *Arg = II.getArgOperand(0);
- Value *ArgArg;
- if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
- PatternMatch::m_Value(ArgArg)))) {
- return IC.replaceInstUsesWith(II, ArgArg);
- }
- if (!II.getMetadata(LLVMContext::MD_range)) {
- Type *IntTy32 = Type::getInt32Ty(II.getContext());
- Metadata *M[] = {
- ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
- ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
- II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
- return &II;
- }
- break;
- }
- case Intrinsic::arm_mve_vadc:
- case Intrinsic::arm_mve_vadc_predicated: {
- unsigned CarryOp =
- (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
- assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
- "Bad type for intrinsic!");
-
- KnownBits CarryKnown(32);
- if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
- CarryKnown)) {
- return &II;
- }
- break;
- }
- case Intrinsic::arm_mve_vmldava: {
- Instruction *I = cast<Instruction>(&II);
- if (I->hasOneUse()) {
- auto *User = cast<Instruction>(*I->user_begin());
- Value *OpZ;
- if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
- match(I->getOperand(3), m_Zero())) {
- Value *OpX = I->getOperand(4);
- Value *OpY = I->getOperand(5);
- Type *OpTy = OpX->getType();
-
- IC.Builder.SetInsertPoint(User);
- Value *V =
- IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
- {I->getOperand(0), I->getOperand(1),
- I->getOperand(2), OpZ, OpX, OpY});
-
- IC.replaceInstUsesWith(*User, V);
- return IC.eraseInstFromFunction(*User);
- }
- }
- return None;
- }
- }
- return None;
-}
-
+Optional<Instruction *>
+ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ using namespace PatternMatch;
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::arm_neon_vld1: {
+ Align MemAlign =
+ getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
+ &IC.getAssumptionCache(), &IC.getDominatorTree());
+ if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+ }
+
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane: {
+ Align MemAlign =
+ getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
+ &IC.getAssumptionCache(), &IC.getDominatorTree());
+ unsigned AlignArg = II.getNumArgOperands() - 1;
+ Value *AlignArgOp = II.getArgOperand(AlignArg);
+ MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
+ if (Align && *Align < MemAlign) {
+ return IC.replaceOperand(
+ II, AlignArg,
+ ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
+ false));
+ }
+ break;
+ }
+
+ case Intrinsic::arm_mve_pred_i2v: {
+ Value *Arg = II.getArgOperand(0);
+ Value *ArgArg;
+ if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
+ PatternMatch::m_Value(ArgArg))) &&
+ II.getType() == ArgArg->getType()) {
+ return IC.replaceInstUsesWith(II, ArgArg);
+ }
+ Constant *XorMask;
+ if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
+ PatternMatch::m_Value(ArgArg)),
+ PatternMatch::m_Constant(XorMask))) &&
+ II.getType() == ArgArg->getType()) {
+ if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
+ if (CI->getValue().trunc(16).isAllOnesValue()) {
+ auto TrueVector = IC.Builder.CreateVectorSplat(
+ cast<FixedVectorType>(II.getType())->getNumElements(),
+ IC.Builder.getTrue());
+ return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
+ }
+ }
+ }
+ KnownBits ScalarKnown(32);
+ if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
+ ScalarKnown, 0)) {
+ return &II;
+ }
+ break;
+ }
+ case Intrinsic::arm_mve_pred_v2i: {
+ Value *Arg = II.getArgOperand(0);
+ Value *ArgArg;
+ if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
+ PatternMatch::m_Value(ArgArg)))) {
+ return IC.replaceInstUsesWith(II, ArgArg);
+ }
+ if (!II.getMetadata(LLVMContext::MD_range)) {
+ Type *IntTy32 = Type::getInt32Ty(II.getContext());
+ Metadata *M[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
+ ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
+ II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
+ return &II;
+ }
+ break;
+ }
+ case Intrinsic::arm_mve_vadc:
+ case Intrinsic::arm_mve_vadc_predicated: {
+ unsigned CarryOp =
+ (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
+ assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
+ "Bad type for intrinsic!");
+
+ KnownBits CarryKnown(32);
+ if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
+ CarryKnown)) {
+ return &II;
+ }
+ break;
+ }
+ case Intrinsic::arm_mve_vmldava: {
+ Instruction *I = cast<Instruction>(&II);
+ if (I->hasOneUse()) {
+ auto *User = cast<Instruction>(*I->user_begin());
+ Value *OpZ;
+ if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
+ match(I->getOperand(3), m_Zero())) {
+ Value *OpX = I->getOperand(4);
+ Value *OpY = I->getOperand(5);
+ Type *OpTy = OpX->getType();
+
+ IC.Builder.SetInsertPoint(User);
+ Value *V =
+ IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
+ {I->getOperand(0), I->getOperand(1),
+ I->getOperand(2), OpZ, OpX, OpY});
+
+ IC.replaceInstUsesWith(*User, V);
+ return IC.eraseInstFromFunction(*User);
+ }
+ }
+ return None;
+ }
+ }
+ return None;
+}
+
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
@@ -289,43 +289,43 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
return 1;
}
-// Checks whether Inst is part of a min(max()) or max(min()) pattern
-// that will match to an SSAT instruction
-static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
- Value *LHS, *RHS;
- ConstantInt *C;
- SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
-
- if (InstSPF == SPF_SMAX &&
- PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
- C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
-
- auto isSSatMin = [&](Value *MinInst) {
- if (isa<SelectInst>(MinInst)) {
- Value *MinLHS, *MinRHS;
- ConstantInt *MinC;
- SelectPatternFlavor MinSPF =
- matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
- if (MinSPF == SPF_SMIN &&
- PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
- MinC->getValue() == ((-Imm) - 1))
- return true;
- }
- return false;
- };
-
- if (isSSatMin(Inst->getOperand(1)) ||
- (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
- isSSatMin(*(++Inst->user_begin())))))
- return true;
- }
- return false;
-}
-
-int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+// Checks whether Inst is part of a min(max()) or max(min()) pattern
+// that will match to an SSAT instruction
+static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
+ Value *LHS, *RHS;
+ ConstantInt *C;
+ SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
+
+ if (InstSPF == SPF_SMAX &&
+ PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
+ C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
+
+ auto isSSatMin = [&](Value *MinInst) {
+ if (isa<SelectInst>(MinInst)) {
+ Value *MinLHS, *MinRHS;
+ ConstantInt *MinC;
+ SelectPatternFlavor MinSPF =
+ matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
+ if (MinSPF == SPF_SMIN &&
+ PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
+ MinC->getValue() == ((-Imm) - 1))
+ return true;
+ }
+ return false;
+ };
+
+ if (isSSatMin(Inst->getOperand(1)) ||
+ (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
+ isSSatMin(*(++Inst->user_begin())))))
+ return true;
+ }
+ return false;
+}
+
+int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
// Division by a constant can be turned into multiplication, but only if we
// know it's constant. So it's not so much that the immediate is cheap (it's
// not), but that the alternative is worse.
@@ -364,33 +364,33 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
return 0;
- // Ensures negative constant of min(max()) or max(min()) patterns that
- // match to SSAT instructions don't get hoisted
- if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
- Ty->getIntegerBitWidth() <= 32) {
- if (isSSATMinMaxPattern(Inst, Imm) ||
- (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
- isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
- return 0;
- }
-
+ // Ensures negative constant of min(max()) or max(min()) patterns that
+ // match to SSAT instructions don't get hoisted
+ if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
+ Ty->getIntegerBitWidth() <= 32) {
+ if (isSSATMinMaxPattern(Inst, Imm) ||
+ (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
+ isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
+ return 0;
+ }
+
return getIntImmCost(Imm, Ty, CostKind);
}
-int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
- if (CostKind == TTI::TCK_RecipThroughput &&
- (ST->hasNEON() || ST->hasMVEIntegerOps())) {
- // FIXME: The vectorizer is highly sensitive to the cost of these
- // instructions, which suggests that it may be using the costs incorrectly.
- // But, for now, just make them free to avoid performance regressions for
- // vector targets.
- return 0;
- }
- return BaseT::getCFInstrCost(Opcode, CostKind);
-}
-
+int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ if (CostKind == TTI::TCK_RecipThroughput &&
+ (ST->hasNEON() || ST->hasMVEIntegerOps())) {
+ // FIXME: The vectorizer is highly sensitive to the cost of these
+ // instructions, which suggests that it may be using the costs incorrectly.
+ // But, for now, just make them free to avoid performance regressions for
+ // vector targets.
+ return 0;
+ }
+ return BaseT::getCFInstrCost(Opcode, CostKind);
+}
+
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
+ TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -402,35 +402,35 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Cost == 0 ? 0 : 1;
return Cost;
};
- auto IsLegalFPType = [this](EVT VT) {
- EVT EltVT = VT.getScalarType();
- return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
- (EltVT == MVT::f64 && ST->hasFP64()) ||
- (EltVT == MVT::f16 && ST->hasFullFP16());
- };
+ auto IsLegalFPType = [this](EVT VT) {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
+ (EltVT == MVT::f64 && ST->hasFP64()) ||
+ (EltVT == MVT::f16 && ST->hasFullFP16());
+ };
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return AdjustCost(
- BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
-
- // Extending masked load/Truncating masked stores is expensive because we
- // currently don't split them. This means that we'll likely end up
- // loading/storing each element individually (hence the high cost).
- if ((ST->hasMVEIntegerOps() &&
- (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
- Opcode == Instruction::SExt)) ||
- (ST->hasMVEFloatOps() &&
- (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
- IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
- if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
- return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();
-
- // The extend of other kinds of load is free
- if (CCH == TTI::CastContextHint::Normal ||
- CCH == TTI::CastContextHint::Masked) {
+ return AdjustCost(
+ BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+
+ // Extending masked load/Truncating masked stores is expensive because we
+ // currently don't split them. This means that we'll likely end up
+ // loading/storing each element individually (hence the high cost).
+ if ((ST->hasMVEIntegerOps() &&
+ (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
+ Opcode == Instruction::SExt)) ||
+ (ST->hasMVEFloatOps() &&
+ (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
+ IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
+ if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
+ return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();
+
+ // The extend of other kinds of load is free
+ if (CCH == TTI::CastContextHint::Normal ||
+ CCH == TTI::CastContextHint::Masked) {
static const TypeConversionCostTblEntry LoadConversionTbl[] = {
{ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
{ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
@@ -485,31 +485,31 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
- // The truncate of a store is free. This is the mirror of extends above.
- static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
+ // The truncate of a store is free. This is the mirror of extends above.
+ static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
{ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
{ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
{ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
{ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
- {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
+ {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
{ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
{ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
};
if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
if (const auto *Entry =
- ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
- SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
+ ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
+ SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
- static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
+ static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
{ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
{ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
};
if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
if (const auto *Entry =
- ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
- SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
+ ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
+ SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
}
@@ -746,24 +746,24 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (SrcTy.isFixedLengthVector())
Lanes = SrcTy.getVectorNumElements();
- if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
+ if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
return Lanes;
else
return Lanes * CallCost;
}
- if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
- SrcTy.isFixedLengthVector()) {
- // Treat a truncate with larger than legal source (128bits for MVE) as
- // expensive, 2 instructions per lane.
- if ((SrcTy.getScalarType() == MVT::i8 ||
- SrcTy.getScalarType() == MVT::i16 ||
- SrcTy.getScalarType() == MVT::i32) &&
- SrcTy.getSizeInBits() > 128 &&
- SrcTy.getSizeInBits() > DstTy.getSizeInBits())
- return SrcTy.getVectorNumElements() * 2;
- }
-
+ if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
+ SrcTy.isFixedLengthVector()) {
+ // Treat a truncate with larger than legal source (128bits for MVE) as
+ // expensive, 2 instructions per lane.
+ if ((SrcTy.getScalarType() == MVT::i8 ||
+ SrcTy.getScalarType() == MVT::i16 ||
+ SrcTy.getScalarType() == MVT::i32) &&
+ SrcTy.getSizeInBits() > 128 &&
+ SrcTy.getSizeInBits() > DstTy.getSizeInBits())
+ return SrcTy.getVectorNumElements() * 2;
+ }
+
// Scalar integer conversion costs.
static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
// i16 -> i64 requires two dependent operations.
@@ -787,7 +787,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
? ST->getMVEVectorCostFactor()
: 1;
return AdjustCost(
- BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+ BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -827,37 +827,37 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I) {
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
-
- // Thumb scalar code size cost for select.
- if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
- ST->isThumb() && !ValTy->isVectorTy()) {
- // Assume expensive structs.
- if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
- return TTI::TCC_Expensive;
-
- // Select costs can vary because they:
- // - may require one or more conditional mov (including an IT),
- // - can't operate directly on immediates,
- // - require live flags, which we can't copy around easily.
- int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
-
- // Possible IT instruction for Thumb2, or more for Thumb1.
- ++Cost;
-
- // i1 values may need rematerialising by using mov immediates and/or
- // flag setting instructions.
- if (ValTy->isIntegerTy(1))
- ++Cost;
-
- return Cost;
- }
-
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ // Thumb scalar code size cost for select.
+ if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
+ ST->isThumb() && !ValTy->isVectorTy()) {
+ // Assume expensive structs.
+ if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
+ return TTI::TCC_Expensive;
+
+ // Select costs can vary because they:
+ // - may require one or more conditional mov (including an IT),
+ // - can't operate directly on immediates,
+ // - require live flags, which we can't copy around easily.
+ int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
+
+ // Possible IT instruction for Thumb2, or more for Thumb1.
+ ++Cost;
+
+ // i1 values may need rematerialising by using mov immediates and/or
+ // flag setting instructions.
+ if (ValTy->isIntegerTy(1))
+ ++Cost;
+
+ return Cost;
+ }
+
// On NEON a vector select gets lowered to vbsl.
- if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
+ if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
// Lowering of some vector selects is currently far from perfect.
static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
@@ -878,15 +878,15 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
return LT.first;
}
- // Default to cheap (throughput/size of 1 instruction) but adjust throughput
- // for "multiple beats" potentially needed by MVE instructions.
- int BaseCost = 1;
- if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
- ValTy->isVectorTy())
- BaseCost = ST->getMVEVectorCostFactor();
-
- return BaseCost *
- BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
+ // Default to cheap (throughput/size of 1 instruction) but adjust throughput
+ // for "multiple beats" potentially needed by MVE instructions.
+ int BaseCost = 1;
+ if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
+ ValTy->isVectorTy())
+ BaseCost = ST->getMVEVectorCostFactor();
+
+ return BaseCost *
+ BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -968,85 +968,85 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
(EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}
-/// Given a memcpy/memset/memmove instruction, return the number of memory
-/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
-/// call is used.
-int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
- MemOp MOp;
- unsigned DstAddrSpace = ~0u;
- unsigned SrcAddrSpace = ~0u;
- const Function *F = I->getParent()->getParent();
-
- if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
- ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
- // If 'size' is not a constant, a library call will be generated.
- if (!C)
- return -1;
-
- const unsigned Size = C->getValue().getZExtValue();
- const Align DstAlign = *MC->getDestAlign();
- const Align SrcAlign = *MC->getSourceAlign();
-
- MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
- /*IsVolatile*/ false);
- DstAddrSpace = MC->getDestAddressSpace();
- SrcAddrSpace = MC->getSourceAddressSpace();
- }
- else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
- ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
- // If 'size' is not a constant, a library call will be generated.
- if (!C)
- return -1;
-
- const unsigned Size = C->getValue().getZExtValue();
- const Align DstAlign = *MS->getDestAlign();
-
- MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
- /*IsZeroMemset*/ false, /*IsVolatile*/ false);
- DstAddrSpace = MS->getDestAddressSpace();
- }
- else
- llvm_unreachable("Expected a memcpy/move or memset!");
-
- unsigned Limit, Factor = 2;
- switch(I->getIntrinsicID()) {
- case Intrinsic::memcpy:
- Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
- break;
- case Intrinsic::memmove:
- Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
- break;
- case Intrinsic::memset:
- Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
- Factor = 1;
- break;
- default:
- llvm_unreachable("Expected a memcpy/move or memset!");
- }
-
+/// Given a memcpy/memset/memmove instruction, return the number of memory
+/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
+/// call is used.
+int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
+ MemOp MOp;
+ unsigned DstAddrSpace = ~0u;
+ unsigned SrcAddrSpace = ~0u;
+ const Function *F = I->getParent()->getParent();
+
+ if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
+ ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
+ // If 'size' is not a constant, a library call will be generated.
+ if (!C)
+ return -1;
+
+ const unsigned Size = C->getValue().getZExtValue();
+ const Align DstAlign = *MC->getDestAlign();
+ const Align SrcAlign = *MC->getSourceAlign();
+
+ MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
+ /*IsVolatile*/ false);
+ DstAddrSpace = MC->getDestAddressSpace();
+ SrcAddrSpace = MC->getSourceAddressSpace();
+ }
+ else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
+ ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
+ // If 'size' is not a constant, a library call will be generated.
+ if (!C)
+ return -1;
+
+ const unsigned Size = C->getValue().getZExtValue();
+ const Align DstAlign = *MS->getDestAlign();
+
+ MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
+ /*IsZeroMemset*/ false, /*IsVolatile*/ false);
+ DstAddrSpace = MS->getDestAddressSpace();
+ }
+ else
+ llvm_unreachable("Expected a memcpy/move or memset!");
+
+ unsigned Limit, Factor = 2;
+ switch(I->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
+ break;
+ case Intrinsic::memmove:
+ Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
+ break;
+ case Intrinsic::memset:
+ Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
+ Factor = 1;
+ break;
+ default:
+ llvm_unreachable("Expected a memcpy/move or memset!");
+ }
+
// MemOps will be populated with a list of data types that need to be
// loaded and stored. That's why we multiply the number of elements by 2 to
// get the cost for this memcpy.
- std::vector<EVT> MemOps;
+ std::vector<EVT> MemOps;
if (getTLI()->findOptimalMemOpLowering(
- MemOps, Limit, MOp, DstAddrSpace,
- SrcAddrSpace, F->getAttributes()))
- return MemOps.size() * Factor;
+ MemOps, Limit, MOp, DstAddrSpace,
+ SrcAddrSpace, F->getAttributes()))
+ return MemOps.size() * Factor;
// If we can't find an optimal memop lowering, return the default cost
- return -1;
-}
-
-int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
- int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
-
- // To model the cost of a library call, we assume 1 for the call, and
- // 3 for the argument setup.
- if (NumOps == -1)
- return 4;
- return NumOps;
+ return -1;
}
+int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+ int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
+
+ // To model the cost of a library call, we assume 1 for the call, and
+ // 3 for the argument setup.
+ if (NumOps == -1)
+ return 4;
+ return NumOps;
+}
+
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
int Index, VectorType *SubTp) {
if (ST->hasNEON()) {
@@ -1149,21 +1149,21 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
TTI::OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
- int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
- if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
- // Make operations on i1 relatively expensive as this often involves
- // combining predicates. AND and XOR should be easier to handle with IT
- // blocks.
- switch (ISDOpcode) {
- default:
- break;
- case ISD::AND:
- case ISD::XOR:
- return 2;
- case ISD::OR:
- return 3;
- }
- }
+ int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
+ if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
+ // Make operations on i1 relatively expensive as this often involves
+ // combining predicates. AND and XOR should be easier to handle with IT
+ // blocks.
+ switch (ISDOpcode) {
+ default:
+ break;
+ case ISD::AND:
+ case ISD::XOR:
+ return 2;
+ case ISD::OR:
+ return 3;
+ }
+ }
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -1259,12 +1259,12 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (LooksLikeAFreeShift())
return 0;
- // Default to cheap (throughput/size of 1 instruction) but adjust throughput
- // for "multiple beats" potentially needed by MVE instructions.
- int BaseCost = 1;
- if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
- Ty->isVectorTy())
- BaseCost = ST->getMVEVectorCostFactor();
+ // Default to cheap (throughput/size of 1 instruction) but adjust throughput
+ // for "multiple beats" potentially needed by MVE instructions.
+ int BaseCost = 1;
+ if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
+ Ty->isVectorTy())
+ BaseCost = ST->getMVEVectorCostFactor();
// The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
// without treating floats as more expensive than scalars or increasing the
@@ -1331,24 +1331,24 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
CostKind, I);
}
-unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
- Align Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind) {
- if (ST->hasMVEIntegerOps()) {
- if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
- return ST->getMVEVectorCostFactor();
- if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
- return ST->getMVEVectorCostFactor();
- }
- if (!isa<FixedVectorType>(Src))
- return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
- CostKind);
- // Scalar cost, which is currently very high due to the efficiency of the
- // generated code.
- return cast<FixedVectorType>(Src)->getNumElements() * 8;
-}
-
+unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
+ if (ST->hasMVEIntegerOps()) {
+ if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
+ return ST->getMVEVectorCostFactor();
+ if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
+ return ST->getMVEVectorCostFactor();
+ }
+ if (!isa<FixedVectorType>(Src))
+ return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ // Scalar cost, which is currently very high due to the efficiency of the
+ // generated code.
+ return cast<FixedVectorType>(Src)->getNumElements() * 8;
+}
+
int ARMTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -1379,8 +1379,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
// promoted differently). The cost of 2 here is then a load and vrev or
// vmovn.
if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
- VecTy->isIntOrIntVectorTy() &&
- DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
+ VecTy->isIntOrIntVectorTy() &&
+ DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
return 2 * BaseCost;
}
@@ -1413,13 +1413,13 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
// multiplied by the number of elements being loaded. This is possibly very
// conservative, but even so we still end up vectorising loops because the
// cost per iteration for many loops is lower than for scalar loops.
- unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor();
+ unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor();
// The scalarization cost should be a lot higher. We use the number of vector
// elements plus the scalarization overhead.
unsigned ScalarCost =
NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {});
- if (EltSize < 8 || Alignment < EltSize / 8)
+ if (EltSize < 8 || Alignment < EltSize / 8)
return ScalarCost;
unsigned ExtSize = EltSize;
@@ -1488,92 +1488,92 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
return ScalarCost;
}
-int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
- bool IsPairwiseForm,
- TTI::TargetCostKind CostKind) {
- EVT ValVT = TLI->getValueType(DL, ValTy);
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
- if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
- CostKind);
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
-
- static const CostTblEntry CostTblAdd[]{
- {ISD::ADD, MVT::v16i8, 1},
- {ISD::ADD, MVT::v8i16, 1},
- {ISD::ADD, MVT::v4i32, 1},
- };
- if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
- return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first;
-
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
- CostKind);
-}
-
-InstructionCost
-ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
- Type *ResTy, VectorType *ValTy,
- TTI::TargetCostKind CostKind) {
- EVT ValVT = TLI->getValueType(DL, ValTy);
- EVT ResVT = TLI->getValueType(DL, ResTy);
- if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
- if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
- (LT.second == MVT::v8i16 &&
- ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
- (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
- return ST->getMVEVectorCostFactor() * LT.first;
- }
-
- return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
- CostKind);
-}
-
-int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind) {
- switch (ICA.getID()) {
- case Intrinsic::get_active_lane_mask:
- // Currently we make a somewhat optimistic assumption that
- // active_lane_mask's are always free. In reality it may be freely folded
- // into a tail predicated loop, expanded into a VCPT or expanded into a lot
- // of add/icmp code. We may need to improve this in the future, but being
- // able to detect if it is free or not involves looking at a lot of other
- // code. We currently assume that the vectorizer inserted these, and knew
- // what it was doing in adding one.
- if (ST->hasMVEIntegerOps())
- return 0;
- break;
- case Intrinsic::sadd_sat:
- case Intrinsic::ssub_sat:
- case Intrinsic::uadd_sat:
- case Intrinsic::usub_sat: {
- if (!ST->hasMVEIntegerOps())
- break;
- // Get the Return type, either directly or from ICA.ReturnType and ICA.VF.
- Type *VT = ICA.getReturnType();
- if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
- VT = VectorType::get(VT, ICA.getVectorFactor());
-
- std::pair<int, MVT> LT =
- TLI->getTypeLegalizationCost(DL, VT);
- if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
- LT.second == MVT::v16i8) {
- // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
- // need to extend the type, as it uses shr(qadd(shl, shl)).
- unsigned Instrs = LT.second.getScalarSizeInBits() ==
- ICA.getReturnType()->getScalarSizeInBits()
- ? 1
- : 4;
- return LT.first * ST->getMVEVectorCostFactor() * Instrs;
- }
- break;
- }
- }
-
- return BaseT::getIntrinsicInstrCost(ICA, CostKind);
-}
-
+int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind) {
+ EVT ValVT = TLI->getValueType(DL, ValTy);
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ static const CostTblEntry CostTblAdd[]{
+ {ISD::ADD, MVT::v16i8, 1},
+ {ISD::ADD, MVT::v8i16, 1},
+ {ISD::ADD, MVT::v4i32, 1},
+ };
+ if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
+ return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first;
+
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
+}
+
+InstructionCost
+ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
+ Type *ResTy, VectorType *ValTy,
+ TTI::TargetCostKind CostKind) {
+ EVT ValVT = TLI->getValueType(DL, ValTy);
+ EVT ResVT = TLI->getValueType(DL, ResTy);
+ if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
+ (LT.second == MVT::v8i16 &&
+ ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
+ (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
+ return ST->getMVEVectorCostFactor() * LT.first;
+ }
+
+ return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
+ CostKind);
+}
+
+int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ switch (ICA.getID()) {
+ case Intrinsic::get_active_lane_mask:
+ // Currently we make a somewhat optimistic assumption that
+ // active_lane_mask's are always free. In reality it may be freely folded
+ // into a tail predicated loop, expanded into a VCPT or expanded into a lot
+ // of add/icmp code. We may need to improve this in the future, but being
+ // able to detect if it is free or not involves looking at a lot of other
+ // code. We currently assume that the vectorizer inserted these, and knew
+ // what it was doing in adding one.
+ if (ST->hasMVEIntegerOps())
+ return 0;
+ break;
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat: {
+ if (!ST->hasMVEIntegerOps())
+ break;
+ // Get the Return type, either directly or from ICA.ReturnType and ICA.VF.
+ Type *VT = ICA.getReturnType();
+ if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
+ VT = VectorType::get(VT, ICA.getVectorFactor());
+
+ std::pair<int, MVT> LT =
+ TLI->getTypeLegalizationCost(DL, VT);
+ if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
+ LT.second == MVT::v16i8) {
+ // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
+ // need to extend the type, as it uses shr(qadd(shl, shl)).
+ unsigned Instrs = LT.second.getScalarSizeInBits() ==
+ ICA.getReturnType()->getScalarSizeInBits()
+ ? 1
+ : 4;
+ return LT.first * ST->getMVEVectorCostFactor() * Instrs;
+ }
+ break;
+ }
+ }
+
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+}
+
bool ARMTTIImpl::isLoweredToCall(const Function *F) {
if (!F->isIntrinsic())
BaseT::isLoweredToCall(F);
@@ -1635,93 +1635,93 @@ bool ARMTTIImpl::isLoweredToCall(const Function *F) {
return BaseT::isLoweredToCall(F);
}
-bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
- unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
- EVT VT = TLI->getValueType(DL, I.getType(), true);
- if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
- return true;
-
- // Check if an intrinsic will be lowered to a call and assume that any
- // other CallInst will generate a bl.
- if (auto *Call = dyn_cast<CallInst>(&I)) {
- if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
- switch(II->getIntrinsicID()) {
- case Intrinsic::memcpy:
- case Intrinsic::memset:
- case Intrinsic::memmove:
- return getNumMemOps(II) == -1;
- default:
- if (const Function *F = Call->getCalledFunction())
- return isLoweredToCall(F);
- }
- }
- return true;
- }
-
- // FPv5 provides conversions between integer, double-precision,
- // single-precision, and half-precision formats.
- switch (I.getOpcode()) {
- default:
- break;
- case Instruction::FPToSI:
- case Instruction::FPToUI:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- return !ST->hasFPARMv8Base();
- }
-
- // FIXME: Unfortunately the approach of checking the Operation Action does
- // not catch all cases of Legalization that use library calls. Our
- // Legalization step categorizes some transformations into library calls as
- // Custom, Expand or even Legal when doing type legalization. So for now
- // we have to special case for instance the SDIV of 64bit integers and the
- // use of floating point emulation.
- if (VT.isInteger() && VT.getSizeInBits() >= 64) {
- switch (ISD) {
- default:
- break;
- case ISD::SDIV:
- case ISD::UDIV:
- case ISD::SREM:
- case ISD::UREM:
- case ISD::SDIVREM:
- case ISD::UDIVREM:
- return true;
- }
- }
-
- // Assume all other non-float operations are supported.
- if (!VT.isFloatingPoint())
- return false;
-
- // We'll need a library call to handle most floats when using soft.
- if (TLI->useSoftFloat()) {
- switch (I.getOpcode()) {
- default:
- return true;
- case Instruction::Alloca:
- case Instruction::Load:
- case Instruction::Store:
- case Instruction::Select:
- case Instruction::PHI:
- return false;
- }
- }
-
- // We'll need a libcall to perform double precision operations on a single
- // precision only FPU.
- if (I.getType()->isDoubleTy() && !ST->hasFP64())
- return true;
-
- // Likewise for half precision arithmetic.
- if (I.getType()->isHalfTy() && !ST->hasFullFP16())
- return true;
-
- return false;
-}
-
+bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
+ unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+ EVT VT = TLI->getValueType(DL, I.getType(), true);
+ if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+ return true;
+
+ // Check if an intrinsic will be lowered to a call and assume that any
+ // other CallInst will generate a bl.
+ if (auto *Call = dyn_cast<CallInst>(&I)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
+ switch(II->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ return getNumMemOps(II) == -1;
+ default:
+ if (const Function *F = Call->getCalledFunction())
+ return isLoweredToCall(F);
+ }
+ }
+ return true;
+ }
+
+ // FPv5 provides conversions between integer, double-precision,
+ // single-precision, and half-precision formats.
+ switch (I.getOpcode()) {
+ default:
+ break;
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ return !ST->hasFPARMv8Base();
+ }
+
+ // FIXME: Unfortunately the approach of checking the Operation Action does
+ // not catch all cases of Legalization that use library calls. Our
+ // Legalization step categorizes some transformations into library calls as
+ // Custom, Expand or even Legal when doing type legalization. So for now
+ // we have to special case for instance the SDIV of 64bit integers and the
+ // use of floating point emulation.
+ if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+ switch (ISD) {
+ default:
+ break;
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ return true;
+ }
+ }
+
+ // Assume all other non-float operations are supported.
+ if (!VT.isFloatingPoint())
+ return false;
+
+ // We'll need a library call to handle most floats when using soft.
+ if (TLI->useSoftFloat()) {
+ switch (I.getOpcode()) {
+ default:
+ return true;
+ case Instruction::Alloca:
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::Select:
+ case Instruction::PHI:
+ return false;
+ }
+ }
+
+ // We'll need a libcall to perform double precision operations on a single
+ // precision only FPU.
+ if (I.getType()->isDoubleTy() && !ST->hasFP64())
+ return true;
+
+ // Likewise for half precision arithmetic.
+ if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+ return true;
+
+ return false;
+}
+
bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
@@ -1762,7 +1762,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
switch (Call->getIntrinsicID()) {
default:
break;
- case Intrinsic::start_loop_iterations:
+ case Intrinsic::start_loop_iterations:
case Intrinsic::test_set_loop_iterations:
case Intrinsic::loop_decrement:
case Intrinsic::loop_decrement_reg:
@@ -1773,24 +1773,24 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
};
// Scan the instructions to see if there's any that we know will turn into a
- // call or if this loop is already a low-overhead loop or will become a tail
- // predicated loop.
- bool IsTailPredLoop = false;
+ // call or if this loop is already a low-overhead loop or will become a tail
+ // predicated loop.
+ bool IsTailPredLoop = false;
auto ScanLoop = [&](Loop *L) {
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
- isa<InlineAsm>(I)) {
+ if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
+ isa<InlineAsm>(I)) {
LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
return false;
}
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- IsTailPredLoop |=
- II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
- II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
- II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
- II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
- II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ IsTailPredLoop |=
+ II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
}
}
return true;
@@ -1811,7 +1811,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
LLVMContext &C = L->getHeader()->getContext();
HWLoopInfo.CounterInReg = true;
HWLoopInfo.IsNestingLegal = false;
- HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
+ HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
HWLoopInfo.CountType = Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;
@@ -1859,28 +1859,28 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const LoopAccessInfo *LAI) {
LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
- // If there are live-out values, it is probably a reduction. We can predicate
- // most reduction operations freely under MVE using a combination of
- // prefer-predicated-reduction-select and inloop reductions. We limit this to
- // floating point and integer reductions, but don't check for operators
- // specifically here. If the value ends up not being a reduction (and so the
- // vectorizer cannot tailfold the loop), we should fall back to standard
- // vectorization automatically.
+ // If there are live-out values, it is probably a reduction. We can predicate
+ // most reduction operations freely under MVE using a combination of
+ // prefer-predicated-reduction-select and inloop reductions. We limit this to
+ // floating point and integer reductions, but don't check for operators
+ // specifically here. If the value ends up not being a reduction (and so the
+ // vectorizer cannot tailfold the loop), we should fall back to standard
+ // vectorization automatically.
SmallVector< Instruction *, 8 > LiveOuts;
LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
- bool ReductionsDisabled =
+ bool ReductionsDisabled =
EnableTailPredication == TailPredication::EnabledNoReductions ||
EnableTailPredication == TailPredication::ForceEnabledNoReductions;
for (auto *I : LiveOuts) {
- if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
- !I->getType()->isHalfTy()) {
- LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
+ if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
+ !I->getType()->isHalfTy()) {
+ LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
"live-out value\n");
return false;
}
- if (ReductionsDisabled) {
- LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
+ if (ReductionsDisabled) {
+ LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
return false;
}
}
@@ -1910,35 +1910,35 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
int64_t NextStride = getPtrStride(PSE, Ptr, L);
- if (NextStride == 1) {
- // TODO: for now only allow consecutive strides of 1. We could support
- // other strides as long as it is uniform, but let's keep it simple
- // for now.
+ if (NextStride == 1) {
+ // TODO: for now only allow consecutive strides of 1. We could support
+ // other strides as long as it is uniform, but let's keep it simple
+ // for now.
continue;
- } else if (NextStride == -1 ||
- (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
- (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
- LLVM_DEBUG(dbgs()
- << "Consecutive strides of 2 found, vld2/vstr2 can't "
- "be tail-predicated\n.");
+ } else if (NextStride == -1 ||
+ (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
+ (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
+ LLVM_DEBUG(dbgs()
+ << "Consecutive strides of 2 found, vld2/vstr2 can't "
+ "be tail-predicated\n.");
return false;
- // TODO: don't tail predicate if there is a reversed load?
- } else if (EnableMaskedGatherScatters) {
- // Gather/scatters do allow loading from arbitrary strides, at
- // least if they are loop invariant.
- // TODO: Loop variant strides should in theory work, too, but
- // this requires further testing.
- const SCEV *PtrScev =
- replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
- if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
- const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
- if (PSE.getSE()->isLoopInvariant(Step, L))
- continue;
- }
+ // TODO: don't tail predicate if there is a reversed load?
+ } else if (EnableMaskedGatherScatters) {
+ // Gather/scatters do allow loading from arbitrary strides, at
+ // least if they are loop invariant.
+ // TODO: Loop variant strides should in theory work, too, but
+ // this requires further testing.
+ const SCEV *PtrScev =
+ replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
+ if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
+ const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
+ if (PSE.getSE()->isLoopInvariant(Step, L))
+ continue;
+ }
}
- LLVM_DEBUG(dbgs() << "Bad stride found, can't "
- "tail-predicate\n.");
- return false;
+ LLVM_DEBUG(dbgs() << "Bad stride found, can't "
+ "tail-predicate\n.");
+ return false;
}
}
}
@@ -1971,7 +1971,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
return false;
}
- assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
+ assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
HardwareLoopInfo HWLoopInfo(L);
if (!HWLoopInfo.canAnalyze(*LI)) {
@@ -2039,10 +2039,10 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
return;
- // Don't unroll vectorized loops, including the remainder loop
- if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
- return;
-
+ // Don't unroll vectorized loops, including the remainder loop
+ if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
+ return;
+
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining.
unsigned Cost = 0;
@@ -2061,9 +2061,9 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
return;
}
- SmallVector<const Value*, 4> Operands(I.operand_values());
- Cost +=
- getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
+ SmallVector<const Value*, 4> Operands(I.operand_values());
+ Cost +=
+ getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
}
}
@@ -2092,24 +2092,24 @@ bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return ST->hasMVEIntegerOps();
}
-
-bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
- TTI::ReductionFlags Flags) const {
- if (!ST->hasMVEIntegerOps())
- return false;
-
- unsigned ScalarBits = Ty->getScalarSizeInBits();
- switch (Opcode) {
- case Instruction::Add:
- return ScalarBits <= 64;
- default:
- return false;
- }
-}
-
-bool ARMTTIImpl::preferPredicatedReductionSelect(
- unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
- if (!ST->hasMVEIntegerOps())
- return false;
- return true;
-}
+
+bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+ switch (Opcode) {
+ case Instruction::Add:
+ return ScalarBits <= 64;
+ default:
+ return false;
+ }
+}
+
+bool ARMTTIImpl::preferPredicatedReductionSelect(
+ unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
+ if (!ST->hasMVEIntegerOps())
+ return false;
+ return true;
+}