path: root/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
author    shadchin <shadchin@yandex-team.ru>    2022-02-10 16:44:30 +0300
committer Daniil Cherednik <dcherednik@yandex-team.ru>    2022-02-10 16:44:30 +0300
commit    2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch)
tree      012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
parent    6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff)
download  ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp')
-rw-r--r--  contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp | 1352
1 file changed, 676 insertions, 676 deletions
diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 8901934013..e4e4252041 100644
--- a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -20,18 +20,18 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
@@ -50,38 +50,38 @@ static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
-static cl::opt<bool>
- AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
- cl::desc("Enable the generation of WLS loops"));
-
+static cl::opt<bool>
+ AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
+ cl::desc("Enable the generation of WLS loops"));
+
extern cl::opt<TailPredication::Mode> EnableTailPredication;
extern cl::opt<bool> EnableMaskedGatherScatters;
-extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
-
-/// Convert a vector load intrinsic into a simple llvm load instruction.
-/// This is beneficial when the underlying object being addressed comes
-/// from a constant, since we get constant-folding for free.
-static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
- InstCombiner::BuilderTy &Builder) {
- auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
-
- if (!IntrAlign)
- return nullptr;
-
- unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
- ? MemAlign
- : IntrAlign->getLimitedValue();
-
- if (!isPowerOf2_32(Alignment))
- return nullptr;
-
- auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
- PointerType::get(II.getType(), 0));
- return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
-}
-
+extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
+
+/// Convert a vector load intrinsic into a simple llvm load instruction.
+/// This is beneficial when the underlying object being addressed comes
+/// from a constant, since we get constant-folding for free.
+static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
+ InstCombiner::BuilderTy &Builder) {
+ auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
+
+ if (!IntrAlign)
+ return nullptr;
+
+ unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
+ ? MemAlign
+ : IntrAlign->getLimitedValue();
+
+ if (!isPowerOf2_32(Alignment))
+ return nullptr;
+
+ auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
+ PointerType::get(II.getType(), 0));
+ return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
+}
+
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -114,138 +114,138 @@ bool ARMTTIImpl::shouldFavorPostInc() const {
return false;
}
-Optional<Instruction *>
-ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
- using namespace PatternMatch;
- Intrinsic::ID IID = II.getIntrinsicID();
- switch (IID) {
- default:
- break;
- case Intrinsic::arm_neon_vld1: {
- Align MemAlign =
- getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
- &IC.getAssumptionCache(), &IC.getDominatorTree());
- if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- }
-
- case Intrinsic::arm_neon_vld2:
- case Intrinsic::arm_neon_vld3:
- case Intrinsic::arm_neon_vld4:
- case Intrinsic::arm_neon_vld2lane:
- case Intrinsic::arm_neon_vld3lane:
- case Intrinsic::arm_neon_vld4lane:
- case Intrinsic::arm_neon_vst1:
- case Intrinsic::arm_neon_vst2:
- case Intrinsic::arm_neon_vst3:
- case Intrinsic::arm_neon_vst4:
- case Intrinsic::arm_neon_vst2lane:
- case Intrinsic::arm_neon_vst3lane:
- case Intrinsic::arm_neon_vst4lane: {
- Align MemAlign =
- getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
- &IC.getAssumptionCache(), &IC.getDominatorTree());
- unsigned AlignArg = II.getNumArgOperands() - 1;
- Value *AlignArgOp = II.getArgOperand(AlignArg);
- MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
- if (Align && *Align < MemAlign) {
- return IC.replaceOperand(
- II, AlignArg,
- ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
- false));
- }
- break;
- }
-
- case Intrinsic::arm_mve_pred_i2v: {
- Value *Arg = II.getArgOperand(0);
- Value *ArgArg;
- if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
- PatternMatch::m_Value(ArgArg))) &&
- II.getType() == ArgArg->getType()) {
- return IC.replaceInstUsesWith(II, ArgArg);
- }
- Constant *XorMask;
- if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
- PatternMatch::m_Value(ArgArg)),
- PatternMatch::m_Constant(XorMask))) &&
- II.getType() == ArgArg->getType()) {
- if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
- if (CI->getValue().trunc(16).isAllOnesValue()) {
- auto TrueVector = IC.Builder.CreateVectorSplat(
- cast<FixedVectorType>(II.getType())->getNumElements(),
- IC.Builder.getTrue());
- return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
- }
- }
- }
- KnownBits ScalarKnown(32);
- if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
- ScalarKnown, 0)) {
- return &II;
- }
- break;
- }
- case Intrinsic::arm_mve_pred_v2i: {
- Value *Arg = II.getArgOperand(0);
- Value *ArgArg;
- if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
- PatternMatch::m_Value(ArgArg)))) {
- return IC.replaceInstUsesWith(II, ArgArg);
- }
- if (!II.getMetadata(LLVMContext::MD_range)) {
- Type *IntTy32 = Type::getInt32Ty(II.getContext());
- Metadata *M[] = {
- ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
- ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
- II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
- return &II;
- }
- break;
- }
- case Intrinsic::arm_mve_vadc:
- case Intrinsic::arm_mve_vadc_predicated: {
- unsigned CarryOp =
- (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
- assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
- "Bad type for intrinsic!");
-
- KnownBits CarryKnown(32);
- if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
- CarryKnown)) {
- return &II;
- }
- break;
- }
- case Intrinsic::arm_mve_vmldava: {
- Instruction *I = cast<Instruction>(&II);
- if (I->hasOneUse()) {
- auto *User = cast<Instruction>(*I->user_begin());
- Value *OpZ;
- if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
- match(I->getOperand(3), m_Zero())) {
- Value *OpX = I->getOperand(4);
- Value *OpY = I->getOperand(5);
- Type *OpTy = OpX->getType();
-
- IC.Builder.SetInsertPoint(User);
- Value *V =
- IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
- {I->getOperand(0), I->getOperand(1),
- I->getOperand(2), OpZ, OpX, OpY});
-
- IC.replaceInstUsesWith(*User, V);
- return IC.eraseInstFromFunction(*User);
- }
- }
- return None;
- }
- }
- return None;
-}
-
+Optional<Instruction *>
+ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ using namespace PatternMatch;
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::arm_neon_vld1: {
+ Align MemAlign =
+ getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
+ &IC.getAssumptionCache(), &IC.getDominatorTree());
+ if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+ }
+
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane: {
+ Align MemAlign =
+ getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
+ &IC.getAssumptionCache(), &IC.getDominatorTree());
+ unsigned AlignArg = II.getNumArgOperands() - 1;
+ Value *AlignArgOp = II.getArgOperand(AlignArg);
+ MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
+ if (Align && *Align < MemAlign) {
+ return IC.replaceOperand(
+ II, AlignArg,
+ ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
+ false));
+ }
+ break;
+ }
+
+ case Intrinsic::arm_mve_pred_i2v: {
+ Value *Arg = II.getArgOperand(0);
+ Value *ArgArg;
+ if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
+ PatternMatch::m_Value(ArgArg))) &&
+ II.getType() == ArgArg->getType()) {
+ return IC.replaceInstUsesWith(II, ArgArg);
+ }
+ Constant *XorMask;
+ if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
+ PatternMatch::m_Value(ArgArg)),
+ PatternMatch::m_Constant(XorMask))) &&
+ II.getType() == ArgArg->getType()) {
+ if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
+ if (CI->getValue().trunc(16).isAllOnesValue()) {
+ auto TrueVector = IC.Builder.CreateVectorSplat(
+ cast<FixedVectorType>(II.getType())->getNumElements(),
+ IC.Builder.getTrue());
+ return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
+ }
+ }
+ }
+ KnownBits ScalarKnown(32);
+ if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
+ ScalarKnown, 0)) {
+ return &II;
+ }
+ break;
+ }
+ case Intrinsic::arm_mve_pred_v2i: {
+ Value *Arg = II.getArgOperand(0);
+ Value *ArgArg;
+ if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
+ PatternMatch::m_Value(ArgArg)))) {
+ return IC.replaceInstUsesWith(II, ArgArg);
+ }
+ if (!II.getMetadata(LLVMContext::MD_range)) {
+ Type *IntTy32 = Type::getInt32Ty(II.getContext());
+ Metadata *M[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
+ ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
+ II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
+ return &II;
+ }
+ break;
+ }
+ case Intrinsic::arm_mve_vadc:
+ case Intrinsic::arm_mve_vadc_predicated: {
+ unsigned CarryOp =
+ (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
+ assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
+ "Bad type for intrinsic!");
+
+ KnownBits CarryKnown(32);
+ if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
+ CarryKnown)) {
+ return &II;
+ }
+ break;
+ }
+ case Intrinsic::arm_mve_vmldava: {
+ Instruction *I = cast<Instruction>(&II);
+ if (I->hasOneUse()) {
+ auto *User = cast<Instruction>(*I->user_begin());
+ Value *OpZ;
+ if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
+ match(I->getOperand(3), m_Zero())) {
+ Value *OpX = I->getOperand(4);
+ Value *OpY = I->getOperand(5);
+ Type *OpTy = OpX->getType();
+
+ IC.Builder.SetInsertPoint(User);
+ Value *V =
+ IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
+ {I->getOperand(0), I->getOperand(1),
+ I->getOperand(2), OpZ, OpX, OpY});
+
+ IC.replaceInstUsesWith(*User, V);
+ return IC.eraseInstFromFunction(*User);
+ }
+ }
+ return None;
+ }
+ }
+ return None;
+}
+
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
@@ -289,43 +289,43 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
return 1;
}
-// Checks whether Inst is part of a min(max()) or max(min()) pattern
-// that will match to an SSAT instruction
-static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
- Value *LHS, *RHS;
- ConstantInt *C;
- SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
-
- if (InstSPF == SPF_SMAX &&
- PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
- C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
-
- auto isSSatMin = [&](Value *MinInst) {
- if (isa<SelectInst>(MinInst)) {
- Value *MinLHS, *MinRHS;
- ConstantInt *MinC;
- SelectPatternFlavor MinSPF =
- matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
- if (MinSPF == SPF_SMIN &&
- PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
- MinC->getValue() == ((-Imm) - 1))
- return true;
- }
- return false;
- };
-
- if (isSSatMin(Inst->getOperand(1)) ||
- (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
- isSSatMin(*(++Inst->user_begin())))))
- return true;
- }
- return false;
-}
-
-int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+// Checks whether Inst is part of a min(max()) or max(min()) pattern
+// that will match to an SSAT instruction
+static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
+ Value *LHS, *RHS;
+ ConstantInt *C;
+ SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
+
+ if (InstSPF == SPF_SMAX &&
+ PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
+ C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
+
+ auto isSSatMin = [&](Value *MinInst) {
+ if (isa<SelectInst>(MinInst)) {
+ Value *MinLHS, *MinRHS;
+ ConstantInt *MinC;
+ SelectPatternFlavor MinSPF =
+ matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
+ if (MinSPF == SPF_SMIN &&
+ PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
+ MinC->getValue() == ((-Imm) - 1))
+ return true;
+ }
+ return false;
+ };
+
+ if (isSSatMin(Inst->getOperand(1)) ||
+ (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
+ isSSatMin(*(++Inst->user_begin())))))
+ return true;
+ }
+ return false;
+}
+
+int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
// Division by a constant can be turned into multiplication, but only if we
// know it's constant. So it's not so much that the immediate is cheap (it's
// not), but that the alternative is worse.
@@ -364,33 +364,33 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
return 0;
- // Ensures negative constant of min(max()) or max(min()) patterns that
- // match to SSAT instructions don't get hoisted
- if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
- Ty->getIntegerBitWidth() <= 32) {
- if (isSSATMinMaxPattern(Inst, Imm) ||
- (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
- isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
- return 0;
- }
-
+ // Ensures negative constant of min(max()) or max(min()) patterns that
+ // match to SSAT instructions don't get hoisted
+ if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
+ Ty->getIntegerBitWidth() <= 32) {
+ if (isSSATMinMaxPattern(Inst, Imm) ||
+ (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
+ isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
+ return 0;
+ }
+
return getIntImmCost(Imm, Ty, CostKind);
}
-int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
- if (CostKind == TTI::TCK_RecipThroughput &&
- (ST->hasNEON() || ST->hasMVEIntegerOps())) {
- // FIXME: The vectorizer is highly sensitive to the cost of these
- // instructions, which suggests that it may be using the costs incorrectly.
- // But, for now, just make them free to avoid performance regressions for
- // vector targets.
- return 0;
- }
- return BaseT::getCFInstrCost(Opcode, CostKind);
-}
-
+int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ if (CostKind == TTI::TCK_RecipThroughput &&
+ (ST->hasNEON() || ST->hasMVEIntegerOps())) {
+ // FIXME: The vectorizer is highly sensitive to the cost of these
+ // instructions, which suggests that it may be using the costs incorrectly.
+ // But, for now, just make them free to avoid performance regressions for
+ // vector targets.
+ return 0;
+ }
+ return BaseT::getCFInstrCost(Opcode, CostKind);
+}
+
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
+ TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -402,35 +402,35 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Cost == 0 ? 0 : 1;
return Cost;
};
- auto IsLegalFPType = [this](EVT VT) {
- EVT EltVT = VT.getScalarType();
- return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
- (EltVT == MVT::f64 && ST->hasFP64()) ||
- (EltVT == MVT::f16 && ST->hasFullFP16());
- };
+ auto IsLegalFPType = [this](EVT VT) {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
+ (EltVT == MVT::f64 && ST->hasFP64()) ||
+ (EltVT == MVT::f16 && ST->hasFullFP16());
+ };
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return AdjustCost(
- BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
-
- // Extending masked load/Truncating masked stores is expensive because we
- // currently don't split them. This means that we'll likely end up
- // loading/storing each element individually (hence the high cost).
- if ((ST->hasMVEIntegerOps() &&
- (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
- Opcode == Instruction::SExt)) ||
- (ST->hasMVEFloatOps() &&
- (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
- IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
- if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
- return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();
-
- // The extend of other kinds of load is free
- if (CCH == TTI::CastContextHint::Normal ||
- CCH == TTI::CastContextHint::Masked) {
+ return AdjustCost(
+ BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+
+ // Extending masked load/Truncating masked stores is expensive because we
+ // currently don't split them. This means that we'll likely end up
+ // loading/storing each element individually (hence the high cost).
+ if ((ST->hasMVEIntegerOps() &&
+ (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
+ Opcode == Instruction::SExt)) ||
+ (ST->hasMVEFloatOps() &&
+ (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
+ IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
+ if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
+ return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();
+
+ // The extend of other kinds of load is free
+ if (CCH == TTI::CastContextHint::Normal ||
+ CCH == TTI::CastContextHint::Masked) {
static const TypeConversionCostTblEntry LoadConversionTbl[] = {
{ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
{ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
@@ -485,31 +485,31 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
- // The truncate of a store is free. This is the mirror of extends above.
- static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
+ // The truncate of a store is free. This is the mirror of extends above.
+ static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
{ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
{ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
{ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
{ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
- {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
+ {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
{ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
{ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
};
if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
if (const auto *Entry =
- ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
- SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
+ ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
+ SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
- static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
+ static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
{ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
{ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
};
if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
if (const auto *Entry =
- ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
- SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
+ ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
+ SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
}
@@ -746,24 +746,24 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (SrcTy.isFixedLengthVector())
Lanes = SrcTy.getVectorNumElements();
- if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
+ if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
return Lanes;
else
return Lanes * CallCost;
}
- if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
- SrcTy.isFixedLengthVector()) {
- // Treat a truncate with larger than legal source (128bits for MVE) as
- // expensive, 2 instructions per lane.
- if ((SrcTy.getScalarType() == MVT::i8 ||
- SrcTy.getScalarType() == MVT::i16 ||
- SrcTy.getScalarType() == MVT::i32) &&
- SrcTy.getSizeInBits() > 128 &&
- SrcTy.getSizeInBits() > DstTy.getSizeInBits())
- return SrcTy.getVectorNumElements() * 2;
- }
-
+ if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
+ SrcTy.isFixedLengthVector()) {
+ // Treat a truncate with larger than legal source (128bits for MVE) as
+ // expensive, 2 instructions per lane.
+ if ((SrcTy.getScalarType() == MVT::i8 ||
+ SrcTy.getScalarType() == MVT::i16 ||
+ SrcTy.getScalarType() == MVT::i32) &&
+ SrcTy.getSizeInBits() > 128 &&
+ SrcTy.getSizeInBits() > DstTy.getSizeInBits())
+ return SrcTy.getVectorNumElements() * 2;
+ }
+
// Scalar integer conversion costs.
static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
// i16 -> i64 requires two dependent operations.
@@ -787,7 +787,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
? ST->getMVEVectorCostFactor()
: 1;
return AdjustCost(
- BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+ BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -827,37 +827,37 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I) {
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
-
- // Thumb scalar code size cost for select.
- if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
- ST->isThumb() && !ValTy->isVectorTy()) {
- // Assume expensive structs.
- if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
- return TTI::TCC_Expensive;
-
- // Select costs can vary because they:
- // - may require one or more conditional mov (including an IT),
- // - can't operate directly on immediates,
- // - require live flags, which we can't copy around easily.
- int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
-
- // Possible IT instruction for Thumb2, or more for Thumb1.
- ++Cost;
-
- // i1 values may need rematerialising by using mov immediates and/or
- // flag setting instructions.
- if (ValTy->isIntegerTy(1))
- ++Cost;
-
- return Cost;
- }
-
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ // Thumb scalar code size cost for select.
+ if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
+ ST->isThumb() && !ValTy->isVectorTy()) {
+ // Assume expensive structs.
+ if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
+ return TTI::TCC_Expensive;
+
+ // Select costs can vary because they:
+ // - may require one or more conditional mov (including an IT),
+ // - can't operate directly on immediates,
+ // - require live flags, which we can't copy around easily.
+ int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
+
+ // Possible IT instruction for Thumb2, or more for Thumb1.
+ ++Cost;
+
+ // i1 values may need rematerialising by using mov immediates and/or
+ // flag setting instructions.
+ if (ValTy->isIntegerTy(1))
+ ++Cost;
+
+ return Cost;
+ }
+
// On NEON a vector select gets lowered to vbsl.
- if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
+ if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
// Lowering of some vector selects is currently far from perfect.
static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
@@ -878,15 +878,15 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
return LT.first;
}
- // Default to cheap (throughput/size of 1 instruction) but adjust throughput
- // for "multiple beats" potentially needed by MVE instructions.
- int BaseCost = 1;
- if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
- ValTy->isVectorTy())
- BaseCost = ST->getMVEVectorCostFactor();
-
- return BaseCost *
- BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
+ // Default to cheap (throughput/size of 1 instruction) but adjust throughput
+ // for "multiple beats" potentially needed by MVE instructions.
+ int BaseCost = 1;
+ if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
+ ValTy->isVectorTy())
+ BaseCost = ST->getMVEVectorCostFactor();
+
+ return BaseCost *
+ BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -968,85 +968,85 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
(EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}
-/// Given a memcpy/memset/memmove instruction, return the number of memory
-/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
-/// call is used.
-int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
- MemOp MOp;
- unsigned DstAddrSpace = ~0u;
- unsigned SrcAddrSpace = ~0u;
- const Function *F = I->getParent()->getParent();
-
- if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
- ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
- // If 'size' is not a constant, a library call will be generated.
- if (!C)
- return -1;
-
- const unsigned Size = C->getValue().getZExtValue();
- const Align DstAlign = *MC->getDestAlign();
- const Align SrcAlign = *MC->getSourceAlign();
-
- MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
- /*IsVolatile*/ false);
- DstAddrSpace = MC->getDestAddressSpace();
- SrcAddrSpace = MC->getSourceAddressSpace();
- }
- else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
- ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
- // If 'size' is not a constant, a library call will be generated.
- if (!C)
- return -1;
-
- const unsigned Size = C->getValue().getZExtValue();
- const Align DstAlign = *MS->getDestAlign();
-
- MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
- /*IsZeroMemset*/ false, /*IsVolatile*/ false);
- DstAddrSpace = MS->getDestAddressSpace();
- }
- else
- llvm_unreachable("Expected a memcpy/move or memset!");
-
- unsigned Limit, Factor = 2;
- switch(I->getIntrinsicID()) {
- case Intrinsic::memcpy:
- Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
- break;
- case Intrinsic::memmove:
- Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
- break;
- case Intrinsic::memset:
- Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
- Factor = 1;
- break;
- default:
- llvm_unreachable("Expected a memcpy/move or memset!");
- }
-
+/// Given a memcpy/memset/memmove instruction, return the number of memory
+/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
+/// call is used.
+int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
+ MemOp MOp;
+ unsigned DstAddrSpace = ~0u;
+ unsigned SrcAddrSpace = ~0u;
+ const Function *F = I->getParent()->getParent();
+
+ if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
+ ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
+ // If 'size' is not a constant, a library call will be generated.
+ if (!C)
+ return -1;
+
+ const unsigned Size = C->getValue().getZExtValue();
+ const Align DstAlign = *MC->getDestAlign();
+ const Align SrcAlign = *MC->getSourceAlign();
+
+ MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
+ /*IsVolatile*/ false);
+ DstAddrSpace = MC->getDestAddressSpace();
+ SrcAddrSpace = MC->getSourceAddressSpace();
+ }
+ else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
+ ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
+ // If 'size' is not a constant, a library call will be generated.
+ if (!C)
+ return -1;
+
+ const unsigned Size = C->getValue().getZExtValue();
+ const Align DstAlign = *MS->getDestAlign();
+
+ MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
+ /*IsZeroMemset*/ false, /*IsVolatile*/ false);
+ DstAddrSpace = MS->getDestAddressSpace();
+ }
+ else
+ llvm_unreachable("Expected a memcpy/move or memset!");
+
+ unsigned Limit, Factor = 2;
+ switch(I->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
+ break;
+ case Intrinsic::memmove:
+ Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
+ break;
+ case Intrinsic::memset:
+ Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
+ Factor = 1;
+ break;
+ default:
+ llvm_unreachable("Expected a memcpy/move or memset!");
+ }
+
// MemOps will be populated with a list of data types that need to be
// loaded and stored. That's why we multiply the number of elements by 2 to
// get the cost for this memcpy.
- std::vector<EVT> MemOps;
+ std::vector<EVT> MemOps;
if (getTLI()->findOptimalMemOpLowering(
- MemOps, Limit, MOp, DstAddrSpace,
- SrcAddrSpace, F->getAttributes()))
- return MemOps.size() * Factor;
+ MemOps, Limit, MOp, DstAddrSpace,
+ SrcAddrSpace, F->getAttributes()))
+ return MemOps.size() * Factor;
// If we can't find an optimal memop lowering, return the default cost
- return -1;
-}
-
-int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
- int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
-
- // To model the cost of a library call, we assume 1 for the call, and
- // 3 for the argument setup.
- if (NumOps == -1)
- return 4;
- return NumOps;
+ return -1;
}
+int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+ int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
+
+ // To model the cost of a library call, we assume 1 for the call, and
+ // 3 for the argument setup.
+ if (NumOps == -1)
+ return 4;
+ return NumOps;
+}
+
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
int Index, VectorType *SubTp) {
if (ST->hasNEON()) {
@@ -1149,21 +1149,21 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
TTI::OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
- int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
- if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
- // Make operations on i1 relatively expensive as this often involves
- // combining predicates. AND and XOR should be easier to handle with IT
- // blocks.
- switch (ISDOpcode) {
- default:
- break;
- case ISD::AND:
- case ISD::XOR:
- return 2;
- case ISD::OR:
- return 3;
- }
- }
+ int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
+ if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
+ // Make operations on i1 relatively expensive as this often involves
+ // combining predicates. AND and XOR should be easier to handle with IT
+ // blocks.
+ switch (ISDOpcode) {
+ default:
+ break;
+ case ISD::AND:
+ case ISD::XOR:
+ return 2;
+ case ISD::OR:
+ return 3;
+ }
+ }
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -1259,12 +1259,12 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (LooksLikeAFreeShift())
return 0;
- // Default to cheap (throughput/size of 1 instruction) but adjust throughput
- // for "multiple beats" potentially needed by MVE instructions.
- int BaseCost = 1;
- if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
- Ty->isVectorTy())
- BaseCost = ST->getMVEVectorCostFactor();
+ // Default to cheap (throughput/size of 1 instruction) but adjust throughput
+ // for "multiple beats" potentially needed by MVE instructions.
+ int BaseCost = 1;
+ if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
+ Ty->isVectorTy())
+ BaseCost = ST->getMVEVectorCostFactor();
// The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
// without treating floats as more expensive than scalars or increasing the
@@ -1331,24 +1331,24 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
CostKind, I);
}
-unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
- Align Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind) {
- if (ST->hasMVEIntegerOps()) {
- if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
- return ST->getMVEVectorCostFactor();
- if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
- return ST->getMVEVectorCostFactor();
- }
- if (!isa<FixedVectorType>(Src))
- return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
- CostKind);
- // Scalar cost, which is currently very high due to the efficiency of the
- // generated code.
- return cast<FixedVectorType>(Src)->getNumElements() * 8;
-}
-
+unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
+ if (ST->hasMVEIntegerOps()) {
+ if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
+ return ST->getMVEVectorCostFactor();
+ if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
+ return ST->getMVEVectorCostFactor();
+ }
+ if (!isa<FixedVectorType>(Src))
+ return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ // Scalar cost, which is currently very high due to the efficiency of the
+ // generated code.
+ return cast<FixedVectorType>(Src)->getNumElements() * 8;
+}
+
int ARMTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -1379,8 +1379,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
// promoted differently). The cost of 2 here is then a load and vrev or
// vmovn.
if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
- VecTy->isIntOrIntVectorTy() &&
- DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
+ VecTy->isIntOrIntVectorTy() &&
+ DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
return 2 * BaseCost;
}
@@ -1413,13 +1413,13 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
// multiplied by the number of elements being loaded. This is possibly very
// conservative, but even so we still end up vectorising loops because the
// cost per iteration for many loops is lower than for scalar loops.
- unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor();
+ unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor();
// The scalarization cost should be a lot higher. We use the number of vector
// elements plus the scalarization overhead.
unsigned ScalarCost =
NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {});
- if (EltSize < 8 || Alignment < EltSize / 8)
+ if (EltSize < 8 || Alignment < EltSize / 8)
return ScalarCost;
unsigned ExtSize = EltSize;
@@ -1488,92 +1488,92 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
return ScalarCost;
}
-int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
- bool IsPairwiseForm,
- TTI::TargetCostKind CostKind) {
- EVT ValVT = TLI->getValueType(DL, ValTy);
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
- if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
- CostKind);
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
-
- static const CostTblEntry CostTblAdd[]{
- {ISD::ADD, MVT::v16i8, 1},
- {ISD::ADD, MVT::v8i16, 1},
- {ISD::ADD, MVT::v4i32, 1},
- };
- if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
- return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first;
-
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
- CostKind);
-}
-
-InstructionCost
-ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
- Type *ResTy, VectorType *ValTy,
- TTI::TargetCostKind CostKind) {
- EVT ValVT = TLI->getValueType(DL, ValTy);
- EVT ResVT = TLI->getValueType(DL, ResTy);
- if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
- if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
- (LT.second == MVT::v8i16 &&
- ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
- (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
- return ST->getMVEVectorCostFactor() * LT.first;
- }
-
- return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
- CostKind);
-}
-
-int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind) {
- switch (ICA.getID()) {
- case Intrinsic::get_active_lane_mask:
- // Currently we make a somewhat optimistic assumption that
- // active_lane_mask's are always free. In reality it may be freely folded
- // into a tail predicated loop, expanded into a VCPT or expanded into a lot
- // of add/icmp code. We may need to improve this in the future, but being
- // able to detect if it is free or not involves looking at a lot of other
- // code. We currently assume that the vectorizer inserted these, and knew
- // what it was doing in adding one.
- if (ST->hasMVEIntegerOps())
- return 0;
- break;
- case Intrinsic::sadd_sat:
- case Intrinsic::ssub_sat:
- case Intrinsic::uadd_sat:
- case Intrinsic::usub_sat: {
- if (!ST->hasMVEIntegerOps())
- break;
- // Get the Return type, either directly or from ICA.ReturnType and ICA.VF.
- Type *VT = ICA.getReturnType();
- if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
- VT = VectorType::get(VT, ICA.getVectorFactor());
-
- std::pair<int, MVT> LT =
- TLI->getTypeLegalizationCost(DL, VT);
- if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
- LT.second == MVT::v16i8) {
- // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
- // need to extend the type, as it uses shr(qadd(shl, shl)).
- unsigned Instrs = LT.second.getScalarSizeInBits() ==
- ICA.getReturnType()->getScalarSizeInBits()
- ? 1
- : 4;
- return LT.first * ST->getMVEVectorCostFactor() * Instrs;
- }
- break;
- }
- }
-
- return BaseT::getIntrinsicInstrCost(ICA, CostKind);
-}
-
+int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind) {
+ EVT ValVT = TLI->getValueType(DL, ValTy);
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ static const CostTblEntry CostTblAdd[]{
+ {ISD::ADD, MVT::v16i8, 1},
+ {ISD::ADD, MVT::v8i16, 1},
+ {ISD::ADD, MVT::v4i32, 1},
+ };
+ if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
+ return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first;
+
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
+}
+
+InstructionCost
+ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
+ Type *ResTy, VectorType *ValTy,
+ TTI::TargetCostKind CostKind) {
+ EVT ValVT = TLI->getValueType(DL, ValTy);
+ EVT ResVT = TLI->getValueType(DL, ResTy);
+ if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
+ (LT.second == MVT::v8i16 &&
+ ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
+ (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
+ return ST->getMVEVectorCostFactor() * LT.first;
+ }
+
+ return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
+ CostKind);
+}
+
+int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ switch (ICA.getID()) {
+ case Intrinsic::get_active_lane_mask:
+ // Currently we make a somewhat optimistic assumption that
+ // active_lane_mask's are always free. In reality it may be freely folded
+ // into a tail predicated loop, expanded into a VCPT or expanded into a lot
+ // of add/icmp code. We may need to improve this in the future, but being
+ // able to detect if it is free or not involves looking at a lot of other
+ // code. We currently assume that the vectorizer inserted these, and knew
+ // what it was doing in adding one.
+ if (ST->hasMVEIntegerOps())
+ return 0;
+ break;
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat: {
+ if (!ST->hasMVEIntegerOps())
+ break;
+ // Get the Return type, either directly or from ICA.ReturnType and ICA.VF.
+ Type *VT = ICA.getReturnType();
+ if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
+ VT = VectorType::get(VT, ICA.getVectorFactor());
+
+ std::pair<int, MVT> LT =
+ TLI->getTypeLegalizationCost(DL, VT);
+ if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
+ LT.second == MVT::v16i8) {
+ // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
+ // need to extend the type, as it uses shr(qadd(shl, shl)).
+ unsigned Instrs = LT.second.getScalarSizeInBits() ==
+ ICA.getReturnType()->getScalarSizeInBits()
+ ? 1
+ : 4;
+ return LT.first * ST->getMVEVectorCostFactor() * Instrs;
+ }
+ break;
+ }
+ }
+
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+}
+
bool ARMTTIImpl::isLoweredToCall(const Function *F) {
if (!F->isIntrinsic())
BaseT::isLoweredToCall(F);
@@ -1635,93 +1635,93 @@ bool ARMTTIImpl::isLoweredToCall(const Function *F) {
return BaseT::isLoweredToCall(F);
}
-bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
- unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
- EVT VT = TLI->getValueType(DL, I.getType(), true);
- if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
- return true;
-
- // Check if an intrinsic will be lowered to a call and assume that any
- // other CallInst will generate a bl.
- if (auto *Call = dyn_cast<CallInst>(&I)) {
- if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
- switch(II->getIntrinsicID()) {
- case Intrinsic::memcpy:
- case Intrinsic::memset:
- case Intrinsic::memmove:
- return getNumMemOps(II) == -1;
- default:
- if (const Function *F = Call->getCalledFunction())
- return isLoweredToCall(F);
- }
- }
- return true;
- }
-
- // FPv5 provides conversions between integer, double-precision,
- // single-precision, and half-precision formats.
- switch (I.getOpcode()) {
- default:
- break;
- case Instruction::FPToSI:
- case Instruction::FPToUI:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- return !ST->hasFPARMv8Base();
- }
-
- // FIXME: Unfortunately the approach of checking the Operation Action does
- // not catch all cases of Legalization that use library calls. Our
- // Legalization step categorizes some transformations into library calls as
- // Custom, Expand or even Legal when doing type legalization. So for now
- // we have to special case for instance the SDIV of 64bit integers and the
- // use of floating point emulation.
- if (VT.isInteger() && VT.getSizeInBits() >= 64) {
- switch (ISD) {
- default:
- break;
- case ISD::SDIV:
- case ISD::UDIV:
- case ISD::SREM:
- case ISD::UREM:
- case ISD::SDIVREM:
- case ISD::UDIVREM:
- return true;
- }
- }
-
- // Assume all other non-float operations are supported.
- if (!VT.isFloatingPoint())
- return false;
-
- // We'll need a library call to handle most floats when using soft.
- if (TLI->useSoftFloat()) {
- switch (I.getOpcode()) {
- default:
- return true;
- case Instruction::Alloca:
- case Instruction::Load:
- case Instruction::Store:
- case Instruction::Select:
- case Instruction::PHI:
- return false;
- }
- }
-
- // We'll need a libcall to perform double precision operations on a single
- // precision only FPU.
- if (I.getType()->isDoubleTy() && !ST->hasFP64())
- return true;
-
- // Likewise for half precision arithmetic.
- if (I.getType()->isHalfTy() && !ST->hasFullFP16())
- return true;
-
- return false;
-}
-
+bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
+ unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+ EVT VT = TLI->getValueType(DL, I.getType(), true);
+ if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+ return true;
+
+ // Check if an intrinsic will be lowered to a call and assume that any
+ // other CallInst will generate a bl.
+ if (auto *Call = dyn_cast<CallInst>(&I)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
+ switch(II->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ return getNumMemOps(II) == -1;
+ default:
+ if (const Function *F = Call->getCalledFunction())
+ return isLoweredToCall(F);
+ }
+ }
+ return true;
+ }
+
+ // FPv5 provides conversions between integer, double-precision,
+ // single-precision, and half-precision formats.
+ switch (I.getOpcode()) {
+ default:
+ break;
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ return !ST->hasFPARMv8Base();
+ }
+
+ // FIXME: Unfortunately the approach of checking the Operation Action does
+ // not catch all cases of Legalization that use library calls. Our
+ // Legalization step categorizes some transformations into library calls as
+ // Custom, Expand or even Legal when doing type legalization. So for now
+ // we have to special case for instance the SDIV of 64bit integers and the
+ // use of floating point emulation.
+ if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+ switch (ISD) {
+ default:
+ break;
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ return true;
+ }
+ }
+
+ // Assume all other non-float operations are supported.
+ if (!VT.isFloatingPoint())
+ return false;
+
+ // We'll need a library call to handle most floats when using soft.
+ if (TLI->useSoftFloat()) {
+ switch (I.getOpcode()) {
+ default:
+ return true;
+ case Instruction::Alloca:
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::Select:
+ case Instruction::PHI:
+ return false;
+ }
+ }
+
+ // We'll need a libcall to perform double precision operations on a single
+ // precision only FPU.
+ if (I.getType()->isDoubleTy() && !ST->hasFP64())
+ return true;
+
+ // Likewise for half precision arithmetic.
+ if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+ return true;
+
+ return false;
+}
+
bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
@@ -1762,7 +1762,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
switch (Call->getIntrinsicID()) {
default:
break;
- case Intrinsic::start_loop_iterations:
+ case Intrinsic::start_loop_iterations:
case Intrinsic::test_set_loop_iterations:
case Intrinsic::loop_decrement:
case Intrinsic::loop_decrement_reg:
@@ -1773,24 +1773,24 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
};
// Scan the instructions to see if there's any that we know will turn into a
- // call or if this loop is already a low-overhead loop or will become a tail
- // predicated loop.
- bool IsTailPredLoop = false;
+ // call or if this loop is already a low-overhead loop or will become a tail
+ // predicated loop.
+ bool IsTailPredLoop = false;
auto ScanLoop = [&](Loop *L) {
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
- isa<InlineAsm>(I)) {
+ if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
+ isa<InlineAsm>(I)) {
LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
return false;
}
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- IsTailPredLoop |=
- II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
- II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
- II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
- II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
- II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ IsTailPredLoop |=
+ II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
}
}
return true;
@@ -1811,7 +1811,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
LLVMContext &C = L->getHeader()->getContext();
HWLoopInfo.CounterInReg = true;
HWLoopInfo.IsNestingLegal = false;
- HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
+ HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
HWLoopInfo.CountType = Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;
@@ -1859,28 +1859,28 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const LoopAccessInfo *LAI) {
LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
- // If there are live-out values, it is probably a reduction. We can predicate
- // most reduction operations freely under MVE using a combination of
- // prefer-predicated-reduction-select and inloop reductions. We limit this to
- // floating point and integer reductions, but don't check for operators
- // specifically here. If the value ends up not being a reduction (and so the
- // vectorizer cannot tailfold the loop), we should fall back to standard
- // vectorization automatically.
+ // If there are live-out values, it is probably a reduction. We can predicate
+ // most reduction operations freely under MVE using a combination of
+ // prefer-predicated-reduction-select and inloop reductions. We limit this to
+ // floating point and integer reductions, but don't check for operators
+ // specifically here. If the value ends up not being a reduction (and so the
+ // vectorizer cannot tailfold the loop), we should fall back to standard
+ // vectorization automatically.
SmallVector< Instruction *, 8 > LiveOuts;
LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
- bool ReductionsDisabled =
+ bool ReductionsDisabled =
EnableTailPredication == TailPredication::EnabledNoReductions ||
EnableTailPredication == TailPredication::ForceEnabledNoReductions;
for (auto *I : LiveOuts) {
- if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
- !I->getType()->isHalfTy()) {
- LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
+ if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
+ !I->getType()->isHalfTy()) {
+ LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
"live-out value\n");
return false;
}
- if (ReductionsDisabled) {
- LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
+ if (ReductionsDisabled) {
+ LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
return false;
}
}
@@ -1910,35 +1910,35 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
int64_t NextStride = getPtrStride(PSE, Ptr, L);
- if (NextStride == 1) {
- // TODO: for now only allow consecutive strides of 1. We could support
- // other strides as long as it is uniform, but let's keep it simple
- // for now.
+ if (NextStride == 1) {
+ // TODO: for now only allow consecutive strides of 1. We could support
+ // other strides as long as it is uniform, but let's keep it simple
+ // for now.
continue;
- } else if (NextStride == -1 ||
- (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
- (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
- LLVM_DEBUG(dbgs()
- << "Consecutive strides of 2 found, vld2/vstr2 can't "
- "be tail-predicated\n.");
+ } else if (NextStride == -1 ||
+ (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
+ (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
+ LLVM_DEBUG(dbgs()
+ << "Consecutive strides of 2 found, vld2/vstr2 can't "
+ "be tail-predicated\n.");
return false;
- // TODO: don't tail predicate if there is a reversed load?
- } else if (EnableMaskedGatherScatters) {
- // Gather/scatters do allow loading from arbitrary strides, at
- // least if they are loop invariant.
- // TODO: Loop variant strides should in theory work, too, but
- // this requires further testing.
- const SCEV *PtrScev =
- replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
- if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
- const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
- if (PSE.getSE()->isLoopInvariant(Step, L))
- continue;
- }
+ // TODO: don't tail predicate if there is a reversed load?
+ } else if (EnableMaskedGatherScatters) {
+ // Gather/scatters do allow loading from arbitrary strides, at
+ // least if they are loop invariant.
+ // TODO: Loop variant strides should in theory work, too, but
+ // this requires further testing.
+ const SCEV *PtrScev =
+ replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
+ if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
+ const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
+ if (PSE.getSE()->isLoopInvariant(Step, L))
+ continue;
+ }
}
- LLVM_DEBUG(dbgs() << "Bad stride found, can't "
- "tail-predicate\n.");
- return false;
+ LLVM_DEBUG(dbgs() << "Bad stride found, can't "
+ "tail-predicate\n.");
+ return false;
}
}
}
@@ -1971,7 +1971,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
return false;
}
- assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
+ assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
HardwareLoopInfo HWLoopInfo(L);
if (!HWLoopInfo.canAnalyze(*LI)) {
@@ -2039,10 +2039,10 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
return;
- // Don't unroll vectorized loops, including the remainder loop
- if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
- return;
-
+ // Don't unroll vectorized loops, including the remainder loop
+ if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
+ return;
+
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining.
unsigned Cost = 0;
@@ -2061,9 +2061,9 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
return;
}
- SmallVector<const Value*, 4> Operands(I.operand_values());
- Cost +=
- getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
+ SmallVector<const Value*, 4> Operands(I.operand_values());
+ Cost +=
+ getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
}
}
@@ -2092,24 +2092,24 @@ bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return ST->hasMVEIntegerOps();
}
-
-bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
- TTI::ReductionFlags Flags) const {
- if (!ST->hasMVEIntegerOps())
- return false;
-
- unsigned ScalarBits = Ty->getScalarSizeInBits();
- switch (Opcode) {
- case Instruction::Add:
- return ScalarBits <= 64;
- default:
- return false;
- }
-}
-
-bool ARMTTIImpl::preferPredicatedReductionSelect(
- unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
- if (!ST->hasMVEIntegerOps())
- return false;
- return true;
-}
+
+bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+ switch (Opcode) {
+ case Instruction::Add:
+ return ScalarBits <= 64;
+ default:
+ return false;
+ }
+}
+
+bool ARMTTIImpl::preferPredicatedReductionSelect(
+ unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
+ if (!ST->hasMVEIntegerOps())
+ return false;
+ return true;
+}