author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300
---|---|---
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300
commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch) |
tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp |
parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff) |
download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp')
-rw-r--r-- | contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp | 1352 |
1 file changed, 676 insertions, 676 deletions
diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp index 8901934013..e4e4252041 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -20,18 +20,18 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/KnownBits.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/InstCombine/InstCombiner.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> #include <cassert> @@ -50,38 +50,38 @@ static cl::opt<bool> DisableLowOverheadLoops( "disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops")); -static cl::opt<bool> - AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), - cl::desc("Enable the generation of WLS loops")); - +static cl::opt<bool> + AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), + cl::desc("Enable the generation of WLS loops")); + extern cl::opt<TailPredication::Mode> EnableTailPredication; extern cl::opt<bool> EnableMaskedGatherScatters; -extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor; - -/// Convert a vector load intrinsic into a simple llvm load instruction. -/// This is beneficial when the underlying object being addressed comes -/// from a constant, since we get constant-folding for free. -static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, - InstCombiner::BuilderTy &Builder) { - auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); - - if (!IntrAlign) - return nullptr; - - unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign - ? MemAlign - : IntrAlign->getLimitedValue(); - - if (!isPowerOf2_32(Alignment)) - return nullptr; - - auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), - PointerType::get(II.getType(), 0)); - return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); -} - +extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor; + +/// Convert a vector load intrinsic into a simple llvm load instruction. +/// This is beneficial when the underlying object being addressed comes +/// from a constant, since we get constant-folding for free. +static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, + InstCombiner::BuilderTy &Builder) { + auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); + + if (!IntrAlign) + return nullptr; + + unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign + ? 
MemAlign + : IntrAlign->getLimitedValue(); + + if (!isPowerOf2_32(Alignment)) + return nullptr; + + auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), + PointerType::get(II.getType(), 0)); + return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); +} + bool ARMTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -114,138 +114,138 @@ bool ARMTTIImpl::shouldFavorPostInc() const { return false; } -Optional<Instruction *> -ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { - using namespace PatternMatch; - Intrinsic::ID IID = II.getIntrinsicID(); - switch (IID) { - default: - break; - case Intrinsic::arm_neon_vld1: { - Align MemAlign = - getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()); - if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - } - - case Intrinsic::arm_neon_vld2: - case Intrinsic::arm_neon_vld3: - case Intrinsic::arm_neon_vld4: - case Intrinsic::arm_neon_vld2lane: - case Intrinsic::arm_neon_vld3lane: - case Intrinsic::arm_neon_vld4lane: - case Intrinsic::arm_neon_vst1: - case Intrinsic::arm_neon_vst2: - case Intrinsic::arm_neon_vst3: - case Intrinsic::arm_neon_vst4: - case Intrinsic::arm_neon_vst2lane: - case Intrinsic::arm_neon_vst3lane: - case Intrinsic::arm_neon_vst4lane: { - Align MemAlign = - getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()); - unsigned AlignArg = II.getNumArgOperands() - 1; - Value *AlignArgOp = II.getArgOperand(AlignArg); - MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue(); - if (Align && *Align < MemAlign) { - return IC.replaceOperand( - II, AlignArg, - ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), - false)); - } - break; - } - - case Intrinsic::arm_mve_pred_i2v: { - Value *Arg = II.getArgOperand(0); - Value *ArgArg; - if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( - PatternMatch::m_Value(ArgArg))) && - II.getType() == ArgArg->getType()) { - return IC.replaceInstUsesWith(II, ArgArg); - } - Constant *XorMask; - if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( - PatternMatch::m_Value(ArgArg)), - PatternMatch::m_Constant(XorMask))) && - II.getType() == ArgArg->getType()) { - if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { - if (CI->getValue().trunc(16).isAllOnesValue()) { - auto TrueVector = IC.Builder.CreateVectorSplat( - cast<FixedVectorType>(II.getType())->getNumElements(), - IC.Builder.getTrue()); - return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); - } - } - } - KnownBits ScalarKnown(32); - if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), - ScalarKnown, 0)) { - return &II; - } - break; - } - case Intrinsic::arm_mve_pred_v2i: { - Value *Arg = II.getArgOperand(0); - Value *ArgArg; - if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>( - PatternMatch::m_Value(ArgArg)))) { - return IC.replaceInstUsesWith(II, ArgArg); - } - if (!II.getMetadata(LLVMContext::MD_range)) { - Type *IntTy32 = Type::getInt32Ty(II.getContext()); - Metadata *M[] = { - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))}; - II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); - return &II; - } - break; - } - 
case Intrinsic::arm_mve_vadc: - case Intrinsic::arm_mve_vadc_predicated: { - unsigned CarryOp = - (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; - assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && - "Bad type for intrinsic!"); - - KnownBits CarryKnown(32); - if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), - CarryKnown)) { - return &II; - } - break; - } - case Intrinsic::arm_mve_vmldava: { - Instruction *I = cast<Instruction>(&II); - if (I->hasOneUse()) { - auto *User = cast<Instruction>(*I->user_begin()); - Value *OpZ; - if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) && - match(I->getOperand(3), m_Zero())) { - Value *OpX = I->getOperand(4); - Value *OpY = I->getOperand(5); - Type *OpTy = OpX->getType(); - - IC.Builder.SetInsertPoint(User); - Value *V = - IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy}, - {I->getOperand(0), I->getOperand(1), - I->getOperand(2), OpZ, OpX, OpY}); - - IC.replaceInstUsesWith(*User, V); - return IC.eraseInstFromFunction(*User); - } - } - return None; - } - } - return None; -} - +Optional<Instruction *> +ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + using namespace PatternMatch; + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::arm_neon_vld1: { + Align MemAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + } + + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Align MemAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + unsigned AlignArg = II.getNumArgOperands() - 1; + Value *AlignArgOp = II.getArgOperand(AlignArg); + MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue(); + if (Align && *Align < MemAlign) { + return IC.replaceOperand( + II, AlignArg, + ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), + false)); + } + break; + } + + case Intrinsic::arm_mve_pred_i2v: { + Value *Arg = II.getArgOperand(0); + Value *ArgArg; + if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( + PatternMatch::m_Value(ArgArg))) && + II.getType() == ArgArg->getType()) { + return IC.replaceInstUsesWith(II, ArgArg); + } + Constant *XorMask; + if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( + PatternMatch::m_Value(ArgArg)), + PatternMatch::m_Constant(XorMask))) && + II.getType() == ArgArg->getType()) { + if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { + if (CI->getValue().trunc(16).isAllOnesValue()) { + auto TrueVector = IC.Builder.CreateVectorSplat( + cast<FixedVectorType>(II.getType())->getNumElements(), + IC.Builder.getTrue()); + return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); + } + } + } + KnownBits ScalarKnown(32); + if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), + ScalarKnown, 0)) { + return &II; + } + break; + } + 
case Intrinsic::arm_mve_pred_v2i: { + Value *Arg = II.getArgOperand(0); + Value *ArgArg; + if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>( + PatternMatch::m_Value(ArgArg)))) { + return IC.replaceInstUsesWith(II, ArgArg); + } + if (!II.getMetadata(LLVMContext::MD_range)) { + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + Metadata *M[] = { + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))}; + II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); + return &II; + } + break; + } + case Intrinsic::arm_mve_vadc: + case Intrinsic::arm_mve_vadc_predicated: { + unsigned CarryOp = + (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; + assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && + "Bad type for intrinsic!"); + + KnownBits CarryKnown(32); + if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), + CarryKnown)) { + return &II; + } + break; + } + case Intrinsic::arm_mve_vmldava: { + Instruction *I = cast<Instruction>(&II); + if (I->hasOneUse()) { + auto *User = cast<Instruction>(*I->user_begin()); + Value *OpZ; + if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) && + match(I->getOperand(3), m_Zero())) { + Value *OpX = I->getOperand(4); + Value *OpY = I->getOperand(5); + Type *OpTy = OpX->getType(); + + IC.Builder.SetInsertPoint(User); + Value *V = + IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy}, + {I->getOperand(0), I->getOperand(1), + I->getOperand(2), OpZ, OpX, OpY}); + + IC.replaceInstUsesWith(*User, V); + return IC.eraseInstFromFunction(*User); + } + } + return None; + } + } + return None; +} + int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); @@ -289,43 +289,43 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, return 1; } -// Checks whether Inst is part of a min(max()) or max(min()) pattern -// that will match to an SSAT instruction -static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { - Value *LHS, *RHS; - ConstantInt *C; - SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; - - if (InstSPF == SPF_SMAX && - PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) && - C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) { - - auto isSSatMin = [&](Value *MinInst) { - if (isa<SelectInst>(MinInst)) { - Value *MinLHS, *MinRHS; - ConstantInt *MinC; - SelectPatternFlavor MinSPF = - matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor; - if (MinSPF == SPF_SMIN && - PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) && - MinC->getValue() == ((-Imm) - 1)) - return true; - } - return false; - }; - - if (isSSatMin(Inst->getOperand(1)) || - (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) || - isSSatMin(*(++Inst->user_begin()))))) - return true; - } - return false; -} - -int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind, - Instruction *Inst) { +// Checks whether Inst is part of a min(max()) or max(min()) pattern +// that will match to an SSAT instruction +static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { + Value *LHS, *RHS; + ConstantInt *C; + SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; + + if (InstSPF == SPF_SMAX && + PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) && + C->getValue() == Imm && Imm.isNegative() && 
(-Imm).isPowerOf2()) { + + auto isSSatMin = [&](Value *MinInst) { + if (isa<SelectInst>(MinInst)) { + Value *MinLHS, *MinRHS; + ConstantInt *MinC; + SelectPatternFlavor MinSPF = + matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor; + if (MinSPF == SPF_SMIN && + PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) && + MinC->getValue() == ((-Imm) - 1)) + return true; + } + return false; + }; + + if (isSSatMin(Inst->getOperand(1)) || + (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) || + isSSatMin(*(++Inst->user_begin()))))) + return true; + } + return false; +} + +int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind, + Instruction *Inst) { // Division by a constant can be turned into multiplication, but only if we // know it's constant. So it's not so much that the immediate is cheap (it's // not), but that the alternative is worse. @@ -364,33 +364,33 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, if (Opcode == Instruction::Xor && Imm.isAllOnesValue()) return 0; - // Ensures negative constant of min(max()) or max(min()) patterns that - // match to SSAT instructions don't get hoisted - if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) && - Ty->getIntegerBitWidth() <= 32) { - if (isSSATMinMaxPattern(Inst, Imm) || - (isa<ICmpInst>(Inst) && Inst->hasOneUse() && - isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm))) - return 0; - } - + // Ensures negative constant of min(max()) or max(min()) patterns that + // match to SSAT instructions don't get hoisted + if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) && + Ty->getIntegerBitWidth() <= 32) { + if (isSSATMinMaxPattern(Inst, Imm) || + (isa<ICmpInst>(Inst) && Inst->hasOneUse() && + isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm))) + return 0; + } + return getIntImmCost(Imm, Ty, CostKind); } -int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { - if (CostKind == TTI::TCK_RecipThroughput && - (ST->hasNEON() || ST->hasMVEIntegerOps())) { - // FIXME: The vectorizer is highly sensistive to the cost of these - // instructions, which suggests that it may be using the costs incorrectly. - // But, for now, just make them free to avoid performance regressions for - // vector targets. - return 0; - } - return BaseT::getCFInstrCost(Opcode, CostKind); -} - +int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { + if (CostKind == TTI::TCK_RecipThroughput && + (ST->hasNEON() || ST->hasMVEIntegerOps())) { + // FIXME: The vectorizer is highly sensistive to the cost of these + // instructions, which suggests that it may be using the costs incorrectly. + // But, for now, just make them free to avoid performance regressions for + // vector targets. + return 0; + } + return BaseT::getCFInstrCost(Opcode, CostKind); +} + int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::CastContextHint CCH, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -402,35 +402,35 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Cost == 0 ? 
0 : 1; return Cost; }; - auto IsLegalFPType = [this](EVT VT) { - EVT EltVT = VT.getScalarType(); - return (EltVT == MVT::f32 && ST->hasVFP2Base()) || - (EltVT == MVT::f64 && ST->hasFP64()) || - (EltVT == MVT::f16 && ST->hasFullFP16()); - }; + auto IsLegalFPType = [this](EVT VT) { + EVT EltVT = VT.getScalarType(); + return (EltVT == MVT::f32 && ST->hasVFP2Base()) || + (EltVT == MVT::f64 && ST->hasFP64()) || + (EltVT == MVT::f16 && ST->hasFullFP16()); + }; EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return AdjustCost( - BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); - - // Extending masked load/Truncating masked stores is expensive because we - // currently don't split them. This means that we'll likely end up - // loading/storing each element individually (hence the high cost). - if ((ST->hasMVEIntegerOps() && - (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || - Opcode == Instruction::SExt)) || - (ST->hasMVEFloatOps() && - (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) && - IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))) - if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) - return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor(); - - // The extend of other kinds of load is free - if (CCH == TTI::CastContextHint::Normal || - CCH == TTI::CastContextHint::Masked) { + return AdjustCost( + BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + + // Extending masked load/Truncating masked stores is expensive because we + // currently don't split them. This means that we'll likely end up + // loading/storing each element individually (hence the high cost). + if ((ST->hasMVEIntegerOps() && + (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || + Opcode == Instruction::SExt)) || + (ST->hasMVEFloatOps() && + (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) && + IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))) + if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) + return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor(); + + // The extend of other kinds of load is free + if (CCH == TTI::CastContextHint::Normal || + CCH == TTI::CastContextHint::Masked) { static const TypeConversionCostTblEntry LoadConversionTbl[] = { {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, @@ -485,31 +485,31 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } - // The truncate of a store is free. This is the mirror of extends above. - static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = { + // The truncate of a store is free. This is the mirror of extends above. 
+ static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = { {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0}, {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0}, {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0}, {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1}, - {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1}, + {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1}, {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3}, {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1}, }; if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { if (const auto *Entry = - ConvertCostTableLookup(MVEStoreConversionTbl, ISD, - SrcTy.getSimpleVT(), DstTy.getSimpleVT())) + ConvertCostTableLookup(MVEStoreConversionTbl, ISD, + SrcTy.getSimpleVT(), DstTy.getSimpleVT())) return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } - static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = { + static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = { {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, }; if (SrcTy.isVector() && ST->hasMVEFloatOps()) { if (const auto *Entry = - ConvertCostTableLookup(MVEFStoreConversionTbl, ISD, - SrcTy.getSimpleVT(), DstTy.getSimpleVT())) + ConvertCostTableLookup(MVEFStoreConversionTbl, ISD, + SrcTy.getSimpleVT(), DstTy.getSimpleVT())) return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } } @@ -746,24 +746,24 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (SrcTy.isFixedLengthVector()) Lanes = SrcTy.getVectorNumElements(); - if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) + if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) return Lanes; else return Lanes * CallCost; } - if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() && - SrcTy.isFixedLengthVector()) { - // Treat a truncate with larger than legal source (128bits for MVE) as - // expensive, 2 instructions per lane. - if ((SrcTy.getScalarType() == MVT::i8 || - SrcTy.getScalarType() == MVT::i16 || - SrcTy.getScalarType() == MVT::i32) && - SrcTy.getSizeInBits() > 128 && - SrcTy.getSizeInBits() > DstTy.getSizeInBits()) - return SrcTy.getVectorNumElements() * 2; - } - + if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() && + SrcTy.isFixedLengthVector()) { + // Treat a truncate with larger than legal source (128bits for MVE) as + // expensive, 2 instructions per lane. + if ((SrcTy.getScalarType() == MVT::i8 || + SrcTy.getScalarType() == MVT::i16 || + SrcTy.getScalarType() == MVT::i32) && + SrcTy.getSizeInBits() > 128 && + SrcTy.getSizeInBits() > DstTy.getSizeInBits()) + return SrcTy.getVectorNumElements() * 2; + } + // Scalar integer conversion costs. static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { // i16 -> i64 requires two dependent operations. @@ -787,7 +787,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, ? ST->getMVEVectorCostFactor() : 1; return AdjustCost( - BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, @@ -827,37 +827,37 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, } int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, - CmpInst::Predicate VecPred, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) { - int ISD = TLI->InstructionOpcodeToISD(Opcode); - - // Thumb scalar code size cost for select. 
- if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT && - ST->isThumb() && !ValTy->isVectorTy()) { - // Assume expensive structs. - if (TLI->getValueType(DL, ValTy, true) == MVT::Other) - return TTI::TCC_Expensive; - - // Select costs can vary because they: - // - may require one or more conditional mov (including an IT), - // - can't operate directly on immediates, - // - require live flags, which we can't copy around easily. - int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first; - - // Possible IT instruction for Thumb2, or more for Thumb1. - ++Cost; - - // i1 values may need rematerialising by using mov immediates and/or - // flag setting instructions. - if (ValTy->isIntegerTy(1)) - ++Cost; - - return Cost; - } - + int ISD = TLI->InstructionOpcodeToISD(Opcode); + + // Thumb scalar code size cost for select. + if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT && + ST->isThumb() && !ValTy->isVectorTy()) { + // Assume expensive structs. + if (TLI->getValueType(DL, ValTy, true) == MVT::Other) + return TTI::TCC_Expensive; + + // Select costs can vary because they: + // - may require one or more conditional mov (including an IT), + // - can't operate directly on immediates, + // - require live flags, which we can't copy around easily. + int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first; + + // Possible IT instruction for Thumb2, or more for Thumb1. + ++Cost; + + // i1 values may need rematerialising by using mov immediates and/or + // flag setting instructions. + if (ValTy->isIntegerTy(1)) + ++Cost; + + return Cost; + } + // On NEON a vector select gets lowered to vbsl. - if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) { + if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) { // Lowering of some vector selects is currently far from perfect. static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 }, @@ -878,15 +878,15 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, return LT.first; } - // Default to cheap (throughput/size of 1 instruction) but adjust throughput - // for "multiple beats" potentially needed by MVE instructions. - int BaseCost = 1; - if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() && - ValTy->isVectorTy()) - BaseCost = ST->getMVEVectorCostFactor(); - - return BaseCost * - BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); + // Default to cheap (throughput/size of 1 instruction) but adjust throughput + // for "multiple beats" potentially needed by MVE instructions. + int BaseCost = 1; + if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() && + ValTy->isVectorTy()) + BaseCost = ST->getMVEVectorCostFactor(); + + return BaseCost * + BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); } int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, @@ -968,85 +968,85 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); } -/// Given a memcpy/memset/memmove instruction, return the number of memory -/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a -/// call is used. 
-int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const { - MemOp MOp; - unsigned DstAddrSpace = ~0u; - unsigned SrcAddrSpace = ~0u; - const Function *F = I->getParent()->getParent(); - - if (const auto *MC = dyn_cast<MemTransferInst>(I)) { - ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength()); - // If 'size' is not a constant, a library call will be generated. - if (!C) - return -1; - - const unsigned Size = C->getValue().getZExtValue(); - const Align DstAlign = *MC->getDestAlign(); - const Align SrcAlign = *MC->getSourceAlign(); - - MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, - /*IsVolatile*/ false); - DstAddrSpace = MC->getDestAddressSpace(); - SrcAddrSpace = MC->getSourceAddressSpace(); - } - else if (const auto *MS = dyn_cast<MemSetInst>(I)) { - ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength()); - // If 'size' is not a constant, a library call will be generated. - if (!C) - return -1; - - const unsigned Size = C->getValue().getZExtValue(); - const Align DstAlign = *MS->getDestAlign(); - - MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign, - /*IsZeroMemset*/ false, /*IsVolatile*/ false); - DstAddrSpace = MS->getDestAddressSpace(); - } - else - llvm_unreachable("Expected a memcpy/move or memset!"); - - unsigned Limit, Factor = 2; - switch(I->getIntrinsicID()) { - case Intrinsic::memcpy: - Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize()); - break; - case Intrinsic::memmove: - Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); - break; - case Intrinsic::memset: - Limit = TLI->getMaxStoresPerMemset(F->hasMinSize()); - Factor = 1; - break; - default: - llvm_unreachable("Expected a memcpy/move or memset!"); - } - +/// Given a memcpy/memset/memmove instruction, return the number of memory +/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a +/// call is used. +int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const { + MemOp MOp; + unsigned DstAddrSpace = ~0u; + unsigned SrcAddrSpace = ~0u; + const Function *F = I->getParent()->getParent(); + + if (const auto *MC = dyn_cast<MemTransferInst>(I)) { + ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength()); + // If 'size' is not a constant, a library call will be generated. + if (!C) + return -1; + + const unsigned Size = C->getValue().getZExtValue(); + const Align DstAlign = *MC->getDestAlign(); + const Align SrcAlign = *MC->getSourceAlign(); + + MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, + /*IsVolatile*/ false); + DstAddrSpace = MC->getDestAddressSpace(); + SrcAddrSpace = MC->getSourceAddressSpace(); + } + else if (const auto *MS = dyn_cast<MemSetInst>(I)) { + ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength()); + // If 'size' is not a constant, a library call will be generated. 
+ if (!C) + return -1; + + const unsigned Size = C->getValue().getZExtValue(); + const Align DstAlign = *MS->getDestAlign(); + + MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign, + /*IsZeroMemset*/ false, /*IsVolatile*/ false); + DstAddrSpace = MS->getDestAddressSpace(); + } + else + llvm_unreachable("Expected a memcpy/move or memset!"); + + unsigned Limit, Factor = 2; + switch(I->getIntrinsicID()) { + case Intrinsic::memcpy: + Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize()); + break; + case Intrinsic::memmove: + Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); + break; + case Intrinsic::memset: + Limit = TLI->getMaxStoresPerMemset(F->hasMinSize()); + Factor = 1; + break; + default: + llvm_unreachable("Expected a memcpy/move or memset!"); + } + // MemOps will be poplulated with a list of data types that needs to be // loaded and stored. That's why we multiply the number of elements by 2 to // get the cost for this memcpy. - std::vector<EVT> MemOps; + std::vector<EVT> MemOps; if (getTLI()->findOptimalMemOpLowering( - MemOps, Limit, MOp, DstAddrSpace, - SrcAddrSpace, F->getAttributes())) - return MemOps.size() * Factor; + MemOps, Limit, MOp, DstAddrSpace, + SrcAddrSpace, F->getAttributes())) + return MemOps.size() * Factor; // If we can't find an optimal memop lowering, return the default cost - return -1; -} - -int ARMTTIImpl::getMemcpyCost(const Instruction *I) { - int NumOps = getNumMemOps(cast<IntrinsicInst>(I)); - - // To model the cost of a library call, we assume 1 for the call, and - // 3 for the argument setup. - if (NumOps == -1) - return 4; - return NumOps; + return -1; } +int ARMTTIImpl::getMemcpyCost(const Instruction *I) { + int NumOps = getNumMemOps(cast<IntrinsicInst>(I)); + + // To model the cost of a library call, we assume 1 for the call, and + // 3 for the argument setup. + if (NumOps == -1) + return 4; + return NumOps; +} + int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp) { if (ST->hasNEON()) { @@ -1149,21 +1149,21 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { - int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); - if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) { - // Make operations on i1 relatively expensive as this often involves - // combining predicates. AND and XOR should be easier to handle with IT - // blocks. - switch (ISDOpcode) { - default: - break; - case ISD::AND: - case ISD::XOR: - return 2; - case ISD::OR: - return 3; - } - } + int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); + if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) { + // Make operations on i1 relatively expensive as this often involves + // combining predicates. AND and XOR should be easier to handle with IT + // blocks. + switch (ISDOpcode) { + default: + break; + case ISD::AND: + case ISD::XOR: + return 2; + case ISD::OR: + return 3; + } + } std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -1259,12 +1259,12 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, if (LooksLikeAFreeShift()) return 0; - // Default to cheap (throughput/size of 1 instruction) but adjust throughput - // for "multiple beats" potentially needed by MVE instructions. 
- int BaseCost = 1; - if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() && - Ty->isVectorTy()) - BaseCost = ST->getMVEVectorCostFactor(); + // Default to cheap (throughput/size of 1 instruction) but adjust throughput + // for "multiple beats" potentially needed by MVE instructions. + int BaseCost = 1; + if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() && + Ty->isVectorTy()) + BaseCost = ST->getMVEVectorCostFactor(); // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost, // without treating floats as more expensive that scalars or increasing the @@ -1331,24 +1331,24 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, CostKind, I); } -unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, - unsigned AddressSpace, - TTI::TargetCostKind CostKind) { - if (ST->hasMVEIntegerOps()) { - if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) - return ST->getMVEVectorCostFactor(); - if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) - return ST->getMVEVectorCostFactor(); - } - if (!isa<FixedVectorType>(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); - // Scalar cost, which is currently very high due to the efficiency of the - // generated code. - return cast<FixedVectorType>(Src)->getNumElements() * 8; -} - +unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, + Align Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind) { + if (ST->hasMVEIntegerOps()) { + if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) + return ST->getMVEVectorCostFactor(); + if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) + return ST->getMVEVectorCostFactor(); + } + if (!isa<FixedVectorType>(Src)) + return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, + CostKind); + // Scalar cost, which is currently very high due to the efficiency of the + // generated code. + return cast<FixedVectorType>(Src)->getNumElements() * 8; +} + int ARMTTIImpl::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -1379,8 +1379,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost( // promoted differently). The cost of 2 here is then a load and vrev or // vmovn. if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 && - VecTy->isIntOrIntVectorTy() && - DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64) + VecTy->isIntOrIntVectorTy() && + DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64) return 2 * BaseCost; } @@ -1413,13 +1413,13 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, // multiplied by the number of elements being loaded. This is possibly very // conservative, but even so we still end up vectorising loops because the // cost per iteration for many loops is lower than for scalar loops. - unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor(); + unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor(); // The scalarization cost should be a lot higher. We use the number of vector // elements plus the scalarization overhead. 
unsigned ScalarCost = NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {}); - if (EltSize < 8 || Alignment < EltSize / 8) + if (EltSize < 8 || Alignment < EltSize / 8) return ScalarCost; unsigned ExtSize = EltSize; @@ -1488,92 +1488,92 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, return ScalarCost; } -int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, - TTI::TargetCostKind CostKind) { - EVT ValVT = TLI->getValueType(DL, ValTy); - int ISD = TLI->InstructionOpcodeToISD(Opcode); - if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); - - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); - - static const CostTblEntry CostTblAdd[]{ - {ISD::ADD, MVT::v16i8, 1}, - {ISD::ADD, MVT::v8i16, 1}, - {ISD::ADD, MVT::v4i32, 1}, - }; - if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) - return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first; - - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); -} - -InstructionCost -ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, - Type *ResTy, VectorType *ValTy, - TTI::TargetCostKind CostKind) { - EVT ValVT = TLI->getValueType(DL, ValTy); - EVT ResVT = TLI->getValueType(DL, ResTy); - if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); - if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) || - (LT.second == MVT::v8i16 && - ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) || - (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64)) - return ST->getMVEVectorCostFactor() * LT.first; - } - - return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy, - CostKind); -} - -int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind) { - switch (ICA.getID()) { - case Intrinsic::get_active_lane_mask: - // Currently we make a somewhat optimistic assumption that - // active_lane_mask's are always free. In reality it may be freely folded - // into a tail predicated loop, expanded into a VCPT or expanded into a lot - // of add/icmp code. We may need to improve this in the future, but being - // able to detect if it is free or not involves looking at a lot of other - // code. We currently assume that the vectorizer inserted these, and knew - // what it was doing in adding one. - if (ST->hasMVEIntegerOps()) - return 0; - break; - case Intrinsic::sadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::uadd_sat: - case Intrinsic::usub_sat: { - if (!ST->hasMVEIntegerOps()) - break; - // Get the Return type, either directly of from ICA.ReturnType and ICA.VF. - Type *VT = ICA.getReturnType(); - if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar()) - VT = VectorType::get(VT, ICA.getVectorFactor()); - - std::pair<int, MVT> LT = - TLI->getTypeLegalizationCost(DL, VT); - if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || - LT.second == MVT::v16i8) { - // This is a base cost of 1 for the vadd, plus 3 extract shifts if we - // need to extend the type, as it uses shr(qadd(shl, shl)). - unsigned Instrs = LT.second.getScalarSizeInBits() == - ICA.getReturnType()->getScalarSizeInBits() - ? 
1 - : 4; - return LT.first * ST->getMVEVectorCostFactor() * Instrs; - } - break; - } - } - - return BaseT::getIntrinsicInstrCost(ICA, CostKind); -} - +int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind) { + EVT ValVT = TLI->getValueType(DL, ValTy); + int ISD = TLI->InstructionOpcodeToISD(Opcode); + if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, + CostKind); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + + static const CostTblEntry CostTblAdd[]{ + {ISD::ADD, MVT::v16i8, 1}, + {ISD::ADD, MVT::v8i16, 1}, + {ISD::ADD, MVT::v4i32, 1}, + }; + if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) + return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first; + + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, + CostKind); +} + +InstructionCost +ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, + Type *ResTy, VectorType *ValTy, + TTI::TargetCostKind CostKind) { + EVT ValVT = TLI->getValueType(DL, ValTy); + EVT ResVT = TLI->getValueType(DL, ResTy); + if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) || + (LT.second == MVT::v8i16 && + ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) || + (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64)) + return ST->getMVEVectorCostFactor() * LT.first; + } + + return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy, + CostKind); +} + +int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind) { + switch (ICA.getID()) { + case Intrinsic::get_active_lane_mask: + // Currently we make a somewhat optimistic assumption that + // active_lane_mask's are always free. In reality it may be freely folded + // into a tail predicated loop, expanded into a VCPT or expanded into a lot + // of add/icmp code. We may need to improve this in the future, but being + // able to detect if it is free or not involves looking at a lot of other + // code. We currently assume that the vectorizer inserted these, and knew + // what it was doing in adding one. + if (ST->hasMVEIntegerOps()) + return 0; + break; + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: { + if (!ST->hasMVEIntegerOps()) + break; + // Get the Return type, either directly of from ICA.ReturnType and ICA.VF. + Type *VT = ICA.getReturnType(); + if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar()) + VT = VectorType::get(VT, ICA.getVectorFactor()); + + std::pair<int, MVT> LT = + TLI->getTypeLegalizationCost(DL, VT); + if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || + LT.second == MVT::v16i8) { + // This is a base cost of 1 for the vadd, plus 3 extract shifts if we + // need to extend the type, as it uses shr(qadd(shl, shl)). + unsigned Instrs = LT.second.getScalarSizeInBits() == + ICA.getReturnType()->getScalarSizeInBits() + ? 
1 + : 4; + return LT.first * ST->getMVEVectorCostFactor() * Instrs; + } + break; + } + } + + return BaseT::getIntrinsicInstrCost(ICA, CostKind); +} + bool ARMTTIImpl::isLoweredToCall(const Function *F) { if (!F->isIntrinsic()) BaseT::isLoweredToCall(F); @@ -1635,93 +1635,93 @@ bool ARMTTIImpl::isLoweredToCall(const Function *F) { return BaseT::isLoweredToCall(F); } -bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) { - unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode()); - EVT VT = TLI->getValueType(DL, I.getType(), true); - if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall) - return true; - - // Check if an intrinsic will be lowered to a call and assume that any - // other CallInst will generate a bl. - if (auto *Call = dyn_cast<CallInst>(&I)) { - if (auto *II = dyn_cast<IntrinsicInst>(Call)) { - switch(II->getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memset: - case Intrinsic::memmove: - return getNumMemOps(II) == -1; - default: - if (const Function *F = Call->getCalledFunction()) - return isLoweredToCall(F); - } - } - return true; - } - - // FPv5 provides conversions between integer, double-precision, - // single-precision, and half-precision formats. - switch (I.getOpcode()) { - default: - break; - case Instruction::FPToSI: - case Instruction::FPToUI: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::FPTrunc: - case Instruction::FPExt: - return !ST->hasFPARMv8Base(); - } - - // FIXME: Unfortunately the approach of checking the Operation Action does - // not catch all cases of Legalization that use library calls. Our - // Legalization step categorizes some transformations into library calls as - // Custom, Expand or even Legal when doing type legalization. So for now - // we have to special case for instance the SDIV of 64bit integers and the - // use of floating point emulation. - if (VT.isInteger() && VT.getSizeInBits() >= 64) { - switch (ISD) { - default: - break; - case ISD::SDIV: - case ISD::UDIV: - case ISD::SREM: - case ISD::UREM: - case ISD::SDIVREM: - case ISD::UDIVREM: - return true; - } - } - - // Assume all other non-float operations are supported. - if (!VT.isFloatingPoint()) - return false; - - // We'll need a library call to handle most floats when using soft. - if (TLI->useSoftFloat()) { - switch (I.getOpcode()) { - default: - return true; - case Instruction::Alloca: - case Instruction::Load: - case Instruction::Store: - case Instruction::Select: - case Instruction::PHI: - return false; - } - } - - // We'll need a libcall to perform double precision operations on a single - // precision only FPU. - if (I.getType()->isDoubleTy() && !ST->hasFP64()) - return true; - - // Likewise for half precision arithmetic. - if (I.getType()->isHalfTy() && !ST->hasFullFP16()) - return true; - - return false; -} - +bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) { + unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode()); + EVT VT = TLI->getValueType(DL, I.getType(), true); + if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall) + return true; + + // Check if an intrinsic will be lowered to a call and assume that any + // other CallInst will generate a bl. 
+ if (auto *Call = dyn_cast<CallInst>(&I)) { + if (auto *II = dyn_cast<IntrinsicInst>(Call)) { + switch(II->getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + return getNumMemOps(II) == -1; + default: + if (const Function *F = Call->getCalledFunction()) + return isLoweredToCall(F); + } + } + return true; + } + + // FPv5 provides conversions between integer, double-precision, + // single-precision, and half-precision formats. + switch (I.getOpcode()) { + default: + break; + case Instruction::FPToSI: + case Instruction::FPToUI: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + return !ST->hasFPARMv8Base(); + } + + // FIXME: Unfortunately the approach of checking the Operation Action does + // not catch all cases of Legalization that use library calls. Our + // Legalization step categorizes some transformations into library calls as + // Custom, Expand or even Legal when doing type legalization. So for now + // we have to special case for instance the SDIV of 64bit integers and the + // use of floating point emulation. + if (VT.isInteger() && VT.getSizeInBits() >= 64) { + switch (ISD) { + default: + break; + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: + return true; + } + } + + // Assume all other non-float operations are supported. + if (!VT.isFloatingPoint()) + return false; + + // We'll need a library call to handle most floats when using soft. + if (TLI->useSoftFloat()) { + switch (I.getOpcode()) { + default: + return true; + case Instruction::Alloca: + case Instruction::Load: + case Instruction::Store: + case Instruction::Select: + case Instruction::PHI: + return false; + } + } + + // We'll need a libcall to perform double precision operations on a single + // precision only FPU. + if (I.getType()->isDoubleTy() && !ST->hasFP64()) + return true; + + // Likewise for half precision arithmetic. + if (I.getType()->isHalfTy() && !ST->hasFullFP16()) + return true; + + return false; +} + bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, @@ -1762,7 +1762,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, switch (Call->getIntrinsicID()) { default: break; - case Intrinsic::start_loop_iterations: + case Intrinsic::start_loop_iterations: case Intrinsic::test_set_loop_iterations: case Intrinsic::loop_decrement: case Intrinsic::loop_decrement_reg: @@ -1773,24 +1773,24 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, }; // Scan the instructions to see if there's any that we know will turn into a - // call or if this loop is already a low-overhead loop or will become a tail - // predicated loop. - bool IsTailPredLoop = false; + // call or if this loop is already a low-overhead loop or will become a tail + // predicated loop. 
+ bool IsTailPredLoop = false; auto ScanLoop = [&](Loop *L) { for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { - if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) || - isa<InlineAsm>(I)) { + if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) || + isa<InlineAsm>(I)) { LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n"); return false; } - if (auto *II = dyn_cast<IntrinsicInst>(&I)) - IsTailPredLoop |= - II->getIntrinsicID() == Intrinsic::get_active_lane_mask || - II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 || - II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 || - II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 || - II->getIntrinsicID() == Intrinsic::arm_mve_vctp64; + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + IsTailPredLoop |= + II->getIntrinsicID() == Intrinsic::get_active_lane_mask || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp64; } } return true; @@ -1811,7 +1811,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, LLVMContext &C = L->getHeader()->getContext(); HWLoopInfo.CounterInReg = true; HWLoopInfo.IsNestingLegal = false; - HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop; + HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop; HWLoopInfo.CountType = Type::getInt32Ty(C); HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1); return true; @@ -1859,28 +1859,28 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const LoopAccessInfo *LAI) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); - // If there are live-out values, it is probably a reduction. We can predicate - // most reduction operations freely under MVE using a combination of - // prefer-predicated-reduction-select and inloop reductions. We limit this to - // floating point and integer reductions, but don't check for operators - // specifically here. If the value ends up not being a reduction (and so the - // vectorizer cannot tailfold the loop), we should fall back to standard - // vectorization automatically. + // If there are live-out values, it is probably a reduction. We can predicate + // most reduction operations freely under MVE using a combination of + // prefer-predicated-reduction-select and inloop reductions. We limit this to + // floating point and integer reductions, but don't check for operators + // specifically here. If the value ends up not being a reduction (and so the + // vectorizer cannot tailfold the loop), we should fall back to standard + // vectorization automatically. 
SmallVector< Instruction *, 8 > LiveOuts; LiveOuts = llvm::findDefsUsedOutsideOfLoop(L); - bool ReductionsDisabled = + bool ReductionsDisabled = EnableTailPredication == TailPredication::EnabledNoReductions || EnableTailPredication == TailPredication::ForceEnabledNoReductions; for (auto *I : LiveOuts) { - if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() && - !I->getType()->isHalfTy()) { - LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float " + if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() && + !I->getType()->isHalfTy()) { + LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float " "live-out value\n"); return false; } - if (ReductionsDisabled) { - LLVM_DEBUG(dbgs() << "Reductions not enabled\n"); + if (ReductionsDisabled) { + LLVM_DEBUG(dbgs() << "Reductions not enabled\n"); return false; } } @@ -1910,35 +1910,35 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1); int64_t NextStride = getPtrStride(PSE, Ptr, L); - if (NextStride == 1) { - // TODO: for now only allow consecutive strides of 1. We could support - // other strides as long as it is uniform, but let's keep it simple - // for now. + if (NextStride == 1) { + // TODO: for now only allow consecutive strides of 1. We could support + // other strides as long as it is uniform, but let's keep it simple + // for now. continue; - } else if (NextStride == -1 || - (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) || - (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) { - LLVM_DEBUG(dbgs() - << "Consecutive strides of 2 found, vld2/vstr2 can't " - "be tail-predicated\n."); + } else if (NextStride == -1 || + (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) || + (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) { + LLVM_DEBUG(dbgs() + << "Consecutive strides of 2 found, vld2/vstr2 can't " + "be tail-predicated\n."); return false; - // TODO: don't tail predicate if there is a reversed load? - } else if (EnableMaskedGatherScatters) { - // Gather/scatters do allow loading from arbitrary strides, at - // least if they are loop invariant. - // TODO: Loop variant strides should in theory work, too, but - // this requires further testing. - const SCEV *PtrScev = - replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr); - if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) { - const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); - if (PSE.getSE()->isLoopInvariant(Step, L)) - continue; - } + // TODO: don't tail predicate if there is a reversed load? + } else if (EnableMaskedGatherScatters) { + // Gather/scatters do allow loading from arbitrary strides, at + // least if they are loop invariant. + // TODO: Loop variant strides should in theory work, too, but + // this requires further testing. 
+ const SCEV *PtrScev = + replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr); + if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) { + const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); + if (PSE.getSE()->isLoopInvariant(Step, L)) + continue; + } } - LLVM_DEBUG(dbgs() << "Bad stride found, can't " - "tail-predicate\n."); - return false; + LLVM_DEBUG(dbgs() << "Bad stride found, can't " + "tail-predicate\n."); + return false; } } } @@ -1971,7 +1971,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, return false; } - assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected"); + assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected"); HardwareLoopInfo HWLoopInfo(L); if (!HWLoopInfo.canAnalyze(*LI)) { @@ -2039,10 +2039,10 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (ST->hasBranchPredictor() && L->getNumBlocks() > 4) return; - // Don't unroll vectorized loops, including the remainder loop - if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) - return; - + // Don't unroll vectorized loops, including the remainder loop + if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) + return; + // Scan the loop: don't unroll loops with calls as this could prevent // inlining. unsigned Cost = 0; @@ -2061,9 +2061,9 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, return; } - SmallVector<const Value*, 4> Operands(I.operand_values()); - Cost += - getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency); + SmallVector<const Value*, 4> Operands(I.operand_values()); + Cost += + getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency); } } @@ -2092,24 +2092,24 @@ bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return ST->hasMVEIntegerOps(); } - -bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, - TTI::ReductionFlags Flags) const { - if (!ST->hasMVEIntegerOps()) - return false; - - unsigned ScalarBits = Ty->getScalarSizeInBits(); - switch (Opcode) { - case Instruction::Add: - return ScalarBits <= 64; - default: - return false; - } -} - -bool ARMTTIImpl::preferPredicatedReductionSelect( - unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { - if (!ST->hasMVEIntegerOps()) - return false; - return true; -} + +bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + if (!ST->hasMVEIntegerOps()) + return false; + + unsigned ScalarBits = Ty->getScalarSizeInBits(); + switch (Opcode) { + case Instruction::Add: + return ScalarBits <= 64; + default: + return false; + } +} + +bool ARMTTIImpl::preferPredicatedReductionSelect( + unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { + if (!ST->hasMVEIntegerOps()) + return false; + return true; +} |