| author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
| --- | --- | --- |
| committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
| commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch) | |
| tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Transforms/Vectorize | |
| parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff) | |
| download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz | |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Transforms/Vectorize')
15 files changed, 4969 insertions, 4969 deletions
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 6ec5590d76..12f3203bd8 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -666,10 +666,10 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { cast<IntrinsicInst>(&I)->getIntrinsicID() == Intrinsic::sideeffect) { // Ignore llvm.sideeffect calls. - } else if (isa<IntrinsicInst>(&I) && - cast<IntrinsicInst>(&I)->getIntrinsicID() == - Intrinsic::pseudoprobe) { - // Ignore llvm.pseudoprobe calls. + } else if (isa<IntrinsicInst>(&I) && + cast<IntrinsicInst>(&I)->getIntrinsicID() == + Intrinsic::pseudoprobe) { + // Ignore llvm.pseudoprobe calls. } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'); @@ -766,8 +766,8 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { return Chain.slice(0, ChainIdx); } -static ChainID getChainID(const Value *Ptr) { - const Value *ObjPtr = getUnderlyingObject(Ptr); +static ChainID getChainID(const Value *Ptr) { + const Value *ObjPtr = getUnderlyingObject(Ptr); if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { // The select's themselves are distinct instructions even if they share the // same condition and evaluate to consecutive pointers for true and false @@ -834,7 +834,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Save the load locations. - const ChainID ID = getChainID(Ptr); + const ChainID ID = getChainID(Ptr); LoadRefs[ID].push_back(LI); } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (!SI->isSimple()) @@ -880,7 +880,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Save store location. - const ChainID ID = getChainID(Ptr); + const ChainID ID = getChainID(Ptr); StoreRefs[ID].push_back(SI); } } @@ -1031,8 +1031,8 @@ bool Vectorizer::vectorizeStoreChain( unsigned EltSzInBytes = Sz / 8; unsigned SzInBytes = EltSzInBytes * ChainSize; - FixedVectorType *VecTy; - auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy); + FixedVectorType *VecTy; + auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy); if (VecStoreTy) VecTy = FixedVectorType::get(StoreTy->getScalarType(), Chain.size() * VecStoreTy->getNumElements()); @@ -1184,7 +1184,7 @@ bool Vectorizer::vectorizeLoadChain( unsigned EltSzInBytes = Sz / 8; unsigned SzInBytes = EltSzInBytes * ChainSize; VectorType *VecTy; - auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy); + auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy); if (VecLoadTy) VecTy = FixedVectorType::get(LoadTy->getScalarType(), Chain.size() * VecLoadTy->getNumElements()); diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index b8c21a0e1c..e40cd652e5 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -13,16 +13,16 @@ // pass. It should be easy to create an analysis pass around it if there // is a need (but D45420 needs to happen first). 
// - + #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Transforms/Utils/SizeOpts.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" using namespace llvm; @@ -66,7 +66,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { return (Val <= 1); case HK_ISVECTORIZED: case HK_PREDICATE: - case HK_SCALABLE: + case HK_SCALABLE: return (Val == 0 || Val == 1); } return false; @@ -79,8 +79,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), - Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), - Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), + Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), + Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -93,8 +93,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, // If the vectorization width and interleaving count are both 1 then // consider the loop to have been already vectorized because there's // nothing more that we can do. - IsVectorized.Value = - getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1; + IsVectorized.Value = + getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1; LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); } @@ -167,7 +167,7 @@ void LoopVectorizeHints::emitRemarkWithHints() const { if (Force.Value == LoopVectorizeHints::FK_Enabled) { R << " (Force=" << NV("Force", true); if (Width.Value != 0) - R << ", Vector Width=" << NV("VectorWidth", getWidth()); + R << ", Vector Width=" << NV("VectorWidth", getWidth()); if (Interleave.Value != 0) R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); R << ")"; @@ -178,11 +178,11 @@ void LoopVectorizeHints::emitRemarkWithHints() const { } const char *LoopVectorizeHints::vectorizeAnalysisPassName() const { - if (getWidth() == ElementCount::getFixed(1)) + if (getWidth() == ElementCount::getFixed(1)) return LV_NAME; if (getForce() == LoopVectorizeHints::FK_Disabled) return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero()) + if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero()) return LV_NAME; return OptimizationRemarkAnalysis::AlwaysPrint; } @@ -233,8 +233,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, - &IsVectorized, &Predicate, &Scalable}; + Hint *Hints[] = {&Width, &Interleave, &Force, + &IsVectorized, &Predicate, &Scalable}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -419,11 +419,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : ValueToValueMap(); - Function *F = TheLoop->getHeader()->getParent(); - bool OptForSize = F->hasOptSize() || - llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI, - PGSOQueryType::IRPass); - bool CanAddPredicate = !OptForSize; + Function *F = TheLoop->getHeader()->getParent(); + bool OptForSize = F->hasOptSize() || + llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + bool CanAddPredicate = !OptForSize; int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false); if (Stride == 1 || Stride == -1) return Stride; @@ -435,7 +435,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) { } bool LoopVectorizationLegality::canVectorizeOuterLoop() { - assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop."); + assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop."); // Store the result and return it at the end instead of exiting early, in case // allowExtraAnalysis is used to report multiple reasons for not vectorizing. bool Result = true; @@ -779,7 +779,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // supported on the target. if (ST->getMetadata(LLVMContext::MD_nontemporal)) { // Arbitrarily try a vector of 2 elements. - auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); + auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); assert(VecTy && "did not find vectorized version of stored type"); if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { reportVectorizationFailure( @@ -794,7 +794,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (LD->getMetadata(LLVMContext::MD_nontemporal)) { // For nontemporal loads, check that a nontemporal vector version is // supported on the target (arbitrarily try a vector of 2 elements). - auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); + auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); assert(VecTy && "did not find vectorized version of load type"); if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { reportVectorizationFailure( @@ -923,9 +923,9 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, - SmallPtrSetImpl<const Instruction *> &MaskedOp, - SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const { + BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, + SmallPtrSetImpl<const Instruction *> &MaskedOp, + SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const { for (Instruction &I : *BB) { // Check that we don't have a constant expression that can trap as operand. for (Value *Operand : I.operands()) { @@ -941,19 +941,19 @@ bool LoopVectorizationLegality::blockCanBePredicated( continue; } - // Do not let llvm.experimental.noalias.scope.decl block the vectorization. - // TODO: there might be cases that it should block the vectorization. Let's - // ignore those for now. - if (isa<NoAliasScopeDeclInst>(&I)) - continue; - + // Do not let llvm.experimental.noalias.scope.decl block the vectorization. + // TODO: there might be cases that it should block the vectorization. Let's + // ignore those for now. + if (isa<NoAliasScopeDeclInst>(&I)) + continue; + // We might be able to hoist the load. 
if (I.mayReadFromMemory()) { auto *LI = dyn_cast<LoadInst>(&I); if (!LI) return false; if (!SafePtrs.count(LI->getPointerOperand())) { - MaskedOp.insert(LI); + MaskedOp.insert(LI); continue; } } @@ -1012,7 +1012,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { ScalarEvolution &SE = *PSE.getSE(); for (Instruction &I : *BB) { LoadInst *LI = dyn_cast<LoadInst>(&I); - if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) && + if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) && isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT)) SafePointers.insert(LI->getPointerOperand()); } @@ -1032,8 +1032,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointers, MaskedOp, - ConditionalAssumes)) { + if (!blockCanBePredicated(BB, SafePointers, MaskedOp, + ConditionalAssumes)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", @@ -1058,7 +1058,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // Helper function to canVectorizeLoopNestCFG. bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, bool UseVPlanNativePath) { - assert((UseVPlanNativePath || Lp->isInnermost()) && + assert((UseVPlanNativePath || Lp->isInnermost()) && "VPlan-native path is not enabled."); // TODO: ORE should be improved to show more accurate information when an @@ -1094,14 +1094,14 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, return false; } - // We currently must have a single "exit block" after the loop. Note that - // multiple "exiting blocks" inside the loop are allowed, provided they all - // reach the single exit block. - // TODO: This restriction can be relaxed in the near future, it's here solely - // to allow separation of changes for review. We need to generalize the phi - // update logic in a number of places. - if (!Lp->getUniqueExitBlock()) { - reportVectorizationFailure("The loop must have a unique exit block", + // We currently must have a single "exit block" after the loop. Note that + // multiple "exiting blocks" inside the loop are allowed, provided they all + // reach the single exit block. + // TODO: This restriction can be relaxed in the near future, it's here solely + // to allow separation of changes for review. We need to generalize the phi + // update logic in a number of places. + if (!Lp->getUniqueExitBlock()) { + reportVectorizationFailure("The loop must have a unique exit block", "loop control flow is not understood by vectorizer", "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) @@ -1159,7 +1159,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { // Specific checks for outer loops. We skip the remaining legal checks at this // point because they don't support outer loops. - if (!TheLoop->isInnermost()) { + if (!TheLoop->isInnermost()) { assert(UseVPlanNativePath && "VPlan-native path is not enabled."); if (!canVectorizeOuterLoop()) { @@ -1176,7 +1176,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return Result; } - assert(TheLoop->isInnermost() && "Inner loop expected."); + assert(TheLoop->isInnermost() && "Inner loop expected."); // Check if we can if-convert non-single-bb loops. 
unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { @@ -1251,10 +1251,10 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { Instruction *UI = cast<Instruction>(U); if (TheLoop->contains(UI)) continue; - LLVM_DEBUG( - dbgs() - << "LV: Cannot fold tail by masking, loop has an outside user for " - << *UI << "\n"); + LLVM_DEBUG( + dbgs() + << "LV: Cannot fold tail by masking, loop has an outside user for " + << *UI << "\n"); return false; } } @@ -1262,25 +1262,25 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { // The list of pointers that we can safely read and write to remains empty. SmallPtrSet<Value *, 8> SafePointers; - SmallPtrSet<const Instruction *, 8> TmpMaskedOp; - SmallPtrSet<Instruction *, 8> TmpConditionalAssumes; - + SmallPtrSet<const Instruction *, 8> TmpMaskedOp; + SmallPtrSet<Instruction *, 8> TmpConditionalAssumes; + // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes)) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); + if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, + TmpConditionalAssumes)) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } } LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); - - MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); - ConditionalAssumes.insert(TmpConditionalAssumes.begin(), - TmpConditionalAssumes.end()); - + + MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); + ConditionalAssumes.insert(TmpConditionalAssumes.begin(), + TmpConditionalAssumes.end()); + return true; } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 19797e6f78..25e4a37d63 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -34,7 +34,7 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; class PredicatedScalarEvolution; -class VPRecipeBuilder; +class VPRecipeBuilder; /// VPlan-based builder utility analogous to IRBuilder. class VPBuilder { @@ -142,10 +142,10 @@ public: return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); } - VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); - } - + VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); + } + //===--------------------------------------------------------------------===// // RAII helpers. //===--------------------------------------------------------------------===// @@ -176,22 +176,22 @@ public: /// Information about vectorization costs struct VectorizationFactor { // Vector width with best cost - ElementCount Width; + ElementCount Width; // Cost of the loop with that width unsigned Cost; // Width 1 means no vectorization, cost 0 means uncomputed cost. 
- static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0}; - } + static VectorizationFactor Disabled() { + return {ElementCount::getFixed(1), 0}; + } bool operator==(const VectorizationFactor &rhs) const { return Width == rhs.Width && Cost == rhs.Cost; } - - bool operator!=(const VectorizationFactor &rhs) const { - return !(*this == rhs); - } + + bool operator!=(const VectorizationFactor &rhs) const { + return !(*this == rhs); + } }; /// Planner drives the vectorization process after having passed @@ -237,10 +237,10 @@ class LoopVectorizationPlanner { /// A builder used to construct the current plan. VPBuilder Builder; - /// The best number of elements of the vector types used in the - /// transformed loop. BestVF = None means that vectorization is - /// disabled. - Optional<ElementCount> BestVF = None; + /// The best number of elements of the vector types used in the + /// transformed loop. BestVF = None means that vectorization is + /// disabled. + Optional<ElementCount> BestVF = None; unsigned BestUF = 0; public: @@ -255,14 +255,14 @@ public: /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC); + Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(ElementCount UserVF); + VectorizationFactor planInVPlanNativePath(ElementCount UserVF); /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(ElementCount VF, unsigned UF); + void setBestPlan(ElementCount VF, unsigned UF); /// Generate the IR code for the body of the vectorized loop according to the /// best selected VPlan. @@ -273,21 +273,21 @@ public: O << *Plan; } - /// Look through the existing plans and return true if we have one with all - /// the vectorization factors in question. - bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const { - return any_of(VPlans, [&](const VPlanPtr &Plan) { - return all_of(VFs, [&](const ElementCount &VF) { - return Plan->hasVF(VF); - }); - }); - } - + /// Look through the existing plans and return true if we have one with all + /// the vectorization factors in question. + bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const { + return any_of(VPlans, [&](const VPlanPtr &Plan) { + return all_of(VFs, [&](const ElementCount &VF) { + return Plan->hasVF(VF); + }); + }); + } + /// Test a \p Predicate on a \p Range of VF's. Return the value of applying /// \p Predicate on Range.Start, possibly decreasing Range.End such that the /// returned value holds for the entire \p Range. static bool - getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate, + getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate, VFRange &Range); protected: @@ -299,7 +299,7 @@ protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(ElementCount MinVF, ElementCount MaxVF); + void buildVPlans(ElementCount MinVF, ElementCount MaxVF); private: /// Build a VPlan according to the information gathered by Legal. \return a @@ -310,20 +310,20 @@ private: /// Build a VPlan using VPRecipes according to the information gather by /// Legal. 
This method is only used for the legacy inner loop vectorizer. VPlanPtr buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const DenseMap<Instruction *, Instruction *> &SinkAfter); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); - - /// Adjust the recipes for any inloop reductions. The chain of instructions - /// leading from the loop exit instr to the phi need to be converted to - /// reductions, with one operand being vector and the other being the scalar - /// reduction chain. - void adjustRecipesForInLoopReductions(VPlanPtr &Plan, - VPRecipeBuilder &RecipeBuilder); + void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + + /// Adjust the recipes for any inloop reductions. The chain of instructions + /// leading from the loop exit instr to the phi need to be converted to + /// reductions, with one operand being vector and the other being the scalar + /// reduction chain. + void adjustRecipesForInLoopReductions(VPlanPtr &Plan, + VPRecipeBuilder &RecipeBuilder); }; } // namespace llvm diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp index b456a97aa4..decb6ce1d7 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -130,7 +130,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/InstructionCost.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -158,38 +158,38 @@ using namespace llvm; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME -#ifndef NDEBUG -const char VerboseDebug[] = DEBUG_TYPE "-verbose"; -#endif - +#ifndef NDEBUG +const char VerboseDebug[] = DEBUG_TYPE "-verbose"; +#endif + /// @{ /// Metadata attribute names -const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; -const char LLVMLoopVectorizeFollowupVectorized[] = +const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; +const char LLVMLoopVectorizeFollowupVectorized[] = "llvm.loop.vectorize.followup_vectorized"; -const char LLVMLoopVectorizeFollowupEpilogue[] = +const char LLVMLoopVectorizeFollowupEpilogue[] = "llvm.loop.vectorize.followup_epilogue"; /// @} STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); -STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); - -static cl::opt<bool> EnableEpilogueVectorization( - "enable-epilogue-vectorization", cl::init(true), cl::Hidden, - cl::desc("Enable vectorization of epilogue loops.")); - -static cl::opt<unsigned> EpilogueVectorizationForceVF( - "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, - cl::desc("When epilogue vectorization is enabled, and a value greater than " - "1 is specified, forces the given VF for all applicable epilogue " - "loops.")); - -static cl::opt<unsigned> EpilogueVectorizationMinVF( - "epilogue-vectorization-minimum-VF", 
cl::init(16), cl::Hidden, - cl::desc("Only loops with vectorization factor equal to or larger than " - "the specified value are considered for epilogue vectorization.")); - +STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); + +static cl::opt<bool> EnableEpilogueVectorization( + "enable-epilogue-vectorization", cl::init(true), cl::Hidden, + cl::desc("Enable vectorization of epilogue loops.")); + +static cl::opt<unsigned> EpilogueVectorizationForceVF( + "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, + cl::desc("When epilogue vectorization is enabled, and a value greater than " + "1 is specified, forces the given VF for all applicable epilogue " + "loops.")); + +static cl::opt<unsigned> EpilogueVectorizationMinVF( + "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, + cl::desc("Only loops with vectorization factor equal to or larger than " + "the specified value are considered for epilogue vectorization.")); + /// Loops with a known constant trip count below this number are vectorized only /// if no scalar iteration overheads are incurred. static cl::opt<unsigned> TinyTripCountVectorThreshold( @@ -198,37 +198,37 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); -// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, -// that predication is preferred, and this lists all options. I.e., the -// vectorizer will try to fold the tail-loop (epilogue) into the vector body -// and predicate the instructions accordingly. If tail-folding fails, there are -// different fallback strategies depending on these values: -namespace PreferPredicateTy { - enum Option { - ScalarEpilogue = 0, - PredicateElseScalarEpilogue, - PredicateOrDontVectorize - }; -} // namespace PreferPredicateTy - -static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( - "prefer-predicate-over-epilogue", - cl::init(PreferPredicateTy::ScalarEpilogue), - cl::Hidden, - cl::desc("Tail-folding and predication preferences over creating a scalar " - "epilogue loop."), - cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, - "scalar-epilogue", - "Don't tail-predicate loops, create scalar epilogue"), - clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, - "predicate-else-scalar-epilogue", - "prefer tail-folding, create scalar epilogue if tail " - "folding fails."), - clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, - "predicate-dont-vectorize", - "prefers tail-folding, don't attempt vectorization if " - "tail-folding fails."))); - +// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, +// that predication is preferred, and this lists all options. I.e., the +// vectorizer will try to fold the tail-loop (epilogue) into the vector body +// and predicate the instructions accordingly. 
If tail-folding fails, there are +// different fallback strategies depending on these values: +namespace PreferPredicateTy { + enum Option { + ScalarEpilogue = 0, + PredicateElseScalarEpilogue, + PredicateOrDontVectorize + }; +} // namespace PreferPredicateTy + +static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( + "prefer-predicate-over-epilogue", + cl::init(PreferPredicateTy::ScalarEpilogue), + cl::Hidden, + cl::desc("Tail-folding and predication preferences over creating a scalar " + "epilogue loop."), + cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, + "scalar-epilogue", + "Don't tail-predicate loops, create scalar epilogue"), + clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, + "predicate-else-scalar-epilogue", + "prefer tail-folding, create scalar epilogue if tail " + "folding fails."), + clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."))); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -239,7 +239,7 @@ static cl::opt<bool> EnableInterleavedMemAccesses( cl::desc("Enable vectorization on interleaved memory accesses in a loop")); /// An interleave-group may need masking if it resides in a block that needs -/// predication, or in order to mask away gaps. +/// predication, or in order to mask away gaps. static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); @@ -273,12 +273,12 @@ static cl::opt<unsigned> ForceTargetInstructionCost( "an instruction to a single constant value. Mostly " "useful for getting consistent testing.")); -static cl::opt<bool> ForceTargetSupportsScalableVectors( - "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, - cl::desc( - "Pretend that scalable vectors are supported, even if the target does " - "not support them. This flag should only be used for testing.")); - +static cl::opt<bool> ForceTargetSupportsScalableVectors( + "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, + cl::desc( + "Pretend that scalable vectors are supported, even if the target does " + "not support them. This flag should only be used for testing.")); + static cl::opt<unsigned> SmallLoopCost( "small-loop-cost", cl::init(20), cl::Hidden, cl::desc( @@ -296,12 +296,12 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave( cl::desc( "Enable runtime interleaving until load/store ports are saturated")); -/// Interleave small loops with scalar reductions. -static cl::opt<bool> InterleaveSmallLoopScalarReduction( - "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, - cl::desc("Enable interleaving for loops with small iteration counts that " - "contain scalar reductions to expose ILP.")); - +/// Interleave small loops with scalar reductions. +static cl::opt<bool> InterleaveSmallLoopScalarReduction( + "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, + cl::desc("Enable interleaving for loops with small iteration counts that " + "contain scalar reductions to expose ILP.")); + /// The number of stores in a loop that are allowed to need predication. 
static cl::opt<unsigned> NumberOfStoresToPredicate( "vectorize-num-stores-pred", cl::init(1), cl::Hidden, @@ -320,17 +320,17 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); -static cl::opt<bool> - PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), - cl::Hidden, - cl::desc("Prefer in-loop vector reductions, " - "overriding the targets preference.")); - -static cl::opt<bool> PreferPredicatedReductionSelect( - "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, - cl::desc( - "Prefer predicating a reduction operation over an after loop select.")); - +static cl::opt<bool> + PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), + cl::Hidden, + cl::desc("Prefer in-loop vector reductions, " + "overriding the targets preference.")); + +static cl::opt<bool> PreferPredicatedReductionSelect( + "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, + cl::desc( + "Prefer predicating a reduction operation over an after loop select.")); + cl::opt<bool> EnableVPlanNativePath( "enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " @@ -372,11 +372,11 @@ static Type *getMemInstValueType(Value *I) { /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an -/// element of the corresponding vector type. -static bool hasIrregularType(Type *Ty, const DataLayout &DL) { - // Determine if an array of N elements of type Ty is "bitcast compatible" - // with a <N x Ty> vector. - // This is only true if there is no padding between the array elements. +/// element of the corresponding vector type. +static bool hasIrregularType(Type *Ty, const DataLayout &DL) { + // Determine if an array of N elements of type Ty is "bitcast compatible" + // with a <N x Ty> vector. + // This is only true if there is no padding between the array elements. return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); } @@ -453,42 +453,42 @@ public: LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, ElementCount VecWidth, + OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), - VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), - BFI(BFI), PSI(PSI) { - // Query this against the original loop and save it here because the profile - // of the original loop header may change as the transformation happens. - OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( - OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - } - + VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), + BFI(BFI), PSI(PSI) { + // Query this against the original loop and save it here because the profile + // of the original loop header may change as the transformation happens. 
+ OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( + OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); + } + virtual ~InnerLoopVectorizer() = default; - /// Create a new empty loop that will contain vectorized instructions later - /// on, while the old loop will be used as the scalar remainder. Control flow - /// is generated around the vectorized (and scalar epilogue) loops consisting - /// of various checks and bypasses. Return the pre-header block of the new - /// loop. - /// In the case of epilogue vectorization, this function is overriden to - /// handle the more complex control flow around the loops. - virtual BasicBlock *createVectorizedLoopSkeleton(); + /// Create a new empty loop that will contain vectorized instructions later + /// on, while the old loop will be used as the scalar remainder. Control flow + /// is generated around the vectorized (and scalar epilogue) loops consisting + /// of various checks and bypasses. Return the pre-header block of the new + /// loop. + /// In the case of epilogue vectorization, this function is overriden to + /// handle the more complex control flow around the loops. + virtual BasicBlock *createVectorizedLoopSkeleton(); /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, + void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, VPTransformState &State); /// Widen a single call instruction within the innermost loop. - void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, + void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); /// Widen a single select instruction within the innermost loop. - void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, + void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. @@ -504,15 +504,15 @@ public: /// Vectorize a single GetElementPtrInst based on information gathered and /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, - unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, + void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, + unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, - Value *StartV, unsigned UF, ElementCount VF); + void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, + Value *StartV, unsigned UF, ElementCount VF); /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane @@ -526,8 +526,8 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. 
- void widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc = nullptr); + void widenIntOrFpInduction(PHINode *IV, Value *Start, + TruncInst *Trunc = nullptr); /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a /// vector or scalar value on-demand if one is not yet available. When @@ -552,10 +552,10 @@ public: /// value into a vector. Value *getOrCreateVectorValue(Value *V, unsigned Part); - void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { - VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); - } - + void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { + VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); + } + /// Return a value in the new loop corresponding to \p V from the original /// loop at unroll and vector indices \p Instance. If the value has been /// vectorized but not scalarized, the necessary extractelement instruction @@ -570,9 +570,9 @@ public: /// BlockInMask is non-null. Use \p State to translate given VPValues to IR /// values in the vectorized loop. void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, - ArrayRef<VPValue *> VPDefs, + ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, - ArrayRef<VPValue *> StoredValues, + ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); /// Vectorize Load and Store instructions with the base address given in \p @@ -580,8 +580,8 @@ public: /// non-null. Use \p State to translate given VPValues to IR values in the /// vectorized loop. void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask); + VPValue *Def, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -625,11 +625,11 @@ protected: /// Clear NSW/NUW flags from reduction instructions if necessary. void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); - /// Fixup the LCSSA phi nodes in the unique exit block. This simply - /// means we need to add the appropriate incoming value from the middle - /// block as exiting edges from the scalar epilogue loop (if present) are - /// already in place, and we exit the vector loop exclusively to the middle - /// block. + /// Fixup the LCSSA phi nodes in the unique exit block. This simply + /// means we need to add the appropriate incoming value from the middle + /// block as exiting edges from the scalar epilogue loop (if present) are + /// already in place, and we exit the vector loop exclusively to the middle + /// block. void fixLCSSAPHIs(); /// Iteratively sink the scalarized operands of a predicated instruction into @@ -668,8 +668,8 @@ protected: /// truncate instruction, instead of widening the original IV, we widen a /// version of the IV truncated to \p EntryVal's type. void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, - Value *Step, Value *Start, - Instruction *EntryVal); + Value *Step, Value *Start, + Instruction *EntryVal); /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. @@ -737,28 +737,28 @@ protected: const DataLayout &DL, const InductionDescriptor &ID) const; - /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, - /// vector loop preheader, middle block and scalar preheader. Also - /// allocate a loop object for the new vector loop and return it. 
- Loop *createVectorLoopSkeleton(StringRef Prefix); - - /// Create new phi nodes for the induction variables to resume iteration count - /// in the scalar epilogue, from where the vectorized loop left off (given by - /// \p VectorTripCount). - /// In cases where the loop skeleton is more complicated (eg. epilogue - /// vectorization) and the resume values can come from an additional bypass - /// block, the \p AdditionalBypass pair provides information about the bypass - /// block and the end value on the edge from bypass to this loop. - void createInductionResumeValues( - Loop *L, Value *VectorTripCount, - std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); - - /// Complete the loop skeleton by adding debug MDs, creating appropriate - /// conditional branches in the middle block, preparing the builder and - /// running the verifier. Take in the vector loop \p L as argument, and return - /// the preheader of the completed vector loop. - BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); - + /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, + /// vector loop preheader, middle block and scalar preheader. Also + /// allocate a loop object for the new vector loop and return it. + Loop *createVectorLoopSkeleton(StringRef Prefix); + + /// Create new phi nodes for the induction variables to resume iteration count + /// in the scalar epilogue, from where the vectorized loop left off (given by + /// \p VectorTripCount). + /// In cases where the loop skeleton is more complicated (eg. epilogue + /// vectorization) and the resume values can come from an additional bypass + /// block, the \p AdditionalBypass pair provides information about the bypass + /// block and the end value on the edge from bypass to this loop. + void createInductionResumeValues( + Loop *L, Value *VectorTripCount, + std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); + + /// Complete the loop skeleton by adding debug MDs, creating appropriate + /// conditional branches in the middle block, preparing the builder and + /// running the verifier. Take in the vector loop \p L as argument, and return + /// the preheader of the completed vector loop. + BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); + /// Add additional metadata to \p To that was not present on \p Orig. /// /// Currently this is used to add the noalias annotations based on the @@ -777,11 +777,11 @@ protected: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); - /// Allow subclasses to override and print debug traces before/after vplan - /// execution, when trace information is requested. - virtual void printDebugTracesAtStart(){}; - virtual void printDebugTracesAtEnd(){}; - + /// Allow subclasses to override and print debug traces before/after vplan + /// execution, when trace information is requested. + virtual void printDebugTracesAtStart(){}; + virtual void printDebugTracesAtEnd(){}; + /// The original loop. Loop *OrigLoop; @@ -820,7 +820,7 @@ protected: /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. - ElementCount VF; + ElementCount VF; /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. @@ -840,8 +840,8 @@ protected: /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; - /// The (unique) ExitBlock of the scalar loop. Note that - /// there can be multiple exiting edges reaching this block. 
+ /// The (unique) ExitBlock of the scalar loop. Note that + /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; /// The vector loop body. @@ -890,14 +890,14 @@ protected: // Vector of original scalar PHIs whose corresponding widened PHIs need to be // fixed up at the end of vector code generation. SmallVector<PHINode *, 8> OrigPHIsToFix; - - /// BFI and PSI are used to check for profile guided size optimizations. - BlockFrequencyInfo *BFI; - ProfileSummaryInfo *PSI; - - // Whether this loop should be optimized for size based on profile guided size - // optimizatios. - bool OptForSizeBasedOnProfile; + + /// BFI and PSI are used to check for profile guided size optimizations. + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; + + // Whether this loop should be optimized for size based on profile guided size + // optimizatios. + bool OptForSizeBasedOnProfile; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -908,11 +908,11 @@ public: const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - ElementCount::getFixed(1), UnrollFactor, LVL, CM, - BFI, PSI) {} + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), UnrollFactor, LVL, CM, + BFI, PSI) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -922,128 +922,128 @@ private: Value *reverseVector(Value *Vec) override; }; -/// Encapsulate information regarding vectorization of a loop and its epilogue. -/// This information is meant to be updated and used across two stages of -/// epilogue vectorization. -struct EpilogueLoopVectorizationInfo { - ElementCount MainLoopVF = ElementCount::getFixed(0); - unsigned MainLoopUF = 0; - ElementCount EpilogueVF = ElementCount::getFixed(0); - unsigned EpilogueUF = 0; - BasicBlock *MainLoopIterationCountCheck = nullptr; - BasicBlock *EpilogueIterationCountCheck = nullptr; - BasicBlock *SCEVSafetyCheck = nullptr; - BasicBlock *MemSafetyCheck = nullptr; - Value *TripCount = nullptr; - Value *VectorTripCount = nullptr; - - EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, - unsigned EUF) - : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), - EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { - assert(EUF == 1 && - "A high UF for the epilogue loop is likely not beneficial."); - } -}; - -/// An extension of the inner loop vectorizer that creates a skeleton for a -/// vectorized loop that has its epilogue (residual) also vectorized. -/// The idea is to run the vplan on a given loop twice, firstly to setup the -/// skeleton and vectorize the main loop, and secondly to complete the skeleton -/// from the first step and vectorize the epilogue. This is achieved by -/// deriving two concrete strategy classes from this base class and invoking -/// them in succession from the loop vectorizer planner. 
-class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { -public: - InnerLoopAndEpilogueVectorizer( - Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), - EPI(EPI) {} - - // Override this function to handle the more complex control flow around the - // three loops. - BasicBlock *createVectorizedLoopSkeleton() final override { - return createEpilogueVectorizedLoopSkeleton(); - } - - /// The interface for creating a vectorized skeleton using one of two - /// different strategies, each corresponding to one execution of the vplan - /// as described above. - virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; - - /// Holds and updates state information required to vectorize the main loop - /// and its epilogue in two separate passes. This setup helps us avoid - /// regenerating and recomputing runtime safety checks. It also helps us to - /// shorten the iteration-count-check path length for the cases where the - /// iteration count of the loop is so small that the main vector loop is - /// completely skipped. - EpilogueLoopVectorizationInfo &EPI; -}; - -/// A specialized derived class of inner loop vectorizer that performs -/// vectorization of *main* loops in the process of vectorizing loops and their -/// epilogues. -class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { -public: - EpilogueVectorizerMainLoop( - Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} - /// Implements the interface for creating a vectorized skeleton using the - /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; - -protected: - /// Emits an iteration count bypass check once for the main loop (when \p - /// ForEpilogue is false) and once for the epilogue loop (when \p - /// ForEpilogue is true). - BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, - bool ForEpilogue); - void printDebugTracesAtStart() override; - void printDebugTracesAtEnd() override; -}; - -// A specialized derived class of inner loop vectorizer that performs -// vectorization of *epilogue* loops in the process of vectorizing loops and -// their epilogues. 
-class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { -public: - EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, - LoopInfo *LI, DominatorTree *DT, - const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, - EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, - llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} - /// Implements the interface for creating a vectorized skeleton using the - /// *epilogue loop* strategy (ie the second pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; - -protected: - /// Emits an iteration count bypass check after the main vector loop has - /// finished to see if there are any iterations left to execute by either - /// the vector epilogue or the scalar epilogue. - BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, - BasicBlock *Bypass, - BasicBlock *Insert); - void printDebugTracesAtStart() override; - void printDebugTracesAtEnd() override; -}; +/// Encapsulate information regarding vectorization of a loop and its epilogue. +/// This information is meant to be updated and used across two stages of +/// epilogue vectorization. +struct EpilogueLoopVectorizationInfo { + ElementCount MainLoopVF = ElementCount::getFixed(0); + unsigned MainLoopUF = 0; + ElementCount EpilogueVF = ElementCount::getFixed(0); + unsigned EpilogueUF = 0; + BasicBlock *MainLoopIterationCountCheck = nullptr; + BasicBlock *EpilogueIterationCountCheck = nullptr; + BasicBlock *SCEVSafetyCheck = nullptr; + BasicBlock *MemSafetyCheck = nullptr; + Value *TripCount = nullptr; + Value *VectorTripCount = nullptr; + + EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, + unsigned EUF) + : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), + EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { + assert(EUF == 1 && + "A high UF for the epilogue loop is likely not beneficial."); + } +}; + +/// An extension of the inner loop vectorizer that creates a skeleton for a +/// vectorized loop that has its epilogue (residual) also vectorized. +/// The idea is to run the vplan on a given loop twice, firstly to setup the +/// skeleton and vectorize the main loop, and secondly to complete the skeleton +/// from the first step and vectorize the epilogue. This is achieved by +/// deriving two concrete strategy classes from this base class and invoking +/// them in succession from the loop vectorizer planner. +class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { +public: + InnerLoopAndEpilogueVectorizer( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), + EPI(EPI) {} + + // Override this function to handle the more complex control flow around the + // three loops. 
+ BasicBlock *createVectorizedLoopSkeleton() final override { + return createEpilogueVectorizedLoopSkeleton(); + } + + /// The interface for creating a vectorized skeleton using one of two + /// different strategies, each corresponding to one execution of the vplan + /// as described above. + virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + + /// Holds and updates state information required to vectorize the main loop + /// and its epilogue in two separate passes. This setup helps us avoid + /// regenerating and recomputing runtime safety checks. It also helps us to + /// shorten the iteration-count-check path length for the cases where the + /// iteration count of the loop is so small that the main vector loop is + /// completely skipped. + EpilogueLoopVectorizationInfo &EPI; +}; + +/// A specialized derived class of inner loop vectorizer that performs +/// vectorization of *main* loops in the process of vectorizing loops and their +/// epilogues. +class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerMainLoop( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *main loop* strategy (ie the first pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check once for the main loop (when \p + /// ForEpilogue is false) and once for the epilogue loop (when \p + /// ForEpilogue is true). + BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, + bool ForEpilogue); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; + +// A specialized derived class of inner loop vectorizer that performs +// vectorization of *epilogue* loops in the process of vectorizing loops and +// their epilogues. +class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, + EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, + llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *epilogue loop* strategy (ie the second pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check after the main vector loop has + /// finished to see if there are any iterations left to execute by either + /// the vector epilogue or the scalar epilogue. 
+ BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, + BasicBlock *Bypass, + BasicBlock *Insert); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; } // end namespace llvm /// Look for a meaningful debug location on the instruction or it's @@ -1070,9 +1070,9 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa<DbgInfoIntrinsic>(Inst)) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); - auto NewDIL = - DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -1126,15 +1126,15 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, return R; } -/// Return a value for Step multiplied by VF. -static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { - assert(isa<ConstantInt>(Step) && "Expected an integer step"); - Constant *StepVal = ConstantInt::get( - Step->getType(), - cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); - return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; -} - +/// Return a value for Step multiplied by VF. +static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { + assert(isa<ConstantInt>(Step) && "Expected an integer step"); + Constant *StepVal = ConstantInt::get( + Step->getType(), + cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); + return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; +} + namespace llvm { void reportVectorizationFailure(const StringRef DebugMsg, @@ -1206,10 +1206,10 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueNotAllowedLowTripLoop, // Loop hint predicate indicating an epilogue is undesired. - CM_ScalarEpilogueNotNeededUsePredicate, - - // Directive indicating we must either tail fold or not vectorize - CM_ScalarEpilogueNotAllowedUsePredicate + CM_ScalarEpilogueNotNeededUsePredicate, + + // Directive indicating we must either tail fold or not vectorize + CM_ScalarEpilogueNotAllowedUsePredicate }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1236,7 +1236,7 @@ public: /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); + Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -1246,13 +1246,13 @@ public: /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, - const LoopVectorizationPlanner &LVP); + VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); + VectorizationFactor + selectEpilogueVectorizationFactor(const ElementCount MaxVF, + const LoopVectorizationPlanner &LVP); /// Setup cost-based decisions for user vectorization factor. 
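  // A note on createStepForVF() above (illustrative): for Step = i64 2 and a
  // fixed VF of 4 it folds to the constant i64 8, while for a scalable VF of
  // 4 it returns the runtime value produced by B.CreateVScale(i64 8), i.e.
  // 8 * vscale, so callers can treat fixed and scalable steps uniformly.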
- void selectUserVectorizationFactor(ElementCount UserVF) { + void selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); } @@ -1266,7 +1266,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); + unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1275,7 +1275,7 @@ public: /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(ElementCount VF); + void setCostBasedWideningDecision(ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1290,16 +1290,16 @@ public: /// \return Returns information about the register usages of the loop for the /// given vectorization factors. - SmallVector<RegisterUsage, 8> - calculateRegisterUsage(ArrayRef<ElementCount> VFs); + SmallVector<RegisterUsage, 8> + calculateRegisterUsage(ArrayRef<ElementCount> VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); - /// Split reductions into those that happen in the loop, and those that happen - /// outside. In loop reductions are collected into InLoopReductionChains. - void collectInLoopReductions(); - + /// Split reductions into those that happen in the loop, and those that happen + /// outside. In loop reductions are collected into InLoopReductionChains. + void collectInLoopReductions(); + /// \returns The smallest bitwidth each instruction can be represented with. /// The vector equivalents of these instructions should be truncated to this /// type. @@ -1309,9 +1309,9 @@ public: /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { - assert(VF.isVector() && - "Profitable to scalarize relevant only for VF > 1."); + bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + assert(VF.isVector() && + "Profitable to scalarize relevant only for VF > 1."); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. @@ -1325,8 +1325,8 @@ public: } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { - if (VF.isScalar()) + bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1341,8 +1341,8 @@ public: } /// Returns true if \p I is known to be scalar after vectorization. - bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { - if (VF.isScalar()) + bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1358,8 +1358,8 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. 
- bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { - return VF.isVector() && MinBWs.find(I) != MinBWs.end() && + bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + return VF.isVector() && MinBWs.find(I) != MinBWs.end() && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1376,18 +1376,18 @@ public: /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, - InstructionCost Cost) { - assert(VF.isVector() && "Expected VF >=2"); + void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, + InstructionCost Cost) { + assert(VF.isVector() && "Expected VF >=2"); WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup<Instruction> *Grp, - ElementCount VF, InstWidening W, - InstructionCost Cost) { - assert(VF.isVector() && "Expected VF >=2"); + void setWideningDecision(const InterleaveGroup<Instruction> *Grp, + ElementCount VF, InstWidening W, + InstructionCost Cost) { + assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { @@ -1403,14 +1403,14 @@ public: /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, ElementCount VF) { - assert(VF.isVector() && "Expected VF to be a vector VF"); + InstWidening getWideningDecision(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); + std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1419,9 +1419,9 @@ public: /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - InstructionCost getWideningCost(Instruction *I, ElementCount VF) { - assert(VF.isVector() && "Expected VF >=2"); - std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); + InstructionCost getWideningCost(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF >=2"); + std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1430,7 +1430,7 @@ public: /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. - bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { + bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { // If the instruction is not a truncate, return false. 
auto *Trunc = dyn_cast<TruncInst>(I); if (!Trunc) @@ -1455,14 +1455,14 @@ public: /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(ElementCount VF); + void collectInstsToScalarize(ElementCount VF); /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(ElementCount VF) { + void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. - if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1513,8 +1513,8 @@ public: /// instructions that may divide by zero. /// If a non-zero VF has been calculated, we check if I will be scalarized /// predication for that VF. - bool isScalarWithPredication(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool isScalarWithPredication(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. @@ -1531,16 +1531,16 @@ public: /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool - memoryInstructionCanBeWidened(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool + memoryInstructionCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool - interleavedAccessCanBeWidened(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool + interleavedAccessCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1553,16 +1553,16 @@ public: return InterleaveInfo.getInterleaveGroup(Instr); } - /// Returns true if we're required to use a scalar epilogue for at least - /// the final iteration of the original loop. + /// Returns true if we're required to use a scalar epilogue for at least + /// the final iteration of the original loop. bool requiresScalarEpilogue() const { - if (!isScalarEpilogueAllowed()) - return false; - // If we might exit from anywhere but the latch, must run the exiting - // iteration in scalar form. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) - return true; - return InterleaveInfo.requiresScalarEpilogue(); + if (!isScalarEpilogueAllowed()) + return false; + // If we might exit from anywhere but the latch, must run the exiting + // iteration in scalar form. + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) + return true; + return InterleaveInfo.requiresScalarEpilogue(); } /// Returns true if a scalar epilogue is not allowed due to optsize or a @@ -1578,34 +1578,34 @@ public: return foldTailByMasking() || Legal->blockNeedsPredication(BB); } - /// A SmallMapVector to store the InLoop reduction op chains, mapping phi - /// nodes to the chain of instructions representing the reductions. Uses a - /// MapVector to ensure deterministic iteration order. 
- using ReductionChainMap = - SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; - - /// Return the chain of instructions representing an inloop reduction. - const ReductionChainMap &getInLoopReductionChains() const { - return InLoopReductionChains; - } - - /// Returns true if the Phi is part of an inloop reduction. - bool isInLoopReduction(PHINode *Phi) const { - return InLoopReductionChains.count(Phi); - } - + /// A SmallMapVector to store the InLoop reduction op chains, mapping phi + /// nodes to the chain of instructions representing the reductions. Uses a + /// MapVector to ensure deterministic iteration order. + using ReductionChainMap = + SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; + + /// Return the chain of instructions representing an inloop reduction. + const ReductionChainMap &getInLoopReductionChains() const { + return InLoopReductionChains; + } + + /// Returns true if the Phi is part of an inloop reduction. + bool isInLoopReduction(PHINode *Phi) const { + return InLoopReductionChains.count(Phi); + } + /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. - InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); + InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize); + InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize); /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { @@ -1620,8 +1620,8 @@ private: /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. - ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1630,54 +1630,54 @@ private: /// is /// false, then all operations will be scalarized (i.e. no vectorization has /// actually taken place). - using VectorizationCostTy = std::pair<InstructionCost, bool>; + using VectorizationCostTy = std::pair<InstructionCost, bool>; /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - VectorizationCostTy expectedCost(ElementCount VF); + VectorizationCostTy expectedCost(ElementCount VF); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. 
- InstructionCost getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy); - - /// Return the cost of instructions in an inloop reduction pattern, if I is - /// part of that pattern. - InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, - Type *VectorTy, - TTI::TargetCostKind CostKind); - + InstructionCost getInstructionCost(Instruction *I, ElementCount VF, + Type *&VectorTy); + + /// Return the cost of instructions in an inloop reduction pattern, if I is + /// part of that pattern. + InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, + Type *VectorTy, + TTI::TargetCostKind CostKind); + /// Calculate vectorization cost of memory instruction \p I. - InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); + InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); /// The cost computation for scalarized memory instruction. - InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); + InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); /// The cost computation for interleaving group of memory instructions. - InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); + InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); /// The cost computation for Gather/Scatter instruction. - InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); + InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. - InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); + InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. /// Store: scalar store + (loop invariant value stored? 0 : extract of last /// element) - InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); + InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF); + InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -1695,7 +1695,7 @@ private: /// A type representing the costs for instructions if they were to be /// scalarized rather than vectorized. The entries are Instruction-Cost /// pairs. - using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; + using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. @@ -1717,38 +1717,38 @@ private: /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; + DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. 
- DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. - DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; - - /// PHINodes of the reductions that should be expanded in-loop along with - /// their associated chains of reduction operations, in program order from top - /// (PHI) to bottom - ReductionChainMap InLoopReductionChains; - - /// A Map of inloop reduction operations and their immediate chain operand. - /// FIXME: This can be removed once reductions can be costed correctly in - /// vplan. This was added to allow quick lookup to the inloop operations, - /// without having to loop through InLoopReductionChains. - DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; - + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; + + /// PHINodes of the reductions that should be expanded in-loop along with + /// their associated chains of reduction operations, in program order from top + /// (PHI) to bottom + ReductionChainMap InLoopReductionChains; + + /// A Map of inloop reduction operations and their immediate chain operand. + /// FIXME: This can be removed once reductions can be costed correctly in + /// vplan. This was added to allow quick lookup to the inloop operations, + /// without having to loop through InLoopReductionChains. + DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, - ElementCount VF); + ElementCount VF); /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in @@ -1759,28 +1759,28 @@ private: /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(ElementCount VF); + void collectLoopUniforms(ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized /// during vectorization. Non-uniform scalarized instructions will be /// represented by VF values in the vectorized loop, each corresponding to an /// iteration of the original scalar loop. - void collectLoopScalars(ElementCount VF); + void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. 
- using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, - std::pair<InstWidening, InstructionCost>>; + using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, + std::pair<InstWidening, InstructionCost>>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, ElementCount VF) const { + bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast<Instruction>(V); - if (VF.isScalar() || !I || !TheLoop->contains(I) || - TheLoop->isLoopInvariant(I)) + if (VF.isScalar() || !I || !TheLoop->contains(I) || + TheLoop->isLoopInvariant(I)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1795,21 +1795,21 @@ private: /// Returns a range containing only operands needing to be extracted. SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, - ElementCount VF) { + ElementCount VF) { return SmallVector<Value *, 4>(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize loop \p L and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const Loop &L, - const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -1852,9 +1852,9 @@ public: /// Values to ignore in the cost model when VF > 1. SmallPtrSet<const Value *, 16> VecValuesToIgnore; - - /// Profitable vector factors. - SmallVector<VectorizationFactor, 8> ProfitableVFs; + + /// Profitable vector factors. + SmallVector<VectorizationFactor, 8> ProfitableVFs; }; } // end namespace llvm @@ -1875,7 +1875,7 @@ public: // representation for pragma 'omp simd' is introduced. static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE) { - assert(!OuterLp->isInnermost() && "This is not an outer loop"); + assert(!OuterLp->isInnermost() && "This is not an outer loop"); LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); // Only outer loops with an explicit vectorization hint are supported. @@ -1908,7 +1908,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, // now, only collect outer loops that have explicit vectorization hints. If we // are stress testing the VPlan H-CFG construction, we collect the outermost // loop of every loop nest. 
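  // For example, with -enable-vplan-native-path an outer loop annotated with
  //   #pragma clang loop vectorize(enable)
  // (which forces LoopVectorizeHints::FK_Enabled) is collected here, while
  // innermost loops are collected unconditionally. The pragma spelling is an
  // illustration; any mechanism that sets the forced-vectorization hint works.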
- if (L.isInnermost() || VPlanBuildStressTest || + if (L.isInnermost() || VPlanBuildStressTest || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { LoopBlocksRPO RPOT(&L); RPOT.perform(LI); @@ -2022,8 +2022,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { } void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( - const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal) { + const InductionDescriptor &II, Value *Step, Value *Start, + Instruction *EntryVal) { assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2055,8 +2055,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = - getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); + Value *ConstVF = + getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -2064,10 +2064,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - assert(!VF.isScalable() && "scalable vectors not yet supported."); - Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -2143,8 +2143,8 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc) { +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, + TruncInst *Trunc) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -2202,10 +2202,10 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *EntryPart = - getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, - ID.getInductionOpcode()); + getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, + ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -2215,7 +2215,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // Now do the actual transformations, and start with creating the step value. 
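  // Illustrative shape of the result, assuming a fixed VF = 4, UF = 2 and an
  // i32 induction %iv with step 1:
  //   part 0: <%iv, %iv + 1, %iv + 2, %iv + 3>
  //   part 1: part 0 + <4, 4, 4, 4>
  // and the vector phi advances by VF * UF * Step = 8 per vector iteration.
  // When scalar copies are also needed, buildScalarSteps() materializes lane
  // values as %iv + (Part * VF + Lane) * Step; e.g. part 1, lane 2 with
  // step 3 becomes %iv + 18.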
Value *Step = CreateStepValue(ID.getStep()); - if (VF.isZero() || VF.isScalar()) { + if (VF.isZero() || VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2226,7 +2226,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // least one user in the loop that is not widened. auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); return; } @@ -2234,7 +2234,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase @@ -2256,7 +2256,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps BinOp) { // Create and check the types. - auto *ValVTy = cast<FixedVectorType>(Val->getType()); + auto *ValVTy = cast<FixedVectorType>(Val->getType()); int VLen = ValVTy->getNumElements(); Type *STy = Val->getType()->getScalarType(); @@ -2313,7 +2313,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF.isVector() && "VF should be greater than one"); + assert(VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2335,27 +2335,27 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) - ? 1 - : VF.getKnownMinValue(); - assert((!VF.isScalable() || Lanes == 1) && - "Should never scalarize a scalable vector"); + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) + ? 1 + : VF.getKnownMinValue(); + assert((!VF.isScalable() || Lanes == 1) && + "Should never scalarize a scalable vector"); // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), - ScalarIVTy->getScalarSizeInBits()); - Value *StartIdx = - createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); - if (ScalarIVTy->isFloatingPointTy()) - StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); - StartIdx = addFastMathFlag(Builder.CreateBinOp( - AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); - // The step returned by `createStepForVF` is a runtime-evaluated value - // when VF is scalable. Otherwise, it should be folded into a Constant. 
- assert((VF.isScalable() || isa<Constant>(StartIdx)) && - "Expected StartIdx to be folded to a constant when VF is not " - "scalable"); + auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), + ScalarIVTy->getScalarSizeInBits()); + Value *StartIdx = + createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); + if (ScalarIVTy->isFloatingPointTy()) + StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); + StartIdx = addFastMathFlag(Builder.CreateBinOp( + AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); + // The step returned by `createStepForVF` is a runtime-evaluated value + // when VF is scalable. Otherwise, it should be folded into a Constant. + assert((VF.isScalable() || isa<Constant>(StartIdx)) && + "Expected StartIdx to be folded to a constant when VF is not " + "scalable"); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2389,7 +2389,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // If we aren't vectorizing, we can just copy the scalar map values over to // the vector map. - if (VF.isScalar()) { + if (VF.isScalar()) { VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); return ScalarValue; } @@ -2398,11 +2398,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) - ? 0 - : VF.getKnownMinValue() - 1; - assert((!VF.isScalable() || LastLane == 0) && - "Scalable vectorization can't lead to any scalarized values."); + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) + ? 0 + : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "Scalable vectorization can't lead to any scalarized values."); auto *LastInst = cast<Instruction>( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2423,11 +2423,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorValue = getBroadcastInstrs(ScalarValue); VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { - // Initialize packing with insertelements to start from poison. - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); - VectorLoopValueMap.setVectorValue(V, Part, Poison); - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + // Initialize packing with insertelements to start from poison. + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); + VectorLoopValueMap.setVectorValue(V, Part, Poison); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2466,7 +2466,7 @@ InnerLoopVectorizer::getOrCreateScalarValue(Value *V, // extractelement instruction. 
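  // For instance (assumed values), with VF = 4 and Instance = {Part = 1,
  // Lane = 2}, the widened value for part 1 is fetched and lane 2 is pulled
  // out with an extractelement; values that already have a scalar entry for
  // that (part, lane) are returned from the map without any extraction.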
auto *U = getOrCreateVectorValue(V, Instance.Part); if (!U->getType()->isVectorTy()) { - assert(VF.isScalar() && "Value not scalarized has non-vector type"); + assert(VF.isScalar() && "Value not scalarized has non-vector type"); return U; } @@ -2491,12 +2491,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); - assert(!VF.isScalable() && "Cannot reverse scalable vectors"); + assert(!VF.isScalable() && "Cannot reverse scalable vectors"); SmallVector<int, 8> ShuffleMask; - for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) - ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); - return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); + return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); } // Return whether we allow using masked interleave-groups (for dealing with @@ -2521,9 +2521,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // } // To: // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B -// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements -// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements -// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements +// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements // // Or translate following interleaved store group (factor = 3): // for (i = 0; i < N; i+=3) { @@ -2534,22 +2534,22 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // } // To: // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> -// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> +// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup( - const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, - VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask) { + const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, + VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, + VPValue *BlockInMask) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.isScalable() && "scalable vectors not yet supported."); - auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. SmallVector<Value *, 2> AddrParts; @@ -2565,10 +2565,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. 
- assert(!VF.isScalable() && - "scalable vector reverse operation is not implemented"); + assert(!VF.isScalable() && + "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); + Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2599,12 +2599,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } setDebugLocFromInst(Builder, Instr); - Value *PoisonVec = PoisonValue::get(VecTy); + Value *PoisonVec = PoisonValue::get(VecTy); Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); - MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2620,11 +2620,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *GroupMask = MaskForGaps; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) @@ -2632,7 +2632,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } NewLoad = Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), - GroupMask, PoisonVec, "wide.masked.vec"); + GroupMask, PoisonVec, "wide.masked.vec"); } else NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], @@ -2643,7 +2643,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // For each member in the group, shuffle out the appropriate data from the // wide loads. - unsigned J = 0; + unsigned J = 0; for (unsigned I = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); @@ -2651,33 +2651,33 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - assert(!VF.isScalable() && "scalable vectors not yet supported."); - auto StrideMask = - createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto StrideMask = + createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( - NewLoads[Part], StrideMask, "strided.vec"); + NewLoads[Part], StrideMask, "strided.vec"); // If this member has different type, cast the result type. 
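  // Concretely, for the factor-3 / VF = 4 example in the comment above the
  // function, the masks used here evaluate to:
  //   createStrideMask(0, 3, 4)   -> <0, 3, 6, 9>    (R elements)
  //   createStrideMask(1, 3, 4)   -> <1, 4, 7, 10>   (G elements)
  //   createStrideMask(2, 3, 4)   -> <2, 5, 8, 11>   (B elements)
  //   createReplicatedMask(3, 4)  -> <0,0,0, 1,1,1, 2,2,2, 3,3,3>
  //   createInterleaveMask(4, 3)  -> <0,4,8, 1,5,9, 2,6,10, 3,7,11>
  // matching the R/G/B load and store illustrations given earlier.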
if (Member->getType() != ScalarTy) { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } if (Group->isReverse()) StridedVec = reverseVector(StridedVec); - State.set(VPDefs[J], Member, StridedVec, Part); + State.set(VPDefs[J], Member, StridedVec, Part); } - ++J; + ++J; } return; } // The sub vector type for current instruction. - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - auto *SubVT = VectorType::get(ScalarTy, VF); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. for (unsigned Part = 0; Part < UF; Part++) { @@ -2685,10 +2685,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( SmallVector<Value *, 4> StoredVecs; for (unsigned i = 0; i < InterleaveFactor; i++) { // Interleaved store group doesn't allow a gap, so each index has a member - assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); - - Value *StoredVec = State.get(StoredValues[i], Part); + assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); + Value *StoredVec = State.get(StoredValues[i], Part); + if (Group->isReverse()) StoredVec = reverseVector(StoredVec); @@ -2704,17 +2704,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( - WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), + WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); @@ -2727,9 +2727,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::vectorizeMemoryInstruction( - Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask) { +void InnerLoopVectorizer::vectorizeMemoryInstruction( + Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask) { // Attempt to issue a wide load. 
LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2746,8 +2746,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - - auto *DataTy = VectorType::get(ScalarDataTy, VF); + + auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2779,23 +2779,23 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( InBounds = gep->isInBounds(); if (Reverse) { - assert(!VF.isScalable() && - "Reversing vectors is not yet supported for scalable vectors."); - + assert(!VF.isScalable() && + "Reversing vectors is not yet supported for scalable vectors."); + // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. - PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); + PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( - ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); + PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( + ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { - Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); + Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); PartPtr->setIsInBounds(InBounds); } @@ -2850,7 +2850,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad( - VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), + VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); else NewLI = @@ -2861,8 +2861,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( if (Reverse) NewLI = reverseVector(NewLI); } - - State.set(Def, Instr, NewLI, Part); + + State.set(Def, Instr, NewLI, Part); } } @@ -2872,12 +2872,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for - // the first lane and part. - if (isa<NoAliasScopeDeclInst>(Instr)) - if (Instance.Lane != 0 || Instance.Part != 0) - return; - + // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for + // the first lane and part. + if (isa<NoAliasScopeDeclInst>(Instr)) + if (Instance.Lane != 0 || Instance.Part != 0) + return; + setDebugLocFromInst(Builder, Instr); // Does this instruction return a value ? @@ -2890,12 +2890,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. 
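  // For example (assumed instance), when cloning lane {Part = 1, Lane = 2} of
  // a scalarized instruction, an operand that is not an in-loop instruction
  // or that is uniform after vectorization is looked up at {Part = 1,
  // Lane = 0}, since only one scalar copy of it exists per part; all other
  // operands are looked up at the same {1, 2} instance.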
for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { - auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); - auto InputInstance = Instance; - if (!Operand || !OrigLoop->contains(Operand) || - (Cost->isUniformAfterVectorization(Operand, State.VF))) - InputInstance.Lane = 0; - auto *NewOp = State.get(User.getOperand(op), InputInstance); + auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); + auto InputInstance = Instance; + if (!Operand || !OrigLoop->contains(Operand) || + (Cost->isUniformAfterVectorization(Operand, State.VF))) + InputInstance.Lane = 0; + auto *NewOp = State.get(User.getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2903,9 +2903,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - // TODO: Set result for VPValue of VPReciplicateRecipe. This requires - // representing scalar values in VPTransformState. Add the cloned scalar to - // the scalar map entry. + // TODO: Set result for VPValue of VPReciplicateRecipe. This requires + // representing scalar values in VPTransformState. Add the cloned scalar to + // the scalar map entry. VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); // If we just cloned a new assumption, add it the assumption cache. @@ -2942,7 +2942,7 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, Induction->addIncoming(Next, Latch); // Create the compare. Value *ICmp = Builder.CreateICmpEQ(Next, End); - Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); + Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); // Now we have two terminators. Remove the old one from the block. Latch->getTerminator()->eraseFromParent(); @@ -2959,7 +2959,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { // Find the loop boundaries. ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && + assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); Type *IdxTy = Legal->getWidestInductionType(); @@ -3005,8 +3005,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Type *Ty = TC->getType(); - // This is where we can make the step a runtime constant. - Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); + // This is where we can make the step a runtime constant. + Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -3015,12 +3015,12 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. 
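  // Worked example (assumed values), with VF = 4 and UF = 2, so Step = 8:
  //  * Tail folded by masking, N = 10: N is first bumped to 10 + 7 = 17,
  //    17 urem 8 = 1, so the vector trip count is 16 and two masked vector
  //    iterations cover all 10 original iterations.
  //  * No tail folding, N = 10: 10 urem 8 = 2, vector trip count 8, and the
  //    scalar remainder loop runs the last 2 iterations.
  //  * Scalar epilogue required, N = 16: the remainder would be 0, so it is
  //    bumped to Step = 8, leaving a vector trip count of 8 and guaranteeing
  //    the final iterations run in the scalar loop.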
if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && + assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - assert(!VF.isScalable() && - "Tail folding not yet supported for scalable vectors"); - TC = Builder.CreateAdd( - TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); + assert(!VF.isScalable() && + "Tail folding not yet supported for scalable vectors"); + TC = Builder.CreateAdd( + TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -3030,18 +3030,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // unroll factor (number of SIMD instructions). Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); - // There are two cases where we need to ensure (at least) the last iteration - // runs in the scalar remainder loop. Thus, if the step evenly divides + // There are two cases where we need to ensure (at least) the last iteration + // runs in the scalar remainder loop. Thus, if the step evenly divides // the trip count, we set the remainder to be equal to the step. If the step // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations - // check ensures that N >= Step. The cases are: - // 1) If there is a non-reversed interleaved group that may speculatively - // access memory out-of-bounds. - // 2) If any instruction may follow a conditionally taken exit. That is, if - // the loop contains multiple exiting blocks, or a single exiting block - // which is not the latch. - if (VF.isVector() && Cost->requiresScalarEpilogue()) { + // check ensures that N >= Step. The cases are: + // 1) If there is a non-reversed interleaved group that may speculatively + // access memory out-of-bounds. + // 2) If any instruction may follow a conditionally taken exit. That is, if + // the loop contains multiple exiting blocks, or a single exiting block + // which is not the latch. + if (VF.isVector() && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -3054,18 +3054,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - auto *DstFVTy = cast<FixedVectorType>(DstVTy); - unsigned VF = DstFVTy->getNumElements(); - auto *SrcVecTy = cast<FixedVectorType>(V->getType()); + auto *DstFVTy = cast<FixedVectorType>(DstVTy); + unsigned VF = DstFVTy->getNumElements(); + auto *SrcVecTy = cast<FixedVectorType>(V->getType()); assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); - Type *DstElemTy = DstFVTy->getElementType(); + Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"); // Do a direct cast if element types are castable. if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { - return Builder.CreateBitOrPointerCast(V, DstFVTy); + return Builder.CreateBitOrPointerCast(V, DstFVTy); } // V cannot be directly casted to desired vector type. 
// May happen when V is a floating point vector but DstVTy is a vector of @@ -3079,7 +3079,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); auto *VecIntTy = FixedVectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); - return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); + return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, @@ -3100,11 +3100,11 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) { - Value *Step = - createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); - CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); - } + if (!Cost->foldTailByMasking()) { + Value *Step = + createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); + CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + } // Create new preheader for vector loop. LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -3141,9 +3141,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { if (C->isZero()) return; - assert(!(SCEVCheckBlock->getParent()->hasOptSize() || - (OptForSizeBasedOnProfile && - Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && + assert(!(SCEVCheckBlock->getParent()->hasOptSize() || + (OptForSizeBasedOnProfile && + Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); @@ -3182,7 +3182,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (!RtPtrChecking.Need) return; - if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { + if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -3202,33 +3202,33 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, "vector.ph"); - auto *CondBranch = cast<BranchInst>( - Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); - ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); - LoopBypassBlocks.push_back(MemCheckBlock); - AddedSafetyChecks = true; - + auto *CondBranch = cast<BranchInst>( + Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); + ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); + LoopBypassBlocks.push_back(MemCheckBlock); + AddedSafetyChecks = true; + // Update dominator only if this is first RT check. 
if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, MemCheckBlock); DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); } - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, - RtPtrChecking.getChecks(), RtPtrChecking.getSE()); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); - CondBranch->setCondition(MemRuntimeCheck); + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, + RtPtrChecking.getChecks(), RtPtrChecking.getSE()); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + CondBranch->setCondition(MemRuntimeCheck); // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. - LVer = std::make_unique<LoopVersioning>( - *Legal->getLAI(), - Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, - DT, PSE.getSE()); + LVer = std::make_unique<LoopVersioning>( + *Legal->getLAI(), + Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, + DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -3332,35 +3332,35 @@ Value *InnerLoopVectorizer::emitTransformedIndex( llvm_unreachable("invalid enum"); } -Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { +Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); - LoopExitBlock = OrigLoop->getUniqueExitBlock(); + LoopExitBlock = OrigLoop->getUniqueExitBlock(); assert(LoopExitBlock && "Must have an exit block"); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, Twine(Prefix) + "middle.block"); + LI, nullptr, Twine(Prefix) + "middle.block"); LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, - nullptr, Twine(Prefix) + "scalar.ph"); - - // Set up branch from middle block to the exit and scalar preheader blocks. - // completeLoopSkeleton will update the condition to use an iteration check, - // if required to decide whether to execute the remainder. - BranchInst *BrInst = - BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); - auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); - BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); - ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); - + nullptr, Twine(Prefix) + "scalar.ph"); + + // Set up branch from middle block to the exit and scalar preheader blocks. + // completeLoopSkeleton will update the condition to use an iteration check, + // if required to decide whether to execute the remainder. + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); + // We intentionally don't let SplitBlock to update LoopInfo since // LoopVectorBody should belong to another loop than LoopVectorPreHeader. // LoopVectorBody is explicitly added to the correct place few lines later. 
LoopVectorBody = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - nullptr, nullptr, Twine(Prefix) + "vector.body"); + nullptr, nullptr, Twine(Prefix) + "vector.body"); // Update dominator for loop exit. DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); @@ -3377,16 +3377,16 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(LoopVectorBody, *LI); - return Lp; -} - -void InnerLoopVectorizer::createInductionResumeValues( - Loop *L, Value *VectorTripCount, - std::pair<BasicBlock *, Value *> AdditionalBypass) { - assert(VectorTripCount && L && "Expected valid arguments"); - assert(((AdditionalBypass.first && AdditionalBypass.second) || - (!AdditionalBypass.first && !AdditionalBypass.second)) && - "Inconsistent information about additional bypass."); + return Lp; +} + +void InnerLoopVectorizer::createInductionResumeValues( + Loop *L, Value *VectorTripCount, + std::pair<BasicBlock *, Value *> AdditionalBypass) { + assert(VectorTripCount && L && "Expected valid arguments"); + assert(((AdditionalBypass.first && AdditionalBypass.second) || + (!AdditionalBypass.first && !AdditionalBypass.second)) && + "Inconsistent information about additional bypass."); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3405,31 +3405,31 @@ void InnerLoopVectorizer::createInductionResumeValues( // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; - Value *EndValueFromAdditionalBypass = AdditionalBypass.second; + Value *EndValueFromAdditionalBypass = AdditionalBypass.second; if (OrigPhi == OldInduction) { // We know what the end value is. - EndValue = VectorTripCount; + EndValue = VectorTripCount; } else { - IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(VectorTripCount, true, StepType, true); - Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); + CastInst::getCastOpcode(VectorTripCount, true, StepType, true); + Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); - - // Compute the end value for the additional bypass (if applicable). - if (AdditionalBypass.first) { - B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); - CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, - StepType, true); - CRD = - B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); - EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); - EndValueFromAdditionalBypass->setName("ind.end"); - } + + // Compute the end value for the additional bypass (if applicable). 
+ if (AdditionalBypass.first) { + B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); + CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, + StepType, true); + CRD = + B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); + EndValueFromAdditionalBypass = + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValueFromAdditionalBypass->setName("ind.end"); + } } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. @@ -3440,44 +3440,44 @@ void InnerLoopVectorizer::createInductionResumeValues( // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - - if (AdditionalBypass.first) - BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, - EndValueFromAdditionalBypass); - + + if (AdditionalBypass.first) + BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, + EndValueFromAdditionalBypass); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } -} +} -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, - MDNode *OrigLoopID) { - assert(L && "Expected valid loop."); - - // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(L); - Value *VectorTripCount = getOrCreateVectorTripCount(L); - - auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, + MDNode *OrigLoopID) { + assert(L && "Expected valid loop."); + // The trip counts should be cached by now. + Value *Count = getOrCreateTripCount(L); + Value *VectorTripCount = getOrCreateVectorTripCount(L); + + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. // If tail is to be folded, we know we don't need to run the remainder. if (!Cost->foldTailByMasking()) { - Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - Count, VectorTripCount, "cmp.n", - LoopMiddleBlock->getTerminator()); + Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + Count, VectorTripCount, "cmp.n", + LoopMiddleBlock->getTerminator()); - // Here we use the same DebugLoc as the scalar loop latch terminator instead + // Here we use the same DebugLoc as the scalar loop latch terminator instead // of the corresponding compare because they may have ended up with // different line numbers and we want to avoid awkward line stepping while // debugging. Eg. if the compare has got a line number inside the loop. - CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); - cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); + CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); + cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); } // Get ready to start creating new instructions into the vectorized body. 
- assert(LoopVectorPreHeader == L->getLoopPreheader() && + assert(LoopVectorPreHeader == L->getLoopPreheader() && "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); @@ -3485,7 +3485,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); if (VectorizedLoopID.hasValue()) { - L->setLoopID(VectorizedLoopID.getValue()); + L->setLoopID(VectorizedLoopID.getValue()); // Do not setAlreadyVectorized if loop attributes have been defined // explicitly. @@ -3495,9 +3495,9 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); + L->setLoopID(LID); - LoopVectorizeHints Hints(L, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3508,91 +3508,91 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, return LoopVectorPreHeader; } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { - /* - In this function we generate a new loop. The new loop will contain - the vectorized instructions while the old loop will continue to run the - scalar remainder. - - [ ] <-- loop iteration number check. - / | - / v - | [ ] <-- vector loop bypass (may consist of multiple blocks). - | / | - | / v - || [ ] <-- vector pre header. - |/ | - | v - | [ ] \ - | [ ]_| <-- vector loop. - | | - | v - | -[ ] <--- middle-block. - | / | - | / v - -|- >[ ] <--- new preheader. - | | - | v - | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v - >[ ] <-- exit block. - ... - */ - - // Get the metadata of the original loop before it gets modified. - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - // Create an empty vector loop, and prepare basic blocks for the runtime - // checks. - Loop *Lp = createVectorLoopSkeleton(""); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. This check also covers the case where the - // backedge-taken count is uint##_max: adding one to it will overflow leading - // to an incorrect trip count of zero. In this (rare) case we will also jump - // to the scalar loop. - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); - - // Generate the code to check any assumptions that we've made for SCEV - // expressions. - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - // - // We try to obtain an induction variable from the original loop as hard - // as possible. However if we don't find one that: - // - is an integer - // - counts from zero, stepping by one - // - is the size of the widest induction variable type - // then we create a new one. 
- OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); - Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - - // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp, CountRoundDown); - - return completeLoopSkeleton(Lp, OrigLoopID); -} - +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { + /* + In this function we generate a new loop. The new loop will contain + the vectorized instructions while the old loop will continue to run the + scalar remainder. + + [ ] <-- loop iteration number check. + / | + / v + | [ ] <-- vector loop bypass (may consist of multiple blocks). + | / | + | / v + || [ ] <-- vector pre header. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. + | / | + | / v + -|- >[ ] <--- new preheader. + | | + | v + | [ ] \ + | [ ]_| <-- old scalar loop to handle remainder. + \ | + \ v + >[ ] <-- exit block. + ... + */ + + // Get the metadata of the original loop before it gets modified. + MDNode *OrigLoopID = OrigLoop->getLoopID(); + + // Create an empty vector loop, and prepare basic blocks for the runtime + // checks. + Loop *Lp = createVectorLoopSkeleton(""); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. This check also covers the case where the + // backedge-taken count is uint##_max: adding one to it will overflow leading + // to an incorrect trip count of zero. In this (rare) case we will also jump + // to the scalar loop. + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // Some loops have a single integer induction variable, while other loops + // don't. One example is c++ iterators that often have multiple pointer + // induction variables. In the code below we also support a case where we + // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. + OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). 
+ Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); + Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Emit phis for the new starting index of the scalar loop. + createInductionResumeValues(Lp, CountRoundDown); + + return completeLoopSkeleton(Lp, OrigLoopID); +} + // Fix up external users of the induction variable. At this point, we are // in LCSSA form, with all external PHIs that use the IV having one input value, // coming from the remainder loop. We need those PHIs to also have a correct @@ -3606,7 +3606,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // value (the value that feeds into the phi from the loop latch). // We allow both, but they, obviously, have different values. - assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); + assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); DenseMap<Value *, Value *> MissingVals; @@ -3712,10 +3712,10 @@ static void cse(BasicBlock *BB) { } } -InstructionCost -LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); +InstructionCost +LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; @@ -3726,9 +3726,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // to be vectors, so we need to extract individual elements from there, // execute VF scalar calls, and then gather the result into the vector return // value. - InstructionCost ScalarCallCost = - TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); - if (VF.isScalar()) + InstructionCost ScalarCallCost = + TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); + if (VF.isScalar()) return ScalarCallCost; // Compute corresponding vector type for return value and arguments. @@ -3738,33 +3738,33 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); + InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); - InstructionCost Cost = - ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; + InstructionCost Cost = + ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. NeedToScalarize = true; - VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); + VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); if (!TLI || CI->isNoBuiltin() || !VecFunc) return Cost; // If the corresponding vector cost is cheaper, return its cost. 
- InstructionCost VectorCallCost = - TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); + InstructionCost VectorCallCost = + TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); if (VectorCallCost < Cost) { NeedToScalarize = false; - Cost = VectorCallCost; + Cost = VectorCallCost; } return Cost; } -InstructionCost -LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, + ElementCount VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -3804,8 +3804,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), KV.second); auto *TruncatedTy = FixedVectorType::get( - ScalarTruncatedTy, - cast<FixedVectorType>(OriginalTy)->getNumElements()); + ScalarTruncatedTy, + cast<FixedVectorType>(OriginalTy)->getNumElements()); if (TruncatedTy == OriginalTy) continue; @@ -3855,13 +3855,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { break; } } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { - auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) - ->getNumElements(); + auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( SI->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) - ->getNumElements(); + auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) + ->getNumElements(); auto *O1 = B.CreateZExtOrTrunc( SI->getOperand(1), FixedVectorType::get(ScalarTruncatedTy, Elements1)); @@ -3871,16 +3871,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { - auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) - ->getNumElements(); + auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( IE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { - auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) - ->getNumElements(); + auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( EE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); @@ -3922,7 +3922,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. - if (VF.isVector()) + if (VF.isVector()) truncateToMinimalBitwidths(); // Fix widened non-induction PHIs by setting up the PHI operands. @@ -3963,13 +3963,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. - // - // For scalable vectorization we can't know at compile time how many iterations - // of the loop are handled in one vector iteration, so instead assume a pessimistic - // vscale of '1'. 
- setProfileInfoAfterUnrolling( - LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); + // + // For scalable vectorization we can't know at compile time how many iterations + // of the loop are handled in one vector iteration, so instead assume a pessimistic + // vscale of '1'. + setProfileInfoAfterUnrolling( + LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), + LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -4048,12 +4048,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Create a vector from the initial value. auto *VectorInit = ScalarInit; - if (VF.isVector()) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - assert(!VF.isScalable() && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorInit = Builder.CreateInsertElement( - PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, - Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); + PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, + Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -4094,11 +4094,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - assert(!VF.isScalable()); - SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); - ShuffleMask[0] = VF.getKnownMinValue() - 1; - for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) - ShuffleMask[I] = I + VF.getKnownMinValue() - 1; + assert(!VF.isScalable()); + SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); + ShuffleMask[0] = VF.getKnownMinValue() - 1; + for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) + ShuffleMask[I] = I + VF.getKnownMinValue() - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -4108,10 +4108,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = - VF.isVector() - ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) - : Incoming; + auto *Shuffle = + VF.isVector() + ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast<Instruction>(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -4124,11 +4124,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. 
auto *ExtractForScalar = Incoming; - if (VF.isVector()) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), - "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), + "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -4136,10 +4136,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // will be the value when jumping to the exit block from the LoopMiddleBlock, // when the scalar loop is not run at all. Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF.isVector()) + if (VF.isVector()) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), - "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), + "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -4163,13 +4163,13 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // vector recurrence we extracted in the middle block. Since the loop is in // LCSSA form, we just need to find all the phi nodes for the original scalar // recurrence in the exit block, and then add an edge for the middle block. - // Note that LCSSA does not imply single entry when the original scalar loop - // had multiple exiting edges (as we always run the last iteration in the - // scalar epilogue); in that case, the exiting path through middle will be - // dynamically dead and the value picked for the phi doesn't matter. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (any_of(LCSSAPhi.incoming_values(), - [Phi](Value *V) { return V == Phi; })) + // Note that LCSSA does not imply single entry when the original scalar loop + // had multiple exiting edges (as we always run the last iteration in the + // scalar epilogue); in that case, the exiting path through middle will be + // dynamically dead and the value picked for the phi doesn't matter. + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + if (any_of(LCSSAPhi.incoming_values(), + [Phi](Value *V) { return V == Phi; })) LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); } @@ -4179,11 +4179,11 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; - RecurKind RK = RdxDesc.getRecurrenceKind(); + RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); setDebugLocFromInst(Builder, ReductionStartValue); - bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); + bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); // This is the vector-clone of the value that leaves the loop. Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); @@ -4215,9 +4215,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, - // instead of the former. 
For an inloop reduction the reduction will already - // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { + // instead of the former. For an inloop reduction the reduction will already + // be predicated, and does not need to be handled here. + if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); @@ -4231,31 +4231,31 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } assert(Sel && "Reduction exit feeds no select"); VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); - - // If the target can create a predicated operator for the reduction at no - // extra cost in the loop (for example a predicated vadd), it can be - // cheaper for the select to remain in the loop than be sunk out of it, - // and so use the select value for the phi instead of the old - // LoopExitValue. - RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; - if (PreferPredicatedReductionSelect || - TTI->preferPredicatedReductionSelect( - RdxDesc.getOpcode(), Phi->getType(), - TargetTransformInfo::ReductionFlags())) { - auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); - VecRdxPhi->setIncomingValueForBlock( - LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); - } + + // If the target can create a predicated operator for the reduction at no + // extra cost in the loop (for example a predicated vadd), it can be + // cheaper for the select to remain in the loop than be sunk out of it, + // and so use the select value for the phi instead of the old + // LoopExitValue. + RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; + if (PreferPredicatedReductionSelect || + TTI->preferPredicatedReductionSelect( + RdxDesc.getOpcode(), Phi->getType(), + TargetTransformInfo::ReductionFlags())) { + auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); + VecRdxPhi->setIncomingValueForBlock( + LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); + } } } // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. - if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { - assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); - assert(!VF.isScalable() && "scalable vectors not yet supported."); - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { + assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -4282,7 +4282,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); - unsigned Op = RecurrenceDescriptor::getOpcode(RK); + unsigned Op = RecurrenceDescriptor::getOpcode(RK); // The middle block terminator has already been assigned a DebugLoc here (the // OrigLoop's single latch terminator). 
We want the whole middle block to @@ -4301,14 +4301,14 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { ReducedPartRdx, "bin.rdx"), RdxDesc.getFastMathFlags()); else - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } - // Create the reduction after the loop. Note that inloop reductions create the - // target reduction in the loop using a Reduction recipe. - if (VF.isVector() && !IsInLoopReductionPhi) { + // Create the reduction after the loop. Note that inloop reductions create the + // target reduction in the loop using a Reduction recipe. + if (VF.isVector() && !IsInLoopReductionPhi) { ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (Phi->getType() != RdxDesc.getRecurrenceType()) @@ -4329,16 +4329,16 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. - // We know that the loop is in LCSSA form. We need to update the PHI nodes - // in the exit blocks. See comment on analogous loop in - // fixFirstOrderRecurrence for a more complete explaination of the logic. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (any_of(LCSSAPhi.incoming_values(), - [LoopExitInst](Value *V) { return V == LoopExitInst; })) + // We know that the loop is in LCSSA form. We need to update the PHI nodes + // in the exit blocks. See comment on analogous loop in + // fixFirstOrderRecurrence for a more complete explaination of the logic. + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + if (any_of(LCSSAPhi.incoming_values(), + [LoopExitInst](Value *V) { return V == LoopExitInst; })) LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. int IncomingEdgeBlockIdx = Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); @@ -4350,8 +4350,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { void InnerLoopVectorizer::clearReductionWrapFlags( RecurrenceDescriptor &RdxDesc) { - RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RK != RecurKind::Add && RK != RecurKind::Mul) + RecurKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurKind::Add && RK != RecurKind::Mul) return; Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); @@ -4380,27 +4380,27 @@ void InnerLoopVectorizer::clearReductionWrapFlags( void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) - // Some phis were already hand updated by the reduction and recurrence - // code above, leave them alone. - continue; - - auto *IncomingValue = LCSSAPhi.getIncomingValue(0); - // Non-instruction incoming values will have only one value. - unsigned LastLane = 0; - if (isa<Instruction>(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast<Instruction>(IncomingValue), VF) - ? 
0 - : VF.getKnownMinValue() - 1; - assert((!VF.isScalable() || LastLane == 0) && - "scalable vectors dont support non-uniform scalars yet"); - // Can be a loop invariant incoming value or the last scalar value to be - // extracted from the vectorized loop. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Value *lastIncomingValue = - getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); - LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); + if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) + // Some phis were already hand updated by the reduction and recurrence + // code above, leave them alone. + continue; + + auto *IncomingValue = LCSSAPhi.getIncomingValue(0); + // Non-instruction incoming values will have only one value. + unsigned LastLane = 0; + if (isa<Instruction>(IncomingValue)) + LastLane = Cost->isUniformAfterVectorization( + cast<Instruction>(IncomingValue), VF) + ? 0 + : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "scalable vectors dont support non-uniform scalars yet"); + // Can be a loop invariant incoming value or the last scalar value to be + // extracted from the vectorized loop. + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + Value *lastIncomingValue = + getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); + LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } } @@ -4504,9 +4504,9 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, - VPUser &Operands, unsigned UF, - ElementCount VF, bool IsPtrLoopInvariant, +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, + VPUser &Operands, unsigned UF, + ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State) { // Construct a vector GEP by widening the operands of the scalar GEP as @@ -4515,7 +4515,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. Thus, to ensure we @@ -4531,7 +4531,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, auto *Clone = Builder.Insert(GEP->clone()); for (unsigned Part = 0; Part < UF; ++Part) { Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - State.set(VPDef, GEP, EntryPart, Part); + State.set(VPDef, GEP, EntryPart, Part); addMetadata(EntryPart, GEP); } } else { @@ -4566,19 +4566,19 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, ? 
Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, Indices) : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && + assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); - State.set(VPDef, GEP, NewGEP, Part); + State.set(VPDef, GEP, NewGEP, Part); addMetadata(NewGEP, GEP); } } } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - RecurrenceDescriptor *RdxDesc, - Value *StartV, unsigned UF, - ElementCount VF) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, + RecurrenceDescriptor *RdxDesc, + Value *StartV, unsigned UF, + ElementCount VF) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); PHINode *P = cast<PHINode>(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction @@ -4586,7 +4586,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. Type *VecTy = - (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); + (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4601,60 +4601,60 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. - if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { - Value *Iden = nullptr; - bool ScalarPHI = - (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); - Type *VecTy = - ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); - - if (RdxDesc) { - assert(Legal->isReductionVariable(P) && StartV && - "RdxDesc should only be set for reduction variables; in that case " - "a StartV is also required"); - RecurKind RK = RdxDesc->getRecurrenceKind(); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { - // MinMax reduction have the start value as their identify. - if (ScalarPHI) { - Iden = StartV; - } else { - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); - } - } else { - Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType()); - Iden = IdenC; - - if (!ScalarPHI) { - Iden = ConstantVector::getSplat(VF, IdenC); - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - Constant *Zero = Builder.getInt32(0); - StartV = Builder.CreateInsertElement(Iden, StartV, Zero); - } - } - } - + if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { + Value *Iden = nullptr; + bool ScalarPHI = + (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); + Type *VecTy = + ScalarPHI ? 
PN->getType() : VectorType::get(PN->getType(), VF); + + if (RdxDesc) { + assert(Legal->isReductionVariable(P) && StartV && + "RdxDesc should only be set for reduction variables; in that case " + "a StartV is also required"); + RecurKind RK = RdxDesc->getRecurrenceKind(); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { + // MinMax reduction have the start value as their identify. + if (ScalarPHI) { + Iden = StartV; + } else { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); + } + } else { + Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( + RK, VecTy->getScalarType()); + Iden = IdenC; + + if (!ScalarPHI) { + Iden = ConstantVector::getSplat(VF, IdenC); + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + Constant *Zero = Builder.getInt32(0); + StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + } + } + } + for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); - if (StartV) { - // Make sure to add the reduction start value only to the - // first unroll part. - Value *StartVal = (Part == 0) ? StartV : Iden; - cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); - } + if (StartV) { + // Make sure to add the reduction start value only to the + // first unroll part. + Value *StartVal = (Part == 0) ? StartV : Iden; + cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); + } } return; } - assert(!Legal->isReductionVariable(P) && - "reductions should be handled above"); - + assert(!Legal->isReductionVariable(P) && + "reductions should be handled above"); + setDebugLocFromInst(Builder, P); // This PHINode must be an induction variable. @@ -4675,74 +4675,74 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); - - if (Cost->isScalarAfterVectorization(P, VF)) { - // This is the normalized GEP that starts counting at zero. - Value *PtrInd = - Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - unsigned Lanes = - Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); - for (unsigned Part = 0; Part < UF; ++Part) { - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = ConstantInt::get(PtrInd->getType(), - Lane + Part * VF.getKnownMinValue()); - Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); - SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); - } + + if (Cost->isScalarAfterVectorization(P, VF)) { + // This is the normalized GEP that starts counting at zero. + Value *PtrInd = + Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. 
Otherwise, we generate all VF values. + unsigned Lanes = + Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); + for (unsigned Part = 0; Part < UF; ++Part) { + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Constant *Idx = ConstantInt::get(PtrInd->getType(), + Lane + Part * VF.getKnownMinValue()); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); + Value *SclrGep = + emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + SclrGep->setName("next.gep"); + VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); + } } - return; - } - assert(isa<SCEVConstant>(II.getStep()) && - "Induction step not a SCEV constant!"); - Type *PhiType = II.getStep()->getType(); - - // Build a pointer phi - Value *ScalarStartValue = II.getStartValue(); - Type *ScStValueType = ScalarStartValue->getType(); - PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); - NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); - - // A pointer induction, performed by using a gep - BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - Instruction *InductionLoc = LoopLatch->getTerminator(); - const SCEV *ScalarStep = II.getStep(); - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Value *ScalarStepValue = - Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); - Value *InductionGEP = GetElementPtrInst::Create( - ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul( - ScalarStepValue, - ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), - "ptr.ind", InductionLoc); - NewPointerPhi->addIncoming(InductionGEP, LoopLatch); - - // Create UF many actual address geps that use the pointer - // phi as base and a vectorized version of the step value - // (<step*0, ..., step*N>) as offset. - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Constant *, 8> Indices; - // Create a vector of consecutive numbers from zero to VF. 
- for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) - Indices.push_back( - ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); - Constant *StartOffset = ConstantVector::get(Indices); - - Value *GEP = Builder.CreateGEP( - ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul( - StartOffset, - Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), - "vector.gep")); - VectorLoopValueMap.setVectorValue(P, Part, GEP); + return; } + assert(isa<SCEVConstant>(II.getStep()) && + "Induction step not a SCEV constant!"); + Type *PhiType = II.getStep()->getType(); + + // Build a pointer phi + Value *ScalarStartValue = II.getStartValue(); + Type *ScStValueType = ScalarStartValue->getType(); + PHINode *NewPointerPhi = + PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); + NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); + + // A pointer induction, performed by using a gep + BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); + Instruction *InductionLoc = LoopLatch->getTerminator(); + const SCEV *ScalarStep = II.getStep(); + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + Value *ScalarStepValue = + Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); + Value *InductionGEP = GetElementPtrInst::Create( + ScStValueType->getPointerElementType(), NewPointerPhi, + Builder.CreateMul( + ScalarStepValue, + ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), + "ptr.ind", InductionLoc); + NewPointerPhi->addIncoming(InductionGEP, LoopLatch); + + // Create UF many actual address geps that use the pointer + // phi as base and a vectorized version of the step value + // (<step*0, ..., step*N>) as offset. + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Constant *, 8> Indices; + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + Indices.push_back( + ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); + Constant *StartOffset = ConstantVector::get(Indices); + + Value *GEP = Builder.CreateGEP( + ScStValueType->getPointerElementType(), NewPointerPhi, + Builder.CreateMul( + StartOffset, + Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), + "vector.gep")); + VectorLoopValueMap.setVectorValue(P, Part, GEP); + } } } } @@ -4765,8 +4765,8 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, - VPUser &User, +void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, + VPUser &User, VPTransformState &State) { switch (I.getOpcode()) { case Instruction::Call: @@ -4808,7 +4808,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, VecOp->copyIRFlags(&I); // Use this vector value for all users of the original instruction. - State.set(Def, &I, V, Part); + State.set(Def, &I, V, Part); addMetadata(V, &I); } @@ -4832,7 +4832,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, } else { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } - State.set(Def, &I, C, Part); + State.set(Def, &I, C, Part); addMetadata(C, &I); } @@ -4856,12 +4856,12 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, /// Vectorize casts. Type *DestTy = - (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); + (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(Def, &I, Cast, Part); + State.set(Def, &I, Cast, Part); addMetadata(Cast, &I); } break; @@ -4873,8 +4873,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, } // end of switch. } -void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, - VPUser &ArgOperands, +void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, + VPUser &ArgOperands, VPTransformState &State) { assert(!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); @@ -4885,7 +4885,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, SmallVector<Type *, 4> Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4893,13 +4893,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize = false; - InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); - InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; - bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; + InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; + bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; assert((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."); - assert(IntrinsicCost.isValid() && CallCost.isValid() && - "Cannot have invalid costs while widening"); + assert(IntrinsicCost.isValid() && CallCost.isValid() && + "Cannot have invalid costs while widening"); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; @@ -4918,15 +4918,15 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, if (UseVectorIntrinsic) { // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; - if (VF.isVector()) { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - } + if (VF.isVector()) { + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + } VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { // Use vector version of the function call. 
- const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); + const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); #ifndef NDEBUG assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && "Can't create vector function."); @@ -4940,12 +4940,12 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, if (isa<FPMathOperator>(V)) V->copyFastMathFlags(CI); - State.set(Def, &I, V, Part); + State.set(Def, &I, V, Part); addMetadata(V, &I); } } -void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, +void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State) { @@ -4964,16 +4964,16 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, Value *Op0 = State.get(Operands.getOperand(1), Part); Value *Op1 = State.get(Operands.getOperand(2), Part); Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - State.set(VPDef, &I, Sel, Part); + State.set(VPDef, &I, Sel, Part); addMetadata(Sel, &I); } } -void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. - assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); SmallSetVector<Instruction *, 8> Worklist; @@ -4982,7 +4982,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // accesses that will remain scalar. SmallSetVector<Instruction *, 8> ScalarPtrs; SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; - auto *Latch = TheLoop->getLoopLatch(); + auto *Latch = TheLoop->getLoopLatch(); // A helper that returns true if the use of Ptr by MemAccess will be scalar. // The pointer operands of loads and stores will be scalar as long as the @@ -5008,33 +5008,33 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { !TheLoop->isLoopInvariant(V); }; - auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { - if (!isa<PHINode>(Ptr) || - !Legal->getInductionVars().count(cast<PHINode>(Ptr))) - return false; - auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; - if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) - return false; - return isScalarUse(MemAccess, Ptr); - }; - - // A helper that evaluates a memory access's use of a pointer. If the - // pointer is actually the pointer induction of a loop, it is being - // inserted into Worklist. If the use will be a scalar use, and the - // pointer is only used by memory accesses, we place the pointer in - // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. + auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { + if (!isa<PHINode>(Ptr) || + !Legal->getInductionVars().count(cast<PHINode>(Ptr))) + return false; + auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; + if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) + return false; + return isScalarUse(MemAccess, Ptr); + }; + + // A helper that evaluates a memory access's use of a pointer. If the + // pointer is actually the pointer induction of a loop, it is being + // inserted into Worklist. 
If the use will be a scalar use, and the + // pointer is only used by memory accesses, we place the pointer in + // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - if (isScalarPtrInduction(MemAccess, Ptr)) { - Worklist.insert(cast<Instruction>(Ptr)); - Instruction *Update = cast<Instruction>( - cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); - Worklist.insert(Update); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr - << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update - << "\n"); - return; - } + if (isScalarPtrInduction(MemAccess, Ptr)) { + Worklist.insert(cast<Instruction>(Ptr)); + Instruction *Update = cast<Instruction>( + cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); + Worklist.insert(Update); + LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr + << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update + << "\n"); + return; + } // We only care about bitcast and getelementptr instructions contained in // the loop. if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -5058,9 +5058,9 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { }; // We seed the scalars analysis with three classes of instructions: (1) - // instructions marked uniform-after-vectorization and (2) bitcast, - // getelementptr and (pointer) phi instructions used by memory accesses - // requiring a scalar use. + // instructions marked uniform-after-vectorization and (2) bitcast, + // getelementptr and (pointer) phi instructions used by memory accesses + // requiring a scalar use. // // (1) Add to the worklist all instructions that have been identified as // uniform-after-vectorization. @@ -5156,8 +5156,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, - ElementCount VF) { +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, + ElementCount VF) { if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -5171,7 +5171,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, auto *Ty = getMemInstValueType(I); // We have already decided how to vectorize this instruction, get that // result. - if (VF.isVector()) { + if (VF.isVector()) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -5192,8 +5192,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, return false; } -bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( - Instruction *I, ElementCount VF) { +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( + Instruction *I, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); @@ -5204,7 +5204,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = getMemInstValueType(I); - if (hasIrregularType(ScalarTy, DL)) + if (hasIrregularType(ScalarTy, DL)) return false; // Check if masking is required. 
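The collectLoopScalars code above seeds its worklist with scalar pointer inductions and then sorts the remaining pointer operands into two buckets: ScalarPtrs for pointers whose uses are all memory accesses that keep a scalar address, and PossibleNonScalarPtrs for everything else (the real pass records this per use and reconciles the two sets afterwards). A rough, self-contained sketch of that bucketing over a toy data model; the struct and field names are invented:

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

// A toy use of a pointer: either the address operand of a memory access that
// keeps a scalar address, or some other kind of use.
struct PtrUse {
  bool IsMemAddress;  // used as the address of a load/store
  bool ScalarAddress; // that access was decided to keep a scalar address
};

struct PtrValue {
  std::string Name;
  std::vector<PtrUse> Uses;
};

int main() {
  std::vector<PtrValue> Ptrs = {
      {"gep.a", {{true, true}, {true, true}}},   // only scalar address uses
      {"gep.b", {{true, false}}},                // feeds a widened access
      {"gep.c", {{true, true}, {false, false}}}, // also escapes to arithmetic
  };

  std::set<std::string> ScalarPtrs, PossibleNonScalarPtrs;
  for (const PtrValue &P : Ptrs) {
    bool OnlyScalarMemUses =
        std::all_of(P.Uses.begin(), P.Uses.end(), [](const PtrUse &U) {
          return U.IsMemAddress && U.ScalarAddress;
        });
    (OnlyScalarMemUses ? ScalarPtrs : PossibleNonScalarPtrs).insert(P.Name);
  }

  for (const std::string &N : ScalarPtrs)
    std::cout << N << " -> ScalarPtrs\n";
  for (const std::string &N : PossibleNonScalarPtrs)
    std::cout << N << " -> PossibleNonScalarPtrs\n";
}
// --- end sketch ---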
@@ -5229,8 +5229,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( : TTI.isLegalMaskedStore(Ty, Alignment); } -bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( - Instruction *I, ElementCount VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( + Instruction *I, ElementCount VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast<LoadInst>(I); StoreInst *SI = dyn_cast<StoreInst>(I); @@ -5251,19 +5251,19 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); - if (hasIrregularType(ScalarTy, DL)) + if (hasIrregularType(ScalarTy, DL)) return false; return true; } -void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -5289,11 +5289,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // replicating region where only a single instance out of VF should be formed. // TODO: optimize such seldom cases if found important, see PR40816. auto addToWorklistIfAllowed = [&](Instruction *I) -> void { - if (isOutOfScope(I)) { - LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " - << *I << "\n"); - return; - } + if (isOutOfScope(I)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " + << *I << "\n"); + return; + } if (isScalarWithPredication(I, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); @@ -5310,71 +5310,71 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) addToWorklistIfAllowed(Cmp); - auto isUniformDecision = [&](Instruction *I, ElementCount VF) { + auto isUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); - // A uniform memory op is itself uniform. We exclude uniform stores - // here as they demand the last lane, not the first one. - if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { - assert(WideningDecision == CM_Scalarize); - return true; - } - + // A uniform memory op is itself uniform. We exclude uniform stores + // here as they demand the last lane, not the first one. + if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { + assert(WideningDecision == CM_Scalarize); + return true; + } + return (WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || WideningDecision == CM_Interleave); }; - - - // Returns true if Ptr is the pointer operand of a memory access instruction - // I, and I is known to not require scalarization. 
- auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); - }; - - // Holds a list of values which are known to have at least one uniform use. - // Note that there may be other uses which aren't uniform. A "uniform use" - // here is something which only demands lane 0 of the unrolled iterations; - // it does not imply that all lanes produce the same value (e.g. this is not - // the usual meaning of uniform) - SmallPtrSet<Value *, 8> HasUniformUse; - - // Scan the loop for instructions which are either a) known to have only - // lane 0 demanded or b) are uses which demand only lane 0 of their operand. + + + // Returns true if Ptr is the pointer operand of a memory access instruction + // I, and I is known to not require scalarization. + auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { + return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); + }; + + // Holds a list of values which are known to have at least one uniform use. + // Note that there may be other uses which aren't uniform. A "uniform use" + // here is something which only demands lane 0 of the unrolled iterations; + // it does not imply that all lanes produce the same value (e.g. this is not + // the usual meaning of uniform) + SmallPtrSet<Value *, 8> HasUniformUse; + + // Scan the loop for instructions which are either a) known to have only + // lane 0 demanded or b) are uses which demand only lane 0 of their operand. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { // If there's no pointer operand, there's nothing to do. - auto *Ptr = getLoadStorePointerOperand(&I); + auto *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) continue; - // A uniform memory op is itself uniform. We exclude uniform stores - // here as they demand the last lane, not the first one. - if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) - addToWorklistIfAllowed(&I); + // A uniform memory op is itself uniform. We exclude uniform stores + // here as they demand the last lane, not the first one. + if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) + addToWorklistIfAllowed(&I); - if (isUniformDecision(&I, VF)) { - assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); - HasUniformUse.insert(Ptr); - } + if (isUniformDecision(&I, VF)) { + assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); + HasUniformUse.insert(Ptr); + } } - // Add to the worklist any operands which have *only* uniform (e.g. lane 0 - // demanding) users. Since loops are assumed to be in LCSSA form, this - // disallows uses outside the loop as well. - for (auto *V : HasUniformUse) { - if (isOutOfScope(V)) - continue; - auto *I = cast<Instruction>(V); - auto UsersAreMemAccesses = - llvm::all_of(I->users(), [&](User *U) -> bool { - return isVectorizedMemAccessUse(cast<Instruction>(U), V); - }); - if (UsersAreMemAccesses) - addToWorklistIfAllowed(I); - } + // Add to the worklist any operands which have *only* uniform (e.g. lane 0 + // demanding) users. Since loops are assumed to be in LCSSA form, this + // disallows uses outside the loop as well. 
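The re-indented lines above build HasUniformUse, the set of pointers that have at least one widened memory access demanding only lane 0 of the address; the loop re-added just below then promotes such a value to uniform when every one of its users is a vectorized memory access using it as the address. A compilable toy version of that "all users are vectorized address uses" filter (the data model is invented, not VPlan or LLVM IR):

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// A toy user of an address value: true if it is a memory access the cost
// model already decided to widen, with this value as its pointer operand.
struct AddrUser {
  bool IsWidenedMemAccessOfThisPtr;
};

struct UniformCandidate {
  std::string Name;
  std::vector<AddrUser> Users;
};

int main() {
  std::vector<UniformCandidate> HasUniformUse = {
      {"%base",   {{true}, {true}}},  // every user is a widened access
      {"%offset", {{true}, {false}}}, // also feeds per-lane arithmetic
  };

  for (const UniformCandidate &C : HasUniformUse) {
    // Mirrors the UsersAreMemAccesses check above.
    bool AllAddressUses =
        std::all_of(C.Users.begin(), C.Users.end(), [](const AddrUser &U) {
          return U.IsWidenedMemAccessOfThisPtr;
        });
    std::cout << C.Name
              << (AllAddressUses ? " -> uniform worklist\n" : " -> stays per-lane\n");
  }
}
// --- end sketch ---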
+ for (auto *V : HasUniformUse) { + if (isOutOfScope(V)) + continue; + auto *I = cast<Instruction>(V); + auto UsersAreMemAccesses = + llvm::all_of(I->users(), [&](User *U) -> bool { + return isVectorizedMemAccessUse(cast<Instruction>(U), V); + }); + if (UsersAreMemAccesses) + addToWorklistIfAllowed(I); + } // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures @@ -5397,7 +5397,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { auto *OI = cast<Instruction>(OV); if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); - return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); + return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); })) addToWorklistIfAllowed(OI); } @@ -5475,8 +5475,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -Optional<ElementCount> -LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { +Optional<ElementCount> +LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -5498,9 +5498,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF); - case CM_ScalarEpilogueNotAllowedUsePredicate: - LLVM_FALLTHROUGH; + return computeFeasibleMaxVF(TC, UserVF); + case CM_ScalarEpilogueNotAllowedUsePredicate: + LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" @@ -5521,26 +5521,26 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // for size. if (runtimeChecksRequired()) return None; - + break; } - // The only loops we can vectorize without a scalar epilogue, are loops with - // a bottom-test and a single exiting block. We'd have to handle the fact - // that not every instruction executes on the last iteration. This will - // require a lane mask which varies through the vector loop body. (TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF); - } - return None; - } - + // The only loops we can vectorize without a scalar epilogue, are loops with + // a bottom-test and a single exiting block. We'd have to handle the fact + // that not every instruction executes on the last iteration. This will + // require a lane mask which varies through the vector loop body. (TODO) + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. 
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return computeFeasibleMaxVF(TC, UserVF); + } + return None; + } + // Now try the tail folding // Invalidate interleave groups that require an epilogue if we can't mask @@ -5553,22 +5553,22 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); - assert(!MaxVF.isScalable() && - "Scalable vectors do not yet support tail folding"); - assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && - "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = - UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); - // Avoid tail folding if the trip count is known to be a multiple of any VF we - // chose. - ScalarEvolution *SE = PSE.getSE(); - const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - const SCEV *ExitCount = SE->getAddExpr( - BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const SCEV *Rem = SE->getURemExpr( - ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); - if (Rem->isZero()) { + ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); + assert(!MaxVF.isScalable() && + "Scalable vectors do not yet support tail folding"); + assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && + "MaxVF must be a power of 2"); + unsigned MaxVFtimesIC = + UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); + // Avoid tail folding if the trip count is known to be a multiple of any VF we + // chose. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *Rem = SE->getURemExpr( + ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + if (Rem->isZero()) { // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; @@ -5583,20 +5583,20 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return MaxVF; } - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; - } - - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); - return None; - } - + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. 
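The lines above decide whether any tail folding is needed at all: MaxVFtimesIC is the number of scalar iterations covered by one unrolled vector iteration, and SCEV is asked whether the exit count (backedge-taken count plus one) is an exact multiple of it; a zero remainder means no tail remains. The same arithmetic on plain integers, assuming a compile-time-known trip count (the real check works symbolically through ScalarEvolution):

// --- illustrative sketch, not part of the commit ---
#include <iostream>

// No tail remains if the trip count is a multiple of VF * IC.
bool noTailRemains(unsigned TripCount, unsigned MaxVF, unsigned UserIC) {
  unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  return TripCount % MaxVFtimesIC == 0;
}

int main() {
  // Trip count 128, VF 8, IC 2: 128 % 16 == 0, no tail, no folding needed.
  std::cout << noTailRemains(128, 8, 2) << "\n"; // 1
  // Trip count 100, VF 8, IC 2: 100 % 16 != 0, a tail remains.
  std::cout << noTailRemains(100, 8, 2) << "\n"; // 0
}
// --- end sketch ---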
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return MaxVF; + } + + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); + return None; + } + if (TC == 0) { reportVectorizationFailure( "Unable to calculate the loop count due to complex control flow", @@ -5614,33 +5614,33 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return None; } -ElementCount -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { - bool IgnoreScalableUserVF = UserVF.isScalable() && - !TTI.supportsScalableVectors() && - !ForceTargetSupportsScalableVectors; - if (IgnoreScalableUserVF) { - LLVM_DEBUG( - dbgs() << "LV: Ignoring VF=" << UserVF - << " because target does not support scalable vectors.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Ignoring VF=" << ore::NV("UserVF", UserVF) - << " because target does not support scalable vectors."; - }); - } - - // Beyond this point two scenarios are handled. If UserVF isn't specified - // then a suitable VF is chosen. If UserVF is specified and there are - // dependencies, check if it's legal. However, if a UserVF is specified and - // there are no dependencies, then there's nothing to do. - if (UserVF.isNonZero() && !IgnoreScalableUserVF && - Legal->isSafeForAnyVectorWidth()) - return UserVF; - +ElementCount +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF) { + bool IgnoreScalableUserVF = UserVF.isScalable() && + !TTI.supportsScalableVectors() && + !ForceTargetSupportsScalableVectors; + if (IgnoreScalableUserVF) { + LLVM_DEBUG( + dbgs() << "LV: Ignoring VF=" << UserVF + << " because target does not support scalable vectors.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Ignoring VF=" << ore::NV("UserVF", UserVF) + << " because target does not support scalable vectors."; + }); + } + + // Beyond this point two scenarios are handled. If UserVF isn't specified + // then a suitable VF is chosen. If UserVF is specified and there are + // dependencies, check if it's legal. However, if a UserVF is specified and + // there are no dependencies, then there's nothing to do. + if (UserVF.isNonZero() && !IgnoreScalableUserVF && + Legal->isSafeForAnyVectorWidth()) + return UserVF; + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5650,63 +5650,63 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest // dependence distance). - unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); - - // If the user vectorization factor is legally unsafe, clamp it to a safe - // value. Otherwise, return as is. 
- if (UserVF.isNonZero() && !IgnoreScalableUserVF) { - unsigned MaxSafeElements = - PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); - ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); - - if (UserVF.isScalable()) { - Optional<unsigned> MaxVScale = TTI.getMaxVScale(); - - // Scale VF by vscale before checking if it's safe. - MaxSafeVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); - - if (MaxSafeVF.isZero()) { - // The dependence distance is too small to use scalable vectors, - // fallback on fixed. - LLVM_DEBUG( - dbgs() - << "LV: Max legal vector width too small, scalable vectorization " - "unfeasible. Using fixed-width vectorization instead.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Max legal vector width too small, scalable vectorization " - << "unfeasible. Using fixed-width vectorization instead."; - }); - return computeFeasibleMaxVF( - ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); - } - } - - LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); - - if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) - return UserVF; - - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe, clamping to max safe VF=" << MaxSafeVF - << ".\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe, clamping to maximum safe vectorization factor " - << ore::NV("VectorizationFactor", MaxSafeVF); - }); - return MaxSafeVF; - } - - WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); - + unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); + + // If the user vectorization factor is legally unsafe, clamp it to a safe + // value. Otherwise, return as is. + if (UserVF.isNonZero() && !IgnoreScalableUserVF) { + unsigned MaxSafeElements = + PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); + ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); + + if (UserVF.isScalable()) { + Optional<unsigned> MaxVScale = TTI.getMaxVScale(); + + // Scale VF by vscale before checking if it's safe. + MaxSafeVF = ElementCount::getScalable( + MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); + + if (MaxSafeVF.isZero()) { + // The dependence distance is too small to use scalable vectors, + // fallback on fixed. + LLVM_DEBUG( + dbgs() + << "LV: Max legal vector width too small, scalable vectorization " + "unfeasible. Using fixed-width vectorization instead.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Max legal vector width too small, scalable vectorization " + << "unfeasible. 
Using fixed-width vectorization instead."; + }); + return computeFeasibleMaxVF( + ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); + } + } + + LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); + + if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) + return UserVF; + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" << MaxSafeVF + << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeVF); + }); + return MaxSafeVF; + } + + WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); + // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); @@ -5716,13 +5716,13 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister << " bits.\n"); - assert(MaxVectorSize <= WidestRegister && - "Did not expect to pack so many elements" - " into one vector!"); + assert(MaxVectorSize <= WidestRegister && + "Did not expect to pack so many elements" + " into one vector!"); if (MaxVectorSize == 0) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; - return ElementCount::getFixed(MaxVectorSize); + return ElementCount::getFixed(MaxVectorSize); } else if (ConstTripCount && ConstTripCount < MaxVectorSize && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in @@ -5730,7 +5730,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); MaxVectorSize = ConstTripCount; - return ElementCount::getFixed(MaxVectorSize); + return ElementCount::getFixed(MaxVectorSize); } unsigned MaxVF = MaxVectorSize; @@ -5738,10 +5738,10 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). - SmallVector<ElementCount, 8> VFs; + SmallVector<ElementCount, 8> VFs; unsigned NewMaxVectorSize = WidestRegister / SmallestType; for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(ElementCount::getFixed(VS)); + VFs.push_back(ElementCount::getFixed(VS)); // For each VF calculate its register usage. 
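computeFeasibleMaxVF, restored across the hunks above, works from two limits: the maximum safe vector width in bits implied by memory dependences and the widest register the target provides. Both are divided by the widest element type and rounded down to a power of two; a fixed user-requested VF is clamped to the safe element count, and the resulting MaxVF can be clamped further to a small power-of-two constant trip count. A sketch of that arithmetic with invented numbers (the real code returns from whichever branch applies; both results are computed here just to show the formulas):

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>

// Round down to a power of two (what PowerOf2Floor does in LLVM).
unsigned powerOf2Floor(unsigned X) {
  if (X == 0) return 0;
  unsigned P = 1;
  while (P <= X / 2)
    P *= 2;
  return P;
}

int main() {
  unsigned WidestTypeBits = 32;            // widest element type in the loop
  unsigned MaxSafeVectorWidthInBits = 384; // from the dependence analysis
  unsigned WidestRegisterBits = 256;       // e.g. a 256-bit SIMD target
  unsigned UserVF = 16;                    // user asked for VF=16
  unsigned ConstTripCount = 4;             // small, known trip count

  // Clamp an explicit (fixed-width) user VF to the largest safe element count.
  unsigned MaxSafeElements = powerOf2Floor(MaxSafeVectorWidthInBits / WidestTypeBits);
  unsigned ClampedUserVF = std::min(UserVF, MaxSafeElements); // 16 -> 8

  // Otherwise derive MaxVF from the widest register, also capped by safety.
  unsigned WidestRegister = std::min(WidestRegisterBits, MaxSafeVectorWidthInBits);
  unsigned MaxVectorSize = powerOf2Floor(WidestRegister / WidestTypeBits); // 8

  // A small power-of-two constant trip count caps the VF as well.
  bool TCIsPow2 = ConstTripCount && (ConstTripCount & (ConstTripCount - 1)) == 0;
  if (TCIsPow2 && ConstTripCount < MaxVectorSize)
    MaxVectorSize = ConstTripCount;

  std::cout << "clamped user VF = " << ClampedUserVF
            << ", feasible MaxVF = " << MaxVectorSize << "\n"; // 8 and 4
}
// --- end sketch ---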
auto RUs = calculateRegisterUsage(VFs); @@ -5756,7 +5756,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, Selected = false; } if (Selected) { - MaxVF = VFs[i].getKnownMinValue(); + MaxVF = VFs[i].getKnownMinValue(); break; } } @@ -5768,39 +5768,39 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, } } } - return ElementCount::getFixed(MaxVF); + return ElementCount::getFixed(MaxVF); } VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. - assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); - - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; - LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); - assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); - +LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { + // FIXME: This can be fixed for scalable vectors later, because at this stage + // the LoopVectorizer will only consider vectorizing a loop with scalable + // vectors when the loop has a hint to enable vectorization for a given VF. + assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); + + InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); + assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); + unsigned Width = 1; - const float ScalarCost = *ExpectedCost.getValue(); - float Cost = ScalarCost; + const float ScalarCost = *ExpectedCost.getValue(); + float Cost = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && MaxVF.isVector()) { + if (ForceVectorization && MaxVF.isVector()) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. Cost = std::numeric_limits<float>::max(); } - for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { + for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. - VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); - assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); - float VectorCost = *C.first.getValue() / (float)i; + VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); + assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); + float VectorCost = *C.first.getValue() / (float)i; LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); if (!C.second && !ForceVectorization) { @@ -5809,13 +5809,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { << " because it will not generate any vector instructions.\n"); continue; } - - // If profitable add it to ProfitableVF list. - if (VectorCost < ScalarCost) { - ProfitableVFs.push_back(VectorizationFactor( - {ElementCount::getFixed(i), (unsigned)VectorCost})); - } - + + // If profitable add it to ProfitableVF list. 
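selectVectorizationFactor, restored above, scores each candidate VF by its expected cost divided by the width, i.e. cost per scalar iteration, and keeps the cheapest one; as the lines just below show, every VF that beats the scalar cost is also remembered in ProfitableVFs for epilogue vectorization. A freestanding sketch with a made-up cost table standing in for expectedCost():

// --- illustrative sketch, not part of the commit ---
#include <iostream>
#include <limits>
#include <map>
#include <vector>

int main() {
  // Hypothetical expectedCost() results per VF (total cost of one vector
  // iteration); VF=1 is the scalar loop.
  std::map<unsigned, float> ExpectedCost = {{1, 20}, {2, 24}, {4, 30}, {8, 64}};
  bool ForceVectorization = false;

  float ScalarCost = ExpectedCost[1];
  float Best = ForceVectorization ? std::numeric_limits<float>::max() : ScalarCost;
  unsigned Width = 1;
  std::vector<unsigned> ProfitableVFs;

  for (unsigned VF = 2; VF <= 8; VF *= 2) {
    // Divide by VF: the vector loop executes VF times fewer iterations.
    float PerLane = ExpectedCost[VF] / static_cast<float>(VF);
    if (PerLane < ScalarCost)
      ProfitableVFs.push_back(VF); // kept for epilogue vectorization later
    if (PerLane < Best) {
      Best = PerLane;
      Width = VF;
    }
  }

  std::cout << "selected VF = " << Width << " (cost/lane " << Best << ")\n";
}
// --- end sketch ---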
+ if (VectorCost < ScalarCost) { + ProfitableVFs.push_back(VectorizationFactor( + {ElementCount::getFixed(i), (unsigned)VectorCost})); + } + if (VectorCost < Cost) { Cost = VectorCost; Width = i; @@ -5834,131 +5834,131 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {ElementCount::getFixed(Width), - (unsigned)(Width * Cost)}; + VectorizationFactor Factor = {ElementCount::getFixed(Width), + (unsigned)(Width * Cost)}; return Factor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( - const Loop &L, ElementCount VF) const { - // Cross iteration phis such as reductions need special handling and are - // currently unsupported. - if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { - return Legal->isFirstOrderRecurrence(&Phi) || - Legal->isReductionVariable(&Phi); - })) - return false; - - // Phis with uses outside of the loop require special handling and are - // currently unsupported. - for (auto &Entry : Legal->getInductionVars()) { - // Look for uses of the value of the induction at the last iteration. - Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); - for (User *U : PostInc->users()) - if (!L.contains(cast<Instruction>(U))) - return false; - // Look for uses of penultimate value of the induction. - for (User *U : Entry.first->users()) - if (!L.contains(cast<Instruction>(U))) - return false; - } - - // Induction variables that are widened require special handling that is - // currently not supported. - if (any_of(Legal->getInductionVars(), [&](auto &Entry) { - return !(this->isScalarAfterVectorization(Entry.first, VF) || - this->isProfitableToScalarize(Entry.first, VF)); - })) - return false; - - return true; -} - -bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( - const ElementCount VF) const { - // FIXME: We need a much better cost-model to take different parameters such - // as register pressure, code size increase and cost of extra branches into - // account. For now we apply a very crude heuristic and only consider loops - // with vectorization factors larger than a certain value. - // We also consider epilogue vectorization unprofitable for targets that don't - // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) - return false; - if (VF.getFixedValue() >= EpilogueVectorizationMinVF) - return true; - return false; -} - -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { - VectorizationFactor Result = VectorizationFactor::Disabled(); - if (!EnableEpilogueVectorization) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); - return Result; - } - - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); - return Result; - } - - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. 
- if (MainLoopVF.isScalable()) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " - "yet supported.\n"); - return Result; - } - - // Not really a cost consideration, but check for unsupported cases here to - // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because the loop is " - "not a supported candidate.\n";); - return Result; - } - - if (EpilogueVectorizationForceVF > 1) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); - if (LVP.hasPlanWithVFs( - {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) - return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; - else { - LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization forced factor is not viable.\n";); - return Result; - } - } - - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { - LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization skipped due to opt for size.\n";); - return Result; - } - - if (!isEpilogueVectorizationProfitable(MainLoopVF)) - return Result; - - for (auto &NextVF : ProfitableVFs) - if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && - (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && - LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) - Result = NextVF; - - if (Result != VectorizationFactor::Disabled()) - LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " - << Result.Width.getFixedValue() << "\n";); - return Result; -} - +bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( + const Loop &L, ElementCount VF) const { + // Cross iteration phis such as reductions need special handling and are + // currently unsupported. + if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { + return Legal->isFirstOrderRecurrence(&Phi) || + Legal->isReductionVariable(&Phi); + })) + return false; + + // Phis with uses outside of the loop require special handling and are + // currently unsupported. + for (auto &Entry : Legal->getInductionVars()) { + // Look for uses of the value of the induction at the last iteration. + Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); + for (User *U : PostInc->users()) + if (!L.contains(cast<Instruction>(U))) + return false; + // Look for uses of penultimate value of the induction. + for (User *U : Entry.first->users()) + if (!L.contains(cast<Instruction>(U))) + return false; + } + + // Induction variables that are widened require special handling that is + // currently not supported. + if (any_of(Legal->getInductionVars(), [&](auto &Entry) { + return !(this->isScalarAfterVectorization(Entry.first, VF) || + this->isProfitableToScalarize(Entry.first, VF)); + })) + return false; + + return true; +} + +bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( + const ElementCount VF) const { + // FIXME: We need a much better cost-model to take different parameters such + // as register pressure, code size increase and cost of extra branches into + // account. For now we apply a very crude heuristic and only consider loops + // with vectorization factors larger than a certain value. + // We also consider epilogue vectorization unprofitable for targets that don't + // consider interleaving beneficial (eg. MVE). 
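selectEpilogueVectorizationFactor, removed above and re-added with restored authorship just below, first rules out the unsupported cases (feature disabled, no scalar epilogue allowed, scalable main VF, opt-for-size, unprofitable main VF) and then scans ProfitableVFs for the cheapest factor strictly smaller than the main loop VF; the real code additionally requires that a VPlan exists for the {main VF, epilogue VF} pair, which this sketch omits. With invented width/cost pairs:

// --- illustrative sketch, not part of the commit ---
#include <iostream>
#include <vector>

struct VFCandidate {
  unsigned Width; // fixed vectorization factor
  unsigned Cost;  // estimated cost at that factor
};

int main() {
  unsigned MainLoopVF = 8;
  // Hypothetical ProfitableVFs gathered while picking the main VF.
  std::vector<VFCandidate> ProfitableVFs = {{2, 18}, {4, 15}, {8, 14}};

  VFCandidate Result{1, 0}; // width 1 means "disabled"
  for (const VFCandidate &Next : ProfitableVFs)
    if (Next.Width < MainLoopVF &&                 // must be narrower than the main loop
        (Result.Width == 1 || Next.Cost < Result.Cost))
      Result = Next;                               // keep the cheapest narrower VF

  if (Result.Width > 1)
    std::cout << "epilogue VF = " << Result.Width << "\n"; // 4
  else
    std::cout << "no epilogue vectorization\n";
}
// --- end sketch ---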
+ if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) + return false; + if (VF.getFixedValue() >= EpilogueVectorizationMinVF) + return true; + return false; +} + +VectorizationFactor +LoopVectorizationCostModel::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { + VectorizationFactor Result = VectorizationFactor::Disabled(); + if (!EnableEpilogueVectorization) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); + return Result; + } + + if (!isScalarEpilogueAllowed()) { + LLVM_DEBUG( + dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " + "allowed.\n";); + return Result; + } + + // FIXME: This can be fixed for scalable vectors later, because at this stage + // the LoopVectorizer will only consider vectorizing a loop with scalable + // vectors when the loop has a hint to enable vectorization for a given VF. + if (MainLoopVF.isScalable()) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " + "yet supported.\n"); + return Result; + } + + // Not really a cost consideration, but check for unsupported cases here to + // simplify the logic. + if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { + LLVM_DEBUG( + dbgs() << "LEV: Unable to vectorize epilogue because the loop is " + "not a supported candidate.\n";); + return Result; + } + + if (EpilogueVectorizationForceVF > 1) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); + if (LVP.hasPlanWithVFs( + {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) + return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + else { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization forced factor is not viable.\n";); + return Result; + } + } + + if (TheLoop->getHeader()->getParent()->hasOptSize() || + TheLoop->getHeader()->getParent()->hasMinSize()) { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization skipped due to opt for size.\n";); + return Result; + } + + if (!isEpilogueVectorizationProfitable(MainLoopVF)) + return Result; + + for (auto &NextVF : ProfitableVFs) + if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && + LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) + Result = NextVF; + + if (Result != VectorizationFactor::Disabled()) + LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " + << Result.Width.getFixedValue() << "\n";); + return Result; +} + std::pair<unsigned, unsigned> LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; @@ -5985,11 +5985,11 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { if (!Legal->isReductionVariable(PN)) continue; RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; - if (PreferInLoopReductions || - TTI.preferInLoopReduction(RdxDesc.getOpcode(), - RdxDesc.getRecurrenceType(), - TargetTransformInfo::ReductionFlags())) - continue; + if (PreferInLoopReductions || + TTI.preferInLoopReduction(RdxDesc.getOpcode(), + RdxDesc.getRecurrenceType(), + TargetTransformInfo::ReductionFlags())) + continue; T = RdxDesc.getRecurrenceType(); } @@ -6020,7 +6020,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to 
expose ILP and reduce the loop overhead. @@ -6043,15 +6043,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); - const bool HasReductions = !Legal->getReductionVars().empty(); + auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + const bool HasReductions = !Legal->getReductionVars().empty(); // Do not interleave loops with a relatively small known or estimated trip - // count. But we will interleave when InterleaveSmallLoopScalarReduction is - // enabled, and the code has scalar reductions(HasReductions && VF = 1), - // because with the above conditions interleaving can expose ILP and break - // cross iteration dependences for reductions. - if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && - !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) + // count. But we will interleave when InterleaveSmallLoopScalarReduction is + // enabled, and the code has scalar reductions(HasReductions && VF = 1), + // because with the above conditions interleaving can expose ILP and break + // cross iteration dependences for reductions. + if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && + !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) return 1; RegisterUsage R = calculateRegisterUsage({VF})[0]; @@ -6079,7 +6079,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " << TTI.getRegisterClassName(pair.first) << " register class\n"); - if (VF.isScalar()) { + if (VF.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { @@ -6103,11 +6103,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = - TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); + unsigned MaxInterleaveCount = + TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); // Check if the user has overridden the max. - if (VF.isScalar()) { + if (VF.isScalar()) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { @@ -6116,47 +6116,47 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } // If trip count is known or estimated compile time constant, limit the - // interleave count to be less than the trip count divided by VF, provided it - // is at least 1. - // - // For scalable vectors we can't know if interleaving is beneficial. It may - // not be beneficial for small loops if none of the lanes in the second vector - // iterations is enabled. However, for larger loops, there is likely to be a - // similar benefit as for fixed-width vectors. For now, we choose to leave - // the InterleaveCount as if vscale is '1', although if some information about - // the vector is known (e.g. min vector size), we can make a better decision. + // interleave count to be less than the trip count divided by VF, provided it + // is at least 1. + // + // For scalable vectors we can't know if interleaving is beneficial. It may + // not be beneficial for small loops if none of the lanes in the second vector + // iterations is enabled. However, for larger loops, there is likely to be a + // similar benefit as for fixed-width vectors. 
For now, we choose to leave + // the InterleaveCount as if vscale is '1', although if some information about + // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { - MaxInterleaveCount = - std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); - // Make sure MaxInterleaveCount is greater than 0. - MaxInterleaveCount = std::max(1u, MaxInterleaveCount); + MaxInterleaveCount = + std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); + // Make sure MaxInterleaveCount is greater than 0. + MaxInterleaveCount = std::max(1u, MaxInterleaveCount); } - assert(MaxInterleaveCount > 0 && - "Maximum interleave count must be greater than 0"); + assert(MaxInterleaveCount > 0 && + "Maximum interleave count must be greater than 0"); // Clamp the calculated IC to be between the 1 and the max interleave count // that the target and trip count allows. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; - else - // Make sure IC is greater than 0. - IC = std::max(1u, IC); - - assert(IC > 0 && "Interleave count must be greater than 0."); - - // If we did not calculate the cost for VF (because the user selected the VF) - // then we calculate the cost of VF here. - if (LoopCost == 0) { - assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); - LoopCost = *expectedCost(VF).first.getValue(); - } - - assert(LoopCost && "Non-zero loop cost expected"); - + else + // Make sure IC is greater than 0. + IC = std::max(1u, IC); + + assert(IC > 0 && "Interleave count must be greater than 0."); + + // If we did not calculate the cost for VF (because the user selected the VF) + // then we calculate the cost of VF here. + if (LoopCost == 0) { + assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); + LoopCost = *expectedCost(VF).first.getValue(); + } + + assert(LoopCost && "Non-zero loop cost expected"); + // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF.isVector() && HasReductions) { + if (VF.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -6164,15 +6164,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // Note that if we've already vectorized the loop we will have done the // runtime check and so interleaving won't require further checks. bool InterleavingRequiresRuntimePointerCheck = - (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); + (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. 
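The interleave-count clamping above keeps IC inside sane bounds: when a trip count is known or estimated, the target's maximum interleave factor is reduced to at most the trip count divided by VF (but never below 1), and the register-pressure-derived IC is then clamped into [1, MaxInterleaveCount]. The same logic on plain integers, treating the VF as fixed-width (vscale = 1, as the comment above assumes):

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>

unsigned clampInterleaveCount(unsigned IC, unsigned MaxInterleaveCount,
                              unsigned BestKnownTC, unsigned VF) {
  if (BestKnownTC) {
    // Don't interleave past the trip count: at most TC / VF vector iterations.
    MaxInterleaveCount = std::min(BestKnownTC / VF, MaxInterleaveCount);
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount); // never zero
  }
  IC = std::min(IC, MaxInterleaveCount);
  return std::max(1u, IC); // IC itself must stay at least 1
}

int main() {
  // Registers would allow IC=8, the target allows 4, but with TC=12 and VF=4
  // there are only 3 vector iterations, so interleaving further is pointless.
  std::cout << clampInterleaveCount(/*IC=*/8, /*Max=*/4, /*TC=*/12, /*VF=*/4) << "\n"; // 3
  // Tiny trip count: TC / VF == 0, clamped back up to 1.
  std::cout << clampInterleaveCount(8, 4, /*TC=*/3, /*VF=*/4) << "\n"; // 1
}
// --- end sketch ---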
- LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' - << "LV: IC is " << IC << '\n' - << "LV: VF is " << VF << '\n'); - const bool AggressivelyInterleaveReductions = - TTI.enableAggressiveInterleaving(HasReductions); + LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' + << "LV: IC is " << IC << '\n' + << "LV: VF is " << VF << '\n'); + const bool AggressivelyInterleaveReductions = + TTI.enableAggressiveInterleaving(HasReductions); if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the @@ -6191,7 +6191,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. - if (HasReductions && TheLoop->getLoopDepth() > 1) { + if (HasReductions && TheLoop->getLoopDepth() > 1) { unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); SmallIC = std::min(SmallIC, F); StoresIC = std::min(StoresIC, F); @@ -6205,23 +6205,23 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, return std::max(StoresIC, LoadsIC); } - // If there are scalar reductions and TTI has enabled aggressive - // interleaving for reductions, we will interleave to expose ILP. - if (InterleaveSmallLoopScalarReduction && VF.isScalar() && - AggressivelyInterleaveReductions) { - LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); - // Interleave no less than SmallIC but not as aggressive as the normal IC - // to satisfy the rare situation when resources are too limited. - return std::max(IC / 2, SmallIC); - } else { - LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); - return SmallIC; - } + // If there are scalar reductions and TTI has enabled aggressive + // interleaving for reductions, we will interleave to expose ILP. + if (InterleaveSmallLoopScalarReduction && VF.isScalar() && + AggressivelyInterleaveReductions) { + LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); + // Interleave no less than SmallIC but not as aggressive as the normal IC + // to satisfy the rare situation when resources are too limited. + return std::max(IC / 2, SmallIC); + } else { + LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); + return SmallIC; + } } // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. - if (AggressivelyInterleaveReductions) { + if (AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; } @@ -6231,7 +6231,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { +LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. 
We scan the loop in a topological order in order and @@ -6309,11 +6309,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. - const auto &TTICapture = TTI; - auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { - if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) + const auto &TTICapture = TTI; + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { + if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0U; - return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); + return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -6337,7 +6337,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // Count the number of live intervals. SmallMapVector<unsigned, unsigned, 4> RegUsage; - if (VFs[j].isScalar()) { + if (VFs[j].isScalar()) { for (auto Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); if (RegUsage.find(ClassID) == RegUsage.end()) @@ -6366,7 +6366,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { } } } - + for (auto& pair : RegUsage) { if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); @@ -6384,12 +6384,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { for (unsigned i = 0, e = VFs.size(); i < e; ++i) { SmallMapVector<unsigned, unsigned, 4> Invariant; - + for (auto Inst : LoopInvariants) { - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); - unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); + unsigned Usage = + VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = + TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); if (Invariant.find(ClassID) == Invariant.end()) Invariant[ClassID] = Usage; else @@ -6437,13 +6437,13 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF.isScalar() || VF.isZero() || - InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.isScalar() || VF.isZero() || + InstsToScalarize.find(VF) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -6472,13 +6472,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { } int LoopVectorizationCostModel::computePredInstDiscount( - Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { + Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the // vector version cost the same. 
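calculateRegisterUsage, whose body appears above, estimates register pressure by numbering the loop's instructions in program order, recording each value's interval from definition to last in-loop use, and taking the largest number of intervals open at any point, scaled by how many registers one value of that type needs at the given VF. A generic max-overlap sketch over such intervals; the direct quadratic scan here stands in for the transpose-and-sweep bookkeeping the pass uses:

// --- illustrative sketch, not part of the commit ---
#include <algorithm>
#include <iostream>
#include <vector>

struct Interval { unsigned Start, End; }; // definition index and last-use index

// Maximum number of simultaneously live values: for each position, count the
// intervals open there (an interval closes at its last use).
unsigned maxLiveValues(const std::vector<Interval> &Live) {
  unsigned MaxEnd = 0;
  for (const Interval &I : Live) MaxEnd = std::max(MaxEnd, I.End);
  unsigned Best = 0;
  for (unsigned Pos = 0; Pos <= MaxEnd; ++Pos) {
    unsigned Open = 0;
    for (const Interval &I : Live)
      if (I.Start <= Pos && Pos < I.End)
        ++Open;
    Best = std::max(Best, Open);
  }
  return Best;
}

int main() {
  // Toy loop body: values defined at 0,1,2,3 with last uses at 4,3,5,6.
  std::vector<Interval> Live = {{0, 4}, {1, 3}, {2, 5}, {3, 6}};
  unsigned RegsPerValue = 2; // e.g. each widened value needs 2 vector registers
  std::cout << "max live values: " << maxLiveValues(Live)
            << ", estimated registers: " << maxLiveValues(Live) * RegsPerValue << "\n";
}
// --- end sketch ---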
- InstructionCost Discount = 0; + InstructionCost Discount = 0; // Holds instructions to analyze. The instructions we visit are mapped in // ScalarCosts. Those instructions are the ones that would be scalarized if @@ -6533,27 +6533,27 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the cost of the vector instruction. Note that this cost already // includes the scalarization overhead of the predicated instruction. - InstructionCost VectorCost = getInstructionCost(I, VF).first; + InstructionCost VectorCost = getInstructionCost(I, VF).first; // Compute the cost of the scalarized instruction. This cost is the cost of // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. - assert(!VF.isScalable() && "scalable vectors not yet supported."); - InstructionCost ScalarCost = - VF.getKnownMinValue() * - getInstructionCost(I, ElementCount::getFixed(1)).first; + assert(!VF.isScalable() && "scalable vectors not yet supported."); + InstructionCost ScalarCost = + VF.getKnownMinValue() * + getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); - assert(!VF.isScalable() && "scalable vectors not yet supported."); - ScalarCost += - VF.getKnownMinValue() * - TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); + APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + ScalarCost += + VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -6566,12 +6566,12 @@ int LoopVectorizationCostModel::computePredInstDiscount( "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + else if (needsExtract(J, VF)) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); - } + APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); + } } // Scale the total scalar cost by block probability. @@ -6583,11 +6583,11 @@ int LoopVectorizationCostModel::computePredInstDiscount( ScalarCosts[I] = ScalarCost; } - return *Discount.getValue(); + return *Discount.getValue(); } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(ElementCount VF) { +LoopVectorizationCostModel::expectedCost(ElementCount VF) { VectorizationCostTy Cost; // For each block. @@ -6597,15 +6597,15 @@ LoopVectorizationCostModel::expectedCost(ElementCount VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || - (VF.isVector() && VecValuesToIgnore.count(&I))) + if (ValuesToIgnore.count(&I) || + (VF.isVector() && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); // Check if we should override the cost. 
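computePredInstDiscount, shown above, weighs two ways of emitting an instruction from a predicated block: the widened vector cost (which already folds in predication overhead) against the scalarized cost, namely VF scalar copies plus insert/extract/phi overhead, scaled down by the probability that the block executes (the model uses a fixed one half, via getReciprocalPredBlockProb). A toy version of that comparison with invented cost inputs:

// --- illustrative sketch, not part of the commit ---
#include <iostream>

// The cost model assumes a predicated block runs on about 50% of iterations.
constexpr unsigned ReciprocalPredBlockProb = 2;

// Positive discount: scalarizing the predicated instruction is the cheaper option.
int predInstDiscount(unsigned VF, unsigned VectorCost, unsigned ScalarInstCost,
                     unsigned ScalarizationOverhead) {
  unsigned ScalarCost = VF * ScalarInstCost + ScalarizationOverhead;
  ScalarCost /= ReciprocalPredBlockProb; // only paid when the block executes
  return static_cast<int>(VectorCost) - static_cast<int>(ScalarCost);
}

int main() {
  // VF=4, predicated vector cost 20, scalar copies cost 2 each, plus 6 for
  // inserts/extracts/phis: scalar = (4*2 + 6)/2 = 7, discount = 13.
  std::cout << predInstDiscount(4, 20, 2, 6) << "\n";
}
// --- end sketch ---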
if (ForceTargetInstructionCost.getNumOccurrences() > 0) - C.first = InstructionCost(ForceTargetInstructionCost); + C.first = InstructionCost(ForceTargetInstructionCost); BlockCost.first += C.first; BlockCost.second |= C.second; @@ -6618,10 +6618,10 @@ LoopVectorizationCostModel::expectedCost(ElementCount VF) { // if-converted. This means that the block's instructions (aside from // stores and instructions that may divide by zero) will now be // unconditionally executed. For the scalar case, we may not always execute - // the predicated block, if it is an if-else block. Thus, scale the block's - // cost by the probability of executing it. blockNeedsPredication from - // Legal is used so as to not include all blocks in tail folded loops. - if (VF.isScalar() && Legal->blockNeedsPredication(BB)) + // the predicated block, if it is an if-else block. Thus, scale the block's + // cost by the probability of executing it. blockNeedsPredication from + // Legal is used so as to not include all blocks in tail folded loops. + if (VF.isScalar() && Legal->blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -6666,12 +6666,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { Legal->hasStride(I->getOperand(1)); } -InstructionCost -LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - ElementCount VF) { - assert(VF.isVector() && - "Scalarization cost of instruction implies vectorization."); - assert(!VF.isScalable() && "scalable vectors not yet supported."); +InstructionCost +LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + ElementCount VF) { + assert(VF.isVector() && + "Scalarization cost of instruction implies vectorization."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -6684,15 +6684,15 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - InstructionCost Cost = - VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + InstructionCost Cost = + VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF.getKnownMinValue() * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS, TTI::TCK_RecipThroughput); + Cost += VF.getKnownMinValue() * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS, TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. 
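getMemInstScalarizationCost, begun above, prices a memory access that will execute lane by lane: VF copies of the address computation plus VF scalar loads or stores, plus the insert/extract overhead of shuttling lanes between vector and scalar values (predicated accesses add further scaling and branch terms not shown here). A simplified version of the unconditional part, where a flat per-lane figure stands in for TTI's scalarization-overhead query:

// --- illustrative sketch, not part of the commit ---
#include <iostream>

unsigned scalarizedMemOpCost(unsigned VF, unsigned AddrComputeCost,
                             unsigned ScalarMemOpCost,
                             unsigned PerLaneInsertExtractCost) {
  unsigned Cost = VF * AddrComputeCost;  // one address computation per lane
  Cost += VF * ScalarMemOpCost;          // one scalar load/store per lane
  Cost += VF * PerLaneInsertExtractCost; // extract the address / insert the result
  return Cost;
}

int main() {
  // VF=4, address GEP ~1, scalar load ~4, insert/extract ~1 per lane: total 24.
  std::cout << scalarizedMemOpCost(4, 1, 4, 1) << "\n";
}
// --- end sketch ---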
@@ -6713,9 +6713,9 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, return Cost; } -InstructionCost -LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -6726,7 +6726,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); - InstructionCost Cost = 0; + InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); @@ -6740,11 +6740,11 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, return Cost; } -InstructionCost -LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, - ElementCount VF) { - assert(Legal->isUniformMemOp(*I)); - +InstructionCost +LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, + ElementCount VF) { + assert(Legal->isUniformMemOp(*I)); + Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6765,12 +6765,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF.getKnownMinValue() - 1)); + VF.getKnownMinValue() - 1)); } -InstructionCost -LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6782,9 +6782,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, TargetTransformInfo::TCK_RecipThroughput, I); } -InstructionCost -LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6793,8 +6793,8 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.isScalable() && "scalable vectors not yet supported."); - auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. @@ -6808,7 +6808,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, // Calculate the cost of the whole interleaved group. 
bool UseMaskForGaps = Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); - InstructionCost Cost = TTI.getInterleavedMemoryOpCost( + InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); @@ -6822,122 +6822,122 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, return Cost; } -InstructionCost LoopVectorizationCostModel::getReductionPatternCost( - Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { - // Early exit for no inloop reductions - if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) - return InstructionCost::getInvalid(); - auto *VectorTy = cast<VectorType>(Ty); - - // We are looking for a pattern of, and finding the minimal acceptable cost: - // reduce(mul(ext(A), ext(B))) or - // reduce(mul(A, B)) or - // reduce(ext(A)) or - // reduce(A). - // The basic idea is that we walk down the tree to do that, finding the root - // reduction instruction in InLoopReductionImmediateChains. From there we find - // the pattern of mul/ext and test the cost of the entire pattern vs the cost - // of the components. If the reduction cost is lower then we return it for the - // reduction instruction and 0 for the other instructions in the pattern. If - // it is not we return an invalid cost specifying the orignal cost method - // should be used. - Instruction *RetI = I; - if ((RetI->getOpcode() == Instruction::SExt || - RetI->getOpcode() == Instruction::ZExt)) { - if (!RetI->hasOneUser()) - return InstructionCost::getInvalid(); - RetI = RetI->user_back(); - } - if (RetI->getOpcode() == Instruction::Mul && - RetI->user_back()->getOpcode() == Instruction::Add) { - if (!RetI->hasOneUser()) - return InstructionCost::getInvalid(); - RetI = RetI->user_back(); - } - - // Test if the found instruction is a reduction, and if not return an invalid - // cost specifying the parent to use the original cost modelling. - if (!InLoopReductionImmediateChains.count(RetI)) - return InstructionCost::getInvalid(); - - // Find the reduction this chain is a part of and calculate the basic cost of - // the reduction on its own. - Instruction *LastChain = InLoopReductionImmediateChains[RetI]; - Instruction *ReductionPhi = LastChain; - while (!isa<PHINode>(ReductionPhi)) - ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; - - RecurrenceDescriptor RdxDesc = - Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; - unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), - VectorTy, false, CostKind); - - // Get the operand that was not the reduction chain and match it to one of the - // patterns, returning the better cost if it is found. - Instruction *RedOp = RetI->getOperand(1) == LastChain - ? 
dyn_cast<Instruction>(RetI->getOperand(0)) - : dyn_cast<Instruction>(RetI->getOperand(1)); - - VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); - - if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && - !TheLoop->isLoopInvariant(RedOp)) { - bool IsUnsigned = isa<ZExtInst>(RedOp); - auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, - CostKind); - - unsigned ExtCost = - TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, - TTI::CastContextHint::None, CostKind, RedOp); - if (RedCost.isValid() && RedCost < BaseCost + ExtCost) - return I == RetI ? *RedCost.getValue() : 0; - } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { - Instruction *Mul = RedOp; - Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); - Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); - if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && - Op0->getOpcode() == Op1->getOpcode() && - Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && - !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { - bool IsUnsigned = isa<ZExtInst>(Op0); - auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); - // reduce(mul(ext, ext)) - unsigned ExtCost = - TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, - TTI::CastContextHint::None, CostKind, Op0); - unsigned MulCost = - TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); - - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, - CostKind); - - if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) - return I == RetI ? *RedCost.getValue() : 0; - } else { - unsigned MulCost = - TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); - - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, - CostKind); - - if (RedCost.isValid() && RedCost < MulCost + BaseCost) - return I == RetI ? *RedCost.getValue() : 0; - } - } - - return I == RetI ? BaseCost : InstructionCost::getInvalid(); -} - -InstructionCost -LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getReductionPatternCost( + Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { + // Early exit for no inloop reductions + if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) + return InstructionCost::getInvalid(); + auto *VectorTy = cast<VectorType>(Ty); + + // We are looking for a pattern of, and finding the minimal acceptable cost: + // reduce(mul(ext(A), ext(B))) or + // reduce(mul(A, B)) or + // reduce(ext(A)) or + // reduce(A). + // The basic idea is that we walk down the tree to do that, finding the root + // reduction instruction in InLoopReductionImmediateChains. From there we find + // the pattern of mul/ext and test the cost of the entire pattern vs the cost + // of the components. If the reduction cost is lower then we return it for the + // reduction instruction and 0 for the other instructions in the pattern. If + // it is not we return an invalid cost specifying the orignal cost method + // should be used. 
+ Instruction *RetI = I; + if ((RetI->getOpcode() == Instruction::SExt || + RetI->getOpcode() == Instruction::ZExt)) { + if (!RetI->hasOneUser()) + return InstructionCost::getInvalid(); + RetI = RetI->user_back(); + } + if (RetI->getOpcode() == Instruction::Mul && + RetI->user_back()->getOpcode() == Instruction::Add) { + if (!RetI->hasOneUser()) + return InstructionCost::getInvalid(); + RetI = RetI->user_back(); + } + + // Test if the found instruction is a reduction, and if not return an invalid + // cost specifying the parent to use the original cost modelling. + if (!InLoopReductionImmediateChains.count(RetI)) + return InstructionCost::getInvalid(); + + // Find the reduction this chain is a part of and calculate the basic cost of + // the reduction on its own. + Instruction *LastChain = InLoopReductionImmediateChains[RetI]; + Instruction *ReductionPhi = LastChain; + while (!isa<PHINode>(ReductionPhi)) + ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; + + RecurrenceDescriptor RdxDesc = + Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), + VectorTy, false, CostKind); + + // Get the operand that was not the reduction chain and match it to one of the + // patterns, returning the better cost if it is found. + Instruction *RedOp = RetI->getOperand(1) == LastChain + ? dyn_cast<Instruction>(RetI->getOperand(0)) + : dyn_cast<Instruction>(RetI->getOperand(1)); + + VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); + + if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && + !TheLoop->isLoopInvariant(RedOp)) { + bool IsUnsigned = isa<ZExtInst>(RedOp); + auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + CostKind); + + unsigned ExtCost = + TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, RedOp); + if (RedCost.isValid() && RedCost < BaseCost + ExtCost) + return I == RetI ? *RedCost.getValue() : 0; + } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { + Instruction *Mul = RedOp; + Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); + Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); + if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && + Op0->getOpcode() == Op1->getOpcode() && + Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && + !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { + bool IsUnsigned = isa<ZExtInst>(Op0); + auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); + // reduce(mul(ext, ext)) + unsigned ExtCost = + TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, Op0); + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + CostKind); + + if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) + return I == RetI ? *RedCost.getValue() : 0; + } else { + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, + CostKind); + + if (RedCost.isValid() && RedCost < MulCost + BaseCost) + return I == RetI ? 
*RedCost.getValue() : 0; + } + } + + return I == RetI ? BaseCost : InstructionCost::getInvalid(); +} + +InstructionCost +LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, + ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. - if (VF.isScalar()) { + if (VF.isScalar()) { Type *ValTy = getMemInstValueType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6950,52 +6950,52 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::getInstructionCost(Instruction *I, - ElementCount VF) { +LoopVectorizationCostModel::getInstructionCost(Instruction *I, + ElementCount VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) - VF = ElementCount::getFixed(1); + VF = ElementCount::getFixed(1); - if (VF.isVector() && isProfitableToScalarize(I, VF)) + if (VF.isVector() && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. auto ForcedScalar = ForcedScalars.find(VF); - if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { + if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) - return VectorizationCostTy( - (getInstructionCost(I, ElementCount::getFixed(1)).first * - VF.getKnownMinValue()), - false); + return VectorizationCostTy( + (getInstructionCost(I, ElementCount::getFixed(1)).first * + VF.getKnownMinValue()), + false); } Type *VectorTy; - InstructionCost C = getInstructionCost(I, VF, VectorTy); + InstructionCost C = getInstructionCost(I, VF, VectorTy); bool TypeNotScalarized = - VF.isVector() && VectorTy->isVectorTy() && - TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); + VF.isVector() && VectorTy->isVectorTy() && + TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); return VectorizationCostTy(C, TypeNotScalarized); } -InstructionCost -LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - ElementCount VF) { +InstructionCost +LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, + ElementCount VF) { - assert(!VF.isScalable() && - "cannot compute scalarization overhead for scalable vectorization"); - if (VF.isScalar()) + assert(!VF.isScalable() && + "cannot compute scalarization overhead for scalable vectorization"); + if (VF.isScalar()) return 0; - InstructionCost Cost = 0; + InstructionCost Cost = 0; Type *RetTy = ToVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), - true, false); + cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), + true, false); // Some targets keep addresses scalar. if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) @@ -7012,11 +7012,11 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. 
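// Illustrative sketch (not from LoopVectorize.cpp): the getReductionPatternCost
// code above accepts TTI's fused extended multiply-add reduction cost only when
// it beats the sum of the separate component costs for a
// reduce(add(mul(sext(A), sext(B)))) chain. A standalone model with
// hypothetical per-pattern costs:
static bool exampleUseFusedMlaReduction() {
  int BaseCost = 4; // plain vector add-reduction
  int ExtCost = 1;  // one vector sign-extension
  int MulCost = 2;  // vector multiply
  int RedCost = 5;  // TTI's fused extended multiply-add reduction
  // Mirrors: RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost.
  // When this holds, the reduction instruction reports RedCost and the mul/ext
  // members of the matched pattern report a cost of 0.
  return RedCost < ExtCost * 2 + MulCost + BaseCost;
}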
return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); + filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { - if (VF.isScalar()) +void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { + if (VF.isScalar()) return; NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { @@ -7033,19 +7033,19 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) NumPredStores++; - if (Legal->isUniformMemOp(I)) { + if (Legal->isUniformMemOp(I)) { // TODO: Avoid replicating loads and stores instead of // relying on instcombine to remove them. // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract - InstructionCost Cost = getUniformMemOpCost(&I, VF); + InstructionCost Cost = getUniformMemOpCost(&I, VF); setWideningDecision(&I, VF, CM_Scalarize, Cost); continue; } // We assume that widening is the best solution when possible. if (memoryInstructionCanBeWidened(&I, VF)) { - InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); + InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); int ConsecutiveStride = Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && @@ -7057,7 +7057,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } // Choose between Interleaving, Gather/Scatter or Scalarization. - InstructionCost InterleaveCost = std::numeric_limits<int>::max(); + InstructionCost InterleaveCost = std::numeric_limits<int>::max(); unsigned NumAccesses = 1; if (isAccessInterleaved(&I)) { auto Group = getInterleavedAccessGroup(&I); @@ -7072,17 +7072,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { InterleaveCost = getInterleaveGroupCost(&I, VF); } - InstructionCost GatherScatterCost = + InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I) ? getGatherScatterCost(&I, VF) * NumAccesses - : std::numeric_limits<int>::max(); + : std::numeric_limits<int>::max(); - InstructionCost ScalarizationCost = + InstructionCost ScalarizationCost = getMemInstScalarizationCost(&I, VF) * NumAccesses; // Choose better solution for the current VF, // write down this decision and use it during vectorization. - InstructionCost Cost; + InstructionCost Cost; InstWidening Decision; if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost) { @@ -7126,7 +7126,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // Add all instructions used to generate the addresses. SmallVector<Instruction *, 4> Worklist; - append_range(Worklist, AddrDefs); + append_range(Worklist, AddrDefs); while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); for (auto &Op : I->operands()) @@ -7145,18 +7145,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. 
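// Illustrative sketch (not from LoopVectorize.cpp): for a non-consecutive,
// non-uniform access, setCostBasedWideningDecision above simply keeps the
// cheapest of the three strategies it priced. A standalone model with
// hypothetical costs (the enum and values are examples only):
enum class ExampleDecision { Interleave, GatherScatter, Scalarize };
static ExampleDecision examplePickMemDecision() {
  int InterleaveCost = 8;     // interleaved group cost, if the access is in one
  int GatherScatterCost = 12; // masked gather/scatter cost, if legal
  int ScalarizationCost = 10; // per-lane scalar loads/stores
  if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost)
    return ExampleDecision::Interleave;
  if (GatherScatterCost < ScalarizationCost)
    return ExampleDecision::GatherScatter;
  return ExampleDecision::Scalarize;
}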
- setWideningDecision( - I, VF, CM_Scalarize, - (VF.getKnownMinValue() * - getMemoryInstructionCost(I, ElementCount::getFixed(1)))); + setWideningDecision( + I, VF, CM_Scalarize, + (VF.getKnownMinValue() * + getMemoryInstructionCost(I, ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) - setWideningDecision( - Member, VF, CM_Scalarize, - (VF.getKnownMinValue() * - getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + setWideningDecision( + Member, VF, CM_Scalarize, + (VF.getKnownMinValue() * + getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } } else @@ -7166,9 +7166,9 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } } -InstructionCost -LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy) { +InstructionCost +LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, + Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); @@ -7190,22 +7190,22 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // blocks requires also an extract of its vector compare i1 element. bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); - if (VF.isVector() && BI->isConditional() && + if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *Vec_i1Ty = - VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), - false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * - VF.getKnownMinValue())); - } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), + false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * + VF.getKnownMinValue())); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); else @@ -7220,20 +7220,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) - return TTI.getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), - VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); + if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) + return TTI.getShuffleCost( + TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), + VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) 
are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. - if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), - CmpInst::BAD_ICMP_PREDICATE, CostKind); + CmpInst::BAD_ICMP_PREDICATE, CostKind); return TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -7245,19 +7245,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF.isVector() && isScalarWithPredication(I)) { - InstructionCost Cost = 0; + if (VF.isVector() && isScalarWithPredication(I)) { + InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF.getKnownMinValue() * - TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += VF.getKnownMinValue() * - TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -7286,13 +7286,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // Since we will replace the stride by 1 the multiplication should go away. if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) return 0; - - // Detect reduction patterns - InstructionCost RedCost; - if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) - .isValid()) - return RedCost; - + + // Detect reduction patterns + InstructionCost RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + .isValid()) + return RedCost; + // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); @@ -7303,15 +7303,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -7325,9 +7325,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) - CondTy = VectorType::get(CondTy, VF); + CondTy = VectorType::get(CondTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -7336,18 +7336,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, + CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::Store: case Instruction::Load: { - ElementCount Width = VF; - if (Width.isVector()) { + ElementCount Width = VF; + if (Width.isVector()) { InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); if (Decision == CM_Scalarize) - Width = ElementCount::getFixed(1); + Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); @@ -7364,62 +7364,62 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - // Computes the CastContextHint from a Load/Store instruction. - auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { - assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && - "Expected a load or a store!"); - - if (VF.isScalar() || !TheLoop->contains(I)) - return TTI::CastContextHint::Normal; - - switch (getWideningDecision(I, VF)) { - case LoopVectorizationCostModel::CM_GatherScatter: - return TTI::CastContextHint::GatherScatter; - case LoopVectorizationCostModel::CM_Interleave: - return TTI::CastContextHint::Interleave; - case LoopVectorizationCostModel::CM_Scalarize: - case LoopVectorizationCostModel::CM_Widen: - return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; - case LoopVectorizationCostModel::CM_Widen_Reverse: - return TTI::CastContextHint::Reversed; - case LoopVectorizationCostModel::CM_Unknown: - llvm_unreachable("Instr did not go through cost modelling?"); - } - - llvm_unreachable("Unhandled case!"); - }; - - unsigned Opcode = I->getOpcode(); - TTI::CastContextHint CCH = TTI::CastContextHint::None; - // For Trunc, the context is the only user, which must be a StoreInst. - if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { - if (I->hasOneUse()) - if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) - CCH = ComputeCCH(Store); - } - // For Z/Sext, the context is the operand, which must be a LoadInst. - else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || - Opcode == Instruction::FPExt) { - if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) - CCH = ComputeCCH(Load); - } - + // Computes the CastContextHint from a Load/Store instruction. 
+ auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected a load or a store!"); + + if (VF.isScalar() || !TheLoop->contains(I)) + return TTI::CastContextHint::Normal; + + switch (getWideningDecision(I, VF)) { + case LoopVectorizationCostModel::CM_GatherScatter: + return TTI::CastContextHint::GatherScatter; + case LoopVectorizationCostModel::CM_Interleave: + return TTI::CastContextHint::Interleave; + case LoopVectorizationCostModel::CM_Scalarize: + case LoopVectorizationCostModel::CM_Widen: + return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; + case LoopVectorizationCostModel::CM_Widen_Reverse: + return TTI::CastContextHint::Reversed; + case LoopVectorizationCostModel::CM_Unknown: + llvm_unreachable("Instr did not go through cost modelling?"); + } + + llvm_unreachable("Unhandled case!"); + }; + + unsigned Opcode = I->getOpcode(); + TTI::CastContextHint CCH = TTI::CastContextHint::None; + // For Trunc, the context is the only user, which must be a StoreInst. + if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { + if (I->hasOneUse()) + if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) + CCH = ComputeCCH(Store); + } + // For Z/Sext, the context is the operand, which must be a LoadInst. + else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || + Opcode == Instruction::FPExt) { + if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) + CCH = ComputeCCH(Load); + } + // We optimize the truncation of induction variables having constant // integer steps. The cost of these truncations is the same as the scalar // operation. if (isOptimizableIVTruncate(I, VF)) { auto *Trunc = cast<TruncInst>(I); return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), - Trunc->getSrcTy(), CCH, CostKind, Trunc); + Trunc->getSrcTy(), CCH, CostKind, Trunc); } - // Detect reduction patterns - InstructionCost RedCost; - if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) - .isValid()) - return RedCost; - + // Detect reduction patterns + InstructionCost RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + .isValid()) + return RedCost; + Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; @@ -7430,39 +7430,39 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // // Calculate the modified src and dest types. Type *MinVecTy = VectorTy; - if (Opcode == Instruction::Trunc) { + if (Opcode == Instruction::Trunc) { SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); - } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { + } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } } - assert(!VF.isScalable() && "VF is assumed to be non scalable"); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; - return N * - TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); + assert(!VF.isScalable() && "VF is assumed to be non scalable"); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; + return N * + TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); - if (getVectorIntrinsicIDForCall(CI, TLI)) { - InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); - return std::min(CallCost, IntrinsicCost); - } + InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); + if (getVectorIntrinsicIDForCall(CI, TLI)) { + InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); + return std::min(CallCost, IntrinsicCost); + } return CallCost; } - case Instruction::ExtractValue: - return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); + case Instruction::ExtractValue: + return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( - Instruction::Mul, VectorTy, CostKind) + + return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( + Instruction::Mul, VectorTy, CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -7515,7 +7515,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // detection. for (auto &Reduction : Legal->getReductionVars()) { RecurrenceDescriptor &RedDes = Reduction.second; - const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); + const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction @@ -7527,43 +7527,43 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } } -void LoopVectorizationCostModel::collectInLoopReductions() { - for (auto &Reduction : Legal->getReductionVars()) { - PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Reduction.second; - - // We don't collect reductions that are type promoted (yet). - if (RdxDesc.getRecurrenceType() != Phi->getType()) - continue; - - // If the target would prefer this reduction to happen "in-loop", then we - // want to record it as such. - unsigned Opcode = RdxDesc.getOpcode(); - if (!PreferInLoopReductions && - !TTI.preferInLoopReduction(Opcode, Phi->getType(), - TargetTransformInfo::ReductionFlags())) - continue; - - // Check that we can correctly put the reductions into the loop, by - // finding the chain of operations that leads from the phi to the loop - // exit value. - SmallVector<Instruction *, 4> ReductionOperations = - RdxDesc.getReductionOpChain(Phi, TheLoop); - bool InLoop = !ReductionOperations.empty(); - if (InLoop) { - InLoopReductionChains[Phi] = ReductionOperations; - // Add the elements to InLoopReductionImmediateChains for cost modelling. - Instruction *LastChain = Phi; - for (auto *I : ReductionOperations) { - InLoopReductionImmediateChains[I] = LastChain; - LastChain = I; - } - } - LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") - << " reduction for phi: " << *Phi << "\n"); - } -} - +void LoopVectorizationCostModel::collectInLoopReductions() { + for (auto &Reduction : Legal->getReductionVars()) { + PHINode *Phi = Reduction.first; + RecurrenceDescriptor &RdxDesc = Reduction.second; + + // We don't collect reductions that are type promoted (yet). 
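// Illustrative sketch (not from LoopVectorize.cpp): the chain bookkeeping in
// collectInLoopReductions records, for every operation of an in-loop
// reduction, its predecessor in the chain so getReductionPatternCost can walk
// back to the phi. A toy version over plain strings, assuming a phi feeding
// two adds (names are hypothetical):
#include <map>
#include <string>
#include <vector>

static std::map<std::string, std::string> exampleImmediateChains() {
  std::vector<std::string> ReductionOperations = {"add1", "add2"};
  std::map<std::string, std::string> ImmediateChains;
  std::string LastChain = "phi";
  for (const std::string &Op : ReductionOperations) {
    ImmediateChains[Op] = LastChain; // add1 -> phi, add2 -> add1
    LastChain = Op;
  }
  return ImmediateChains;
}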
+ if (RdxDesc.getRecurrenceType() != Phi->getType()) + continue; + + // If the target would prefer this reduction to happen "in-loop", then we + // want to record it as such. + unsigned Opcode = RdxDesc.getOpcode(); + if (!PreferInLoopReductions && + !TTI.preferInLoopReduction(Opcode, Phi->getType(), + TargetTransformInfo::ReductionFlags())) + continue; + + // Check that we can correctly put the reductions into the loop, by + // finding the chain of operations that leads from the phi to the loop + // exit value. + SmallVector<Instruction *, 4> ReductionOperations = + RdxDesc.getReductionOpChain(Phi, TheLoop); + bool InLoop = !ReductionOperations.empty(); + if (InLoop) { + InLoopReductionChains[Phi] = ReductionOperations; + // Add the elements to InLoopReductionImmediateChains for cost modelling. + Instruction *LastChain = Phi; + for (auto *I : ReductionOperations) { + InLoopReductionImmediateChains[I] = LastChain; + LastChain = I; + } + } + LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") + << " reduction for phi: " << *Phi << "\n"); + } +} + // TODO: we could return a pair of values that specify the max VF and // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment @@ -7577,40 +7577,40 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - assert(!UserVF.isScalable() && "scalable vectors not yet supported"); - ElementCount VF = UserVF; +LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { + assert(!UserVF.isScalable() && "scalable vectors not yet supported"); + ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in // the vectorization pipeline. - if (!OrigLoop->isInnermost()) { + if (!OrigLoop->isInnermost()) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. - if (UserVF.isZero()) { - VF = ElementCount::getFixed( - determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); + if (UserVF.isZero()) { + VF = ElementCount::getFixed( + determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. - if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { + if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " << "overriding computed VF.\n"); - VF = ElementCount::getFixed(4); + VF = ElementCount::getFixed(4); } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF.getKnownMinValue()) && - "VF needs to be a power of two"); - LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") - << "VF " << VF << " to build VPlans.\n"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") + << "VF " << VF << " to build VPlans.\n"); buildVPlans(VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. 
if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0 /*Cost*/}; } LLVM_DEBUG( @@ -7619,10 +7619,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { - assert(OrigLoop->isInnermost() && "Inner loop expected."); - Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); +Optional<VectorizationFactor> +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + assert(OrigLoop->isInnermost() && "Inner loop expected."); + Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -7640,55 +7640,55 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.invalidateCostModelingDecisions(); } - ElementCount MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF.isNonZero() && "MaxVF is zero."); - - bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); - if (!UserVF.isZero() && - (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { - // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable - // VFs here, this should be reverted to only use legal UserVFs once the - // loop below supports scalable VFs. - ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") - << " VF " << VF << ".\n"); - assert(isPowerOf2_32(VF.getKnownMinValue()) && - "VF needs to be a power of two"); + ElementCount MaxVF = MaybeMaxVF.getValue(); + assert(MaxVF.isNonZero() && "MaxVF is zero."); + + bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); + if (!UserVF.isZero() && + (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { + // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable + // VFs here, this should be reverted to only use legal UserVFs once the + // loop below supports scalable VFs. + ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; + LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") + << " VF " << VF << ".\n"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - CM.selectUserVectorizationFactor(VF); - CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(VF, VF); + CM.selectUserVectorizationFactor(VF); + CM.collectInLoopReductions(); + buildVPlansWithVPRecipes(VF, VF); LLVM_DEBUG(printPlans(dbgs())); - return {{VF, 0}}; + return {{VF, 0}}; } - assert(!MaxVF.isScalable() && - "Scalable vectors not yet supported beyond this point"); + assert(!MaxVF.isScalable() && + "Scalable vectors not yet supported beyond this point"); - for (ElementCount VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { + for (ElementCount VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(VF); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. 
- if (VF.isVector()) + if (VF.isVector()) CM.collectInstsToScalarize(VF); } - CM.collectInLoopReductions(); - - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); + CM.collectInLoopReductions(); + + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF.isScalar()) + if (MaxVF.isScalar()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; @@ -7707,23 +7707,23 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 1. Create a new empty loop. Unlink the old loop and connect the new one. VPCallbackILV CallbackILV(ILV); - assert(BestVF.hasValue() && "Vectorization Factor is missing"); - - VPTransformState State{*BestVF, - BestUF, - OrigLoop, - LI, - DT, - ILV.Builder, - ILV.VectorLoopValueMap, - &ILV, - CallbackILV}; + assert(BestVF.hasValue() && "Vectorization Factor is missing"); + + VPTransformState State{*BestVF, + BestUF, + OrigLoop, + LI, + DT, + ILV.Builder, + ILV.VectorLoopValueMap, + &ILV, + CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; - ILV.printDebugTracesAtStart(); - + ILV.printDebugTracesAtStart(); + //===------------------------------------------------===// // // Notice: any optimization or new instruction that go @@ -7739,48 +7739,48 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. ILV.fixVectorizedLoop(); - - ILV.printDebugTracesAtEnd(); + + ILV.printDebugTracesAtEnd(); } void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl<Instruction *> &DeadInstructions) { - // We create new control-flow for the vectorized loop, so the original exit - // conditions will be dead after vectorization if it's only used by the - // terminator - SmallVector<BasicBlock*> ExitingBlocks; - OrigLoop->getExitingBlocks(ExitingBlocks); - for (auto *BB : ExitingBlocks) { - auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); - if (!Cmp || !Cmp->hasOneUse()) - continue; - - // TODO: we should introduce a getUniqueExitingBlocks on Loop - if (!DeadInstructions.insert(Cmp).second) - continue; - - // The operands of the icmp is often a dead trunc, used by IndUpdate. - // TODO: can recurse through operands in general - for (Value *Op : Cmp->operands()) { - if (isa<TruncInst>(Op) && Op->hasOneUse()) - DeadInstructions.insert(cast<Instruction>(Op)); - } - } - + // We create new control-flow for the vectorized loop, so the original exit + // conditions will be dead after vectorization if it's only used by the + // terminator + SmallVector<BasicBlock*> ExitingBlocks; + OrigLoop->getExitingBlocks(ExitingBlocks); + for (auto *BB : ExitingBlocks) { + auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); + if (!Cmp || !Cmp->hasOneUse()) + continue; + + // TODO: we should introduce a getUniqueExitingBlocks on Loop + if (!DeadInstructions.insert(Cmp).second) + continue; + + // The operands of the icmp is often a dead trunc, used by IndUpdate. 
+ // TODO: can recurse through operands in general + for (Value *Op : Cmp->operands()) { + if (isa<TruncInst>(Op) && Op->hasOneUse()) + DeadInstructions.insert(cast<Instruction>(Op)); + } + } + // We create new "steps" for induction variable updates to which the original // induction variables map. An original update instruction will be dead if // all its users except the induction variable are dead. - auto *Latch = OrigLoop->getLoopLatch(); + auto *Latch = OrigLoop->getLoopLatch(); for (auto &Induction : Legal->getInductionVars()) { PHINode *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - - // If the tail is to be folded by masking, the primary induction variable, - // if exists, isn't dead: it will be used for masking. Don't kill it. - if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) - continue; - + + // If the tail is to be folded by masking, the primary induction variable, + // if exists, isn't dead: it will be used for masking. Don't kill it. + if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) + continue; + if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) @@ -7855,284 +7855,284 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -//===--------------------------------------------------------------------===// -// EpilogueVectorizerMainLoop -//===--------------------------------------------------------------------===// - -/// This function is partially responsible for generating the control flow -/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { - MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton(""); - - // Generate the code to check the minimum iteration count of the vector - // epilogue (see below). - EPI.EpilogueIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); - EPI.EpilogueIterationCountCheck->setName("iter.check"); - - // Generate the code to check any assumptions that we've made for SCEV - // expressions. - BasicBlock *SavedPreHeader = LoopVectorPreHeader; - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.SCEVSafetyCheck = SavedPreHeader; - - // Generate the code that checks at runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - SavedPreHeader = LoopVectorPreHeader; - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save/overwite it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.MemSafetyCheck = SavedPreHeader; - - // Generate the iteration count check for the main loop, *after* the check - // for the epilogue loop, so that the path-length is shorter for the case - // that goes directly through the vector epilogue. The longer-path length for - // the main loop is compensated for, by the gain from vectorizing the larger - // trip count. Note: the branch will get updated later on when we vectorize - // the epilogue. - EPI.MainLoopIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); - - // Generate the induction variable. 
- OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - EPI.VectorTripCount = CountRoundDown; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - - // Skip induction resume value creation here because they will be created in - // the second pass. If we created them here, they wouldn't be used anyway, - // because the vplan in the second pass still contains the inductions from the - // original loop. - - return completeLoopSkeleton(Lp, OrigLoopID); -} - -void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { - LLVM_DEBUG({ - dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" - << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() - << ", Main Loop UF:" << EPI.MainLoopUF - << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; - }); -} - -void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { - DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; - }); -} - -BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( - Loop *L, BasicBlock *Bypass, bool ForEpilogue) { - assert(L && "Expected valid Loop."); - assert(Bypass && "Expected valid bypass basic block."); - unsigned VFactor = - ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); - unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(L); - // Reuse existing vector loop preheader for TC checks. - // Note that new preheader block is generated for vector loop. - BasicBlock *const TCCheckBlock = LoopVectorPreHeader; - IRBuilder<> Builder(TCCheckBlock->getTerminator()); - - // Generate code to check if the loop's trip count is less than VF * UF of the - // main vector loop. - auto P = - Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - - Value *CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), - "min.iters.check"); - - if (!ForEpilogue) - TCCheckBlock->setName("vector.main.loop.iter.check"); - - // Create new preheader for vector loop. - LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), - DT, LI, nullptr, "vector.ph"); - - if (ForEpilogue) { - assert(DT->properlyDominates(DT->getNode(TCCheckBlock), - DT->getNode(Bypass)->getIDom()) && - "TC check is expected to dominate Bypass"); - - // Update dominator for Bypass & LoopExit. - DT->changeImmediateDominator(Bypass, TCCheckBlock); - DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); - - LoopBypassBlocks.push_back(TCCheckBlock); - - // Save the trip count so we don't have to regenerate it in the - // vec.epilog.iter.check. This is safe to do because the trip count - // generated here dominates the vector epilog iter check. 
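// Illustrative sketch (not from LoopVectorize.cpp): the minimum-iteration check
// emitted above branches around the main vector loop whenever the trip count
// cannot cover VF * UF iterations, using <= instead of < when a scalar
// epilogue must run. A standalone model with hypothetical factors:
static bool exampleSkipMainVectorLoop(unsigned TripCount,
                                      bool RequiresScalarEpilogue) {
  const unsigned VF = 8, UF = 2;  // hypothetical main-loop factors
  unsigned MinIters = VF * UF;    // 16 iterations consumed per vector step
  return RequiresScalarEpilogue ? TripCount <= MinIters  // ICMP_ULE
                                : TripCount < MinIters;  // ICMP_ULT
}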
- EPI.TripCount = Count; - } - - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); - - return TCCheckBlock; -} - -//===--------------------------------------------------------------------===// -// EpilogueVectorizerEpilogueLoop -//===--------------------------------------------------------------------===// - -/// This function is partially responsible for generating the control flow -/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock * -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { - MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton("vec.epilog."); - - // Now, compare the remaining count and if there aren't enough iterations to - // execute the vectorized epilogue skip to the scalar part. - BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; - VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); - LoopVectorPreHeader = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, "vec.epilog.ph"); - emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, - VecEpilogueIterationCountCheck); - - // Adjust the control flow taking the state info from the main loop - // vectorization into account. - assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && - "expected this to be saved from the previous pass."); - EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopVectorPreHeader); - - DT->changeImmediateDominator(LoopVectorPreHeader, - EPI.MainLoopIterationCountCheck); - - EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); - - if (EPI.SCEVSafetyCheck) - EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); - if (EPI.MemSafetyCheck) - EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); - - DT->changeImmediateDominator( - VecEpilogueIterationCountCheck, - VecEpilogueIterationCountCheck->getSinglePredecessor()); - - DT->changeImmediateDominator(LoopScalarPreHeader, - EPI.EpilogueIterationCountCheck); - DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); - - // Keep track of bypass blocks, as they feed start values to the induction - // phis in the scalar loop preheader. - if (EPI.SCEVSafetyCheck) - LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); - if (EPI.MemSafetyCheck) - LoopBypassBlocks.push_back(EPI.MemSafetyCheck); - LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); - - // Generate a resume induction for the vector epilogue and put it in the - // vector epilogue preheader - Type *IdxTy = Legal->getWidestInductionType(); - PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", - LoopVectorPreHeader->getFirstNonPHI()); - EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); - EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), - EPI.MainLoopIterationCountCheck); - - // Generate the induction variable. 
- OldInduction = Legal->getPrimaryInduction(); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); - Value *StartIdx = EPResumeVal; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - - // Generate induction resume values. These variables save the new starting - // indexes for the scalar loop. They are used to test if there are any tail - // iterations left once the vector loop has completed. - // Note that when the vectorized epilogue is skipped due to iteration count - // check, then the resume value for the induction variable comes from - // the trip count of the main vector loop, hence passing the AdditionalBypass - // argument. - createInductionResumeValues(Lp, CountRoundDown, - {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); - - AddRuntimeUnrollDisableMetaData(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); -} - -BasicBlock * -EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( - Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { - - assert(EPI.TripCount && - "Expected trip count to have been safed in the first pass."); - assert( - (!isa<Instruction>(EPI.TripCount) || - DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && - "saved trip count does not dominate insertion point."); - Value *TC = EPI.TripCount; - IRBuilder<> Builder(Insert->getTerminator()); - Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); - - // Generate code to check if the loop's trip count is less than VF * UF of the - // vector epilogue loop. - auto P = - Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - - Value *CheckMinIters = Builder.CreateICmp( - P, Count, - ConstantInt::get(Count->getType(), - EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), - "min.epilog.iters.check"); - - ReplaceInstWithInst( - Insert->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); - - LoopBypassBlocks.push_back(Insert); - return Insert; -} - -void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { - LLVM_DEBUG({ - dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" - << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() - << ", Main Loop UF:" << EPI.MainLoopUF - << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; - }); -} - -void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { - DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; - }); -} - +//===--------------------------------------------------------------------===// +// EpilogueVectorizerMainLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton(""); + + // Generate the code to check the minimum iteration count of the vector + // epilogue (see below). 
+ EPI.EpilogueIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); + EPI.EpilogueIterationCountCheck->setName("iter.check"); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + BasicBlock *SavedPreHeader = LoopVectorPreHeader; + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.SCEVSafetyCheck = SavedPreHeader; + + // Generate the code that checks at runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + SavedPreHeader = LoopVectorPreHeader; + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save/overwite it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.MemSafetyCheck = SavedPreHeader; + + // Generate the iteration count check for the main loop, *after* the check + // for the epilogue loop, so that the path-length is shorter for the case + // that goes directly through the vector epilogue. The longer-path length for + // the main loop is compensated for, by the gain from vectorizing the larger + // trip count. Note: the branch will get updated later on when we vectorize + // the epilogue. + EPI.MainLoopIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + EPI.VectorTripCount = CountRoundDown; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Skip induction resume value creation here because they will be created in + // the second pass. If we created them here, they wouldn't be used anyway, + // because the vplan in the second pass still contains the inductions from the + // original loop. + + return completeLoopSkeleton(Lp, OrigLoopID); +} + +void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + }); +} + +BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( + Loop *L, BasicBlock *Bypass, bool ForEpilogue) { + assert(L && "Expected valid Loop."); + assert(Bypass && "Expected valid bypass basic block."); + unsigned VFactor = + ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); + unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; + Value *Count = getOrCreateTripCount(L); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); + + // Generate code to check if the loop's trip count is less than VF * UF of the + // main vector loop. + auto P = + Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), + "min.iters.check"); + + if (!ForEpilogue) + TCCheckBlock->setName("vector.main.loop.iter.check"); + + // Create new preheader for vector loop. + LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), + DT, LI, nullptr, "vector.ph"); + + if (ForEpilogue) { + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + LoopBypassBlocks.push_back(TCCheckBlock); + + // Save the trip count so we don't have to regenerate it in the + // vec.epilog.iter.check. This is safe to do because the trip count + // generated here dominates the vector epilog iter check. + EPI.TripCount = Count; + } + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + return TCCheckBlock; +} + +//===--------------------------------------------------------------------===// +// EpilogueVectorizerEpilogueLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +BasicBlock * +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton("vec.epilog."); + + // Now, compare the remaining count and if there aren't enough iterations to + // execute the vectorized epilogue skip to the scalar part. + BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; + VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); + LoopVectorPreHeader = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "vec.epilog.ph"); + emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, + VecEpilogueIterationCountCheck); + + // Adjust the control flow taking the state info from the main loop + // vectorization into account. + assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && + "expected this to be saved from the previous pass."); + EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopVectorPreHeader); + + DT->changeImmediateDominator(LoopVectorPreHeader, + EPI.MainLoopIterationCountCheck); + + EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + if (EPI.SCEVSafetyCheck) + EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + if (EPI.MemSafetyCheck) + EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + DT->changeImmediateDominator( + VecEpilogueIterationCountCheck, + VecEpilogueIterationCountCheck->getSinglePredecessor()); + + DT->changeImmediateDominator(LoopScalarPreHeader, + EPI.EpilogueIterationCountCheck); + DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); + + // Keep track of bypass blocks, as they feed start values to the induction + // phis in the scalar loop preheader. 
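
[Editorial note] Taken together, the two emitMinimumIterationCountCheck calls and the CFG rewiring above route a given trip count to exactly one entry point: too few iterations even for the epilogue VF*UF goes to the scalar loop, enough for the epilogue but not for the main VF*UF goes straight to the vector epilogue (after the second pass redirects that branch to vec.epilog.ph), and otherwise the main vector loop runs. The following is a minimal standalone sketch of that routing in plain C++, not the LLVM API; function and enum names are invented for illustration, and whether the epilogue also runs after the main loop is decided separately by "min.epilog.iters.check".

#include <cstdint>
#include <cstdio>

enum class Entry { ScalarLoop, VectorEpilogue, MainVectorLoop };

static Entry routeTripCount(uint64_t TC, uint64_t MainVFxUF,
                            uint64_t EpilogueVFxUF,
                            bool RequiresScalarEpilogue) {
  // ICMP_ULE when a scalar epilogue must execute at least once, ICMP_ULT otherwise.
  auto TooFew = [&](uint64_t Threshold) {
    return RequiresScalarEpilogue ? TC <= Threshold : TC < Threshold;
  };
  if (TooFew(EpilogueVFxUF))
    return Entry::ScalarLoop;      // "iter.check" branches to the scalar preheader
  if (TooFew(MainVFxUF))
    return Entry::VectorEpilogue;  // rewired in the second pass to "vec.epilog.ph"
  return Entry::MainVectorLoop;
}

int main() {
  std::printf("%d %d %d\n",
              static_cast<int>(routeTripCount(100, 16, 4, false)),  // main vector loop
              static_cast<int>(routeTripCount(10, 16, 4, false)),   // vector epilogue only
              static_cast<int>(routeTripCount(3, 16, 4, false)));   // scalar loop only
}
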
+ if (EPI.SCEVSafetyCheck) + LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); + if (EPI.MemSafetyCheck) + LoopBypassBlocks.push_back(EPI.MemSafetyCheck); + LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); + + // Generate a resume induction for the vector epilogue and put it in the + // vector epilogue preheader + Type *IdxTy = Legal->getWidestInductionType(); + PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", + LoopVectorPreHeader->getFirstNonPHI()); + EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); + EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), + EPI.MainLoopIterationCountCheck); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *StartIdx = EPResumeVal; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Generate induction resume values. These variables save the new starting + // indexes for the scalar loop. They are used to test if there are any tail + // iterations left once the vector loop has completed. + // Note that when the vectorized epilogue is skipped due to iteration count + // check, then the resume value for the induction variable comes from + // the trip count of the main vector loop, hence passing the AdditionalBypass + // argument. + createInductionResumeValues(Lp, CountRoundDown, + {VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); + + AddRuntimeUnrollDisableMetaData(Lp); + return completeLoopSkeleton(Lp, OrigLoopID); +} + +BasicBlock * +EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( + Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { + + assert(EPI.TripCount && + "Expected trip count to have been safed in the first pass."); + assert( + (!isa<Instruction>(EPI.TripCount) || + DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && + "saved trip count does not dominate insertion point."); + Value *TC = EPI.TripCount; + IRBuilder<> Builder(Insert->getTerminator()); + Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); + + // Generate code to check if the loop's trip count is less than VF * UF of the + // vector epilogue loop. + auto P = + Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, + ConstantInt::get(Count->getType(), + EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), + "min.epilog.iters.check"); + + ReplaceInstWithInst( + Insert->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + LoopBypassBlocks.push_back(Insert); + return Insert; +} + +void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + }); +} + bool LoopVectorizationPlanner::getDecisionAndClampRange( - const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { - assert(!Range.isEmpty() && "Trying to test an empty VF range."); + const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { + assert(!Range.isEmpty() && "Trying to test an empty VF range."); bool PredicateAtRangeStart = Predicate(Range.Start); - for (ElementCount TmpVF = Range.Start * 2; - ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) + for (ElementCount TmpVF = Range.Start * 2; + ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; @@ -8146,11 +8146,11 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( /// of VF's starting at a given VF and extending it as much as possible. Each /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, - ElementCount MaxVF) { - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; +void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, + ElementCount MaxVF) { + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { + VFRange SubRange = {VF, MaxVFPlusOne}; VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } @@ -8175,27 +8175,27 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; - // If source is an exiting block, we know the exit edge is dynamically dead - // in the vector loop, and thus we don't need to restrict the mask. Avoid - // adding uses of an otherwise potentially dead instruction. - if (OrigLoop->isLoopExiting(Src)) - return EdgeMaskCache[Edge] = SrcMask; - - VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); + // If source is an exiting block, we know the exit edge is dynamically dead + // in the vector loop, and thus we don't need to restrict the mask. Avoid + // adding uses of an otherwise potentially dead instruction. 
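
[Editorial note] The compare above works on what is left after the main vector loop: assuming getOrCreateVectorTripCount rounds the original trip count down to a multiple of the main loop's VF * UF (as the CountRoundDown name suggests), "n.vec.remaining" is that remainder, and the vector epilogue is entered only when the remainder covers at least one epilogue iteration bundle. A minimal arithmetic sketch, not the LLVM API:

#include <cassert>
#include <cstdint>

// Models "n.vec.remaining" and "min.epilog.iters.check" with the ULT predicate;
// requiresScalarEpilogue() would turn the compare into ULE so that at least one
// scalar iteration survives.
static bool enterVectorEpilogue(uint64_t TC, uint64_t MainVFxUF,
                                uint64_t EpilogueVFxUF) {
  uint64_t VectorTripCount = TC - TC % MainVFxUF; // iterations the main loop took
  uint64_t Remaining = TC - VectorTripCount;      // n.vec.remaining
  return Remaining >= EpilogueVFxUF;              // !(Remaining < EpilogueVF * EpilogueUF)
}

int main() {
  assert(enterVectorEpilogue(100, 16, 4));   // 4 left: run the vector epilogue
  assert(!enterVectorEpilogue(97, 16, 4));   // 1 left: straight to the scalar loop
  assert(!enterVectorEpilogue(35, 16, 8));   // 3 left, but the epilogue needs 8
  return 0;
}
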
+ if (OrigLoop->isLoopExiting(Src)) + return EdgeMaskCache[Edge] = SrcMask; + + VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) EdgeMask = Builder.createNot(EdgeMask); - if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. - // The condition is 'SrcMask && EdgeMask', which is equivalent to - // 'select i1 SrcMask, i1 EdgeMask, i1 false'. - // The select version does not introduce new UB if SrcMask is false and - // EdgeMask is poison. Using 'and' here introduces undefined behavior. - VPValue *False = Plan->getOrAddVPValue( - ConstantInt::getFalse(BI->getCondition()->getType())); - EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); - } + if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. + // The condition is 'SrcMask && EdgeMask', which is equivalent to + // 'select i1 SrcMask, i1 EdgeMask, i1 false'. + // The select version does not introduce new UB if SrcMask is false and + // EdgeMask is poison. Using 'and' here introduces undefined behavior. + VPValue *False = Plan->getOrAddVPValue( + ConstantInt::getFalse(BI->getCondition()->getType())); + EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + } return EdgeMaskCache[Edge] = EdgeMask; } @@ -8216,34 +8216,34 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredication(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - // Create the block in mask as the first non-phi instruction in the block. - VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - + // Create the block in mask as the first non-phi instruction in the block. + VPBuilder::InsertPointGuard Guard(Builder); + auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); + Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. // Start by constructing the desired canonical IV. VPValue *IV = nullptr; if (Legal->getPrimaryInduction()) - IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); + IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { auto IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); + Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); IV = IVRecipe->getVPValue(); } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); - - if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { - // While ActiveLaneMask is a binary op that consumes the loop tripcount - // as a second argument, we only pass the IV here and extract the - // tripcount from the transform state where codegen of the VP instructions - // happen. - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); - } else { + + if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { + // While ActiveLaneMask is a binary op that consumes the loop tripcount + // as a second argument, we only pass the IV here and extract the + // tripcount from the transform state where codegen of the VP instructions + // happen. 
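
[Editorial note] The select form used above is exactly what the comment is protecting against: with 'and', a poison EdgeMask would poison the combined mask even for lanes whose SrcMask is already false, while 'select i1 SrcMask, i1 EdgeMask, i1 false' never consults the edge mask on those lanes. A scalar sketch using std::optional as a stand-in for a possibly-poison i1; this is illustrative only, not LLVM's poison machinery:

#include <cassert>
#include <optional>

using MaybeBool = std::optional<bool>; // std::nullopt plays the role of poison

// 'and %src, %edge': poison in either operand makes the result poison.
static MaybeBool andMask(MaybeBool Src, MaybeBool Edge) {
  if (!Src || !Edge)
    return std::nullopt;
  return *Src && *Edge;
}

// 'select i1 %src, i1 %edge, i1 false': when the source mask is false, the edge
// mask is ignored, so poison there cannot leak into the block mask.
static MaybeBool selectMask(MaybeBool Src, MaybeBool Edge) {
  if (!Src)
    return std::nullopt;
  return *Src ? Edge : MaybeBool(false);
}

int main() {
  const MaybeBool Poison = std::nullopt;
  assert(!andMask(false, Poison).has_value());           // poison escapes
  assert(selectMask(false, Poison) == MaybeBool(false));  // masked lane stays off
  return 0;
}
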
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); + } else { BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); - } + } return BlockMaskCache[BB] = BlockMask; } @@ -8264,13 +8264,13 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Must be called with either a load or store"); - auto willWiden = [&](ElementCount VF) -> bool { - if (VF.isScalar()) + auto willWiden = [&](ElementCount VF) -> bool { + if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); @@ -8301,22 +8301,22 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { +VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. InductionDescriptor II = Legal->getInductionVars().lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return new VPWidenIntOrFpInductionRecipe(Phi, Start); - } + II.getKind() == InductionDescriptor::IK_FpInduction) { + VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); + return new VPWidenIntOrFpInductionRecipe(Phi, Start); + } return nullptr; } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, - VPlan &Plan) const { +VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, + VPlan &Plan) const { // Optimize the special case where the source is a constant integer // induction variable. Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -8325,21 +8325,21 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, // Determine whether \p K is a truncation based on an induction variable that // can be optimized. 
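
[Editorial note] The IV <= BTC form used for the header mask above ("TC may wrap, unlike BTC") is easiest to see with a narrow induction type: a loop covering the full range of an 8-bit IV has a backedge-taken count of 255 but a trip count of 256, which wraps to 0 in the IV's type. A tiny standalone illustration; the 8-bit width is only there to make the wrap visible:

#include <cassert>
#include <cstdint>

int main() {
  uint8_t BTC = 255;                          // backedge-taken count
  uint8_t TC = static_cast<uint8_t>(BTC + 1); // trip count wraps to 0

  uint8_t IV = 10;                 // some active lane's induction value
  bool MaskFromBTC = (IV <= BTC);  // correct: the lane is inside the loop
  bool MaskFromTC = (IV < TC);     // wrong: 10 < 0 is false, lane masked off

  assert(MaskFromBTC);
  assert(!MaskFromTC);
  return 0;
}
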
auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function<bool(ElementCount)> { - return [=](ElementCount VF) -> bool { - return CM.isOptimizableIVTruncate(K, VF); - }; + [&](Instruction *K) -> std::function<bool(ElementCount)> { + return [=](ElementCount VF) -> bool { + return CM.isOptimizableIVTruncate(K, VF); + }; }; if (LoopVectorizationPlanner::getDecisionAndClampRange( - isOptimizableIVTruncate(I), Range)) { - - InductionDescriptor II = - Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); - VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); + isOptimizableIVTruncate(I), Range)) { + + InductionDescriptor II = + Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); + VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - Start, I); - } + Start, I); + } return nullptr; } @@ -8368,9 +8368,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { - return CM.isScalarWithPredication(CI, VF); - }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -8378,23 +8378,23 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || - ID == Intrinsic::pseudoprobe || - ID == Intrinsic::experimental_noalias_scope_decl)) + ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || + ID == Intrinsic::pseudoprobe || + ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - auto willWiden = [&](ElementCount VF) -> bool { + auto willWiden = [&](ElementCount VF) -> bool { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize = false; - InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); - InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; - bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; - assert(IntrinsicCost.isValid() && CallCost.isValid() && - "Cannot have invalid costs while widening"); + InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; + bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; + assert(IntrinsicCost.isValid() && CallCost.isValid() && + "Cannot have invalid costs while widening"); return UseVectorIntrinsic || !NeedToScalarize; }; @@ -8409,7 +8409,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { !isa<StoreInst>(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. 
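
[Editorial note] The willWiden lambda in tryToWidenCall above comes down to a cost comparison: prefer the vector intrinsic when one exists and is no more expensive than a vectorized library call, and otherwise widen only if a vector call variant is available at all. A small model of that predicate in plain C++; the cost numbers stand in for CM.getVectorCallCost / CM.getVectorIntrinsicCost and the struct is invented for the sketch:

#include <cassert>
#include <cstdint>

struct CallCosts {
  bool HasVectorIntrinsic; // getVectorIntrinsicIDForCall returned a valid ID
  uint64_t IntrinsicCost;  // cost of the vectorized intrinsic
  uint64_t CallCost;       // cost of a vectorized library call
  bool NeedToScalarize;    // no vector variant of the call is available
};

// Mirrors: UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
//          widen iff UseVectorIntrinsic || !NeedToScalarize.
static bool willWidenCall(const CallCosts &C) {
  bool UseVectorIntrinsic = C.HasVectorIntrinsic && C.IntrinsicCost <= C.CallCost;
  return UseVectorIntrinsic || !C.NeedToScalarize;
}

int main() {
  assert(willWidenCall({true, 4, 10, true}));   // cheap intrinsic wins
  assert(willWidenCall({false, 0, 6, false}));  // vector library call is available
  assert(!willWidenCall({false, 0, 0, true}));  // nothing to widen with
  return 0;
}
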
- auto WillScalarize = [this, I](ElementCount VF) -> bool { + auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I, VF); @@ -8472,17 +8472,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, - Range); + [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, + Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); setRecipe(I, Recipe); - Plan->addVPValue(I, Recipe); + Plan->addVPValue(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. Avoid hoisting the insert-element which packs the scalar value into @@ -8524,9 +8524,9 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, assert(Instr->getParent() && "Predicated instruction not in any basic block"); auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); - auto *PHIRecipe = Instr->getType()->isVoidTy() - ? nullptr - : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); + auto *PHIRecipe = Instr->getType()->isVoidTy() + ? nullptr + : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); @@ -8554,21 +8554,21 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (auto Phi = dyn_cast<PHINode>(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Plan); - if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) + if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) return Recipe; - - if (Legal->isReductionVariable(Phi)) { - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; - VPValue *StartV = - Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); - return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); - } - + + if (Legal->isReductionVariable(Phi)) { + RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + VPValue *StartV = + Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); + return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); + } + return new VPWidenPHIRecipe(Phi); } - if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( - cast<TruncInst>(Instr), Range, *Plan))) + if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( + cast<TruncInst>(Instr), Range, *Plan))) return Recipe; if (!shouldWiden(Instr, Range)) @@ -8588,9 +8588,9 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return tryToWiden(Instr, *Plan); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { - assert(OrigLoop->isInnermost() && "Inner loop expected."); +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, + ElementCount MaxVF) { + assert(OrigLoop->isInnermost() && "Inner loop 
expected."); // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -8613,17 +8613,17 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, for (Instruction *I : DeadInstructions) SinkAfter.erase(I); - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { + VFRange SubRange = {VF, MaxVFPlusOne}; + VPlans.push_back( + buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); VF = SubRange.End; } } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const DenseMap<Instruction *, Instruction *> &SinkAfter) { // Hold a mapping from predicated instructions to their recipes, in order to @@ -8646,28 +8646,28 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder.recordRecipeOf(Entry.first); RecipeBuilder.recordRecipeOf(Entry.second); } - for (auto &Reduction : CM.getInLoopReductionChains()) { - PHINode *Phi = Reduction.first; - RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); - const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; - - RecipeBuilder.recordRecipeOf(Phi); - for (auto &R : ReductionOperations) { - RecipeBuilder.recordRecipeOf(R); - // For min/max reducitons, where we have a pair of icmp/select, we also - // need to record the ICmp recipe, so it can be removed later. - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) - RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); - } - } - + for (auto &Reduction : CM.getInLoopReductionChains()) { + PHINode *Phi = Reduction.first; + RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); + const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; + + RecipeBuilder.recordRecipeOf(Phi); + for (auto &R : ReductionOperations) { + RecipeBuilder.recordRecipeOf(R); + // For min/max reducitons, where we have a pair of icmp/select, we also + // need to record the ICmp recipe, so it can be removed later. + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) + RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); + } + } + // For each interleave group which is relevant for this (possibly trimmed) // Range, add it to the set of groups to be later applied to the VPlan and add // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. 
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { - return (VF.isVector() && // Query is illegal for VF == 1 + auto applyIG = [IG, this](ElementCount VF) -> bool { + return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); }; @@ -8715,11 +8715,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (auto Recipe = RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { - for (auto *Def : Recipe->definedValues()) { - auto *UV = Def->getUnderlyingValue(); - Plan->addVPValue(UV, Def); - } - + for (auto *Def : Recipe->definedValues()) { + auto *UV = Def->getUnderlyingValue(); + Plan->addVPValue(UV, Def); + } + RecipeBuilder.setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); continue; @@ -8755,18 +8755,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( for (auto &Entry : SinkAfter) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); - // If the target is in a replication region, make sure to move Sink to the - // block after it, not into the replication region itself. - if (auto *Region = - dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { - if (Region->isReplicator()) { - assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); - VPBasicBlock *NextBlock = - cast<VPBasicBlock>(Region->getSuccessors().front()); - Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); - continue; - } - } + // If the target is in a replication region, make sure to move Sink to the + // block after it, not into the replication region itself. + if (auto *Region = + dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { + if (Region->isReplicator()) { + assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); + VPBasicBlock *NextBlock = + cast<VPBasicBlock>(Region->getSuccessors().front()); + Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); + continue; + } + } Sink->moveAfter(Target); } @@ -8776,52 +8776,52 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( for (auto IG : InterleaveGroups) { auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( RecipeBuilder.getRecipe(IG->getInsertPos())); - SmallVector<VPValue *, 4> StoredValues; - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) - StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); - - auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, - Recipe->getMask()); - VPIG->insertBefore(Recipe); - unsigned J = 0; + SmallVector<VPValue *, 4> StoredValues; + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) + StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); + + auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, + Recipe->getMask()); + VPIG->insertBefore(Recipe); + unsigned J = 0; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *Member = IG->getMember(i)) { - if (!Member->getType()->isVoidTy()) { - VPValue *OriginalV = Plan->getVPValue(Member); - Plan->removeVPValueFor(Member); - Plan->addVPValue(Member, VPIG->getVPValue(J)); - OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); - J++; - } + if (!Member->getType()->isVoidTy()) { + VPValue *OriginalV = Plan->getVPValue(Member); + Plan->removeVPValueFor(Member); + 
Plan->addVPValue(Member, VPIG->getVPValue(J)); + OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); + J++; + } RecipeBuilder.getRecipe(Member)->eraseFromParent(); } } - // Adjust the recipes for any inloop reductions. - if (Range.Start.isVector()) - adjustRecipesForInLoopReductions(Plan, RecipeBuilder); - + // Adjust the recipes for any inloop reductions. + if (Range.Start.isVector()) + adjustRecipesForInLoopReductions(Plan, RecipeBuilder); + // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. - if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { + if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { Builder.setInsertPoint(VPBB); auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); for (auto &Reduction : Legal->getReductionVars()) { - if (CM.isInLoopReduction(Reduction.first)) - continue; - VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); - VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); + if (CM.isInLoopReduction(Reduction.first)) + continue; + VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); + VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); } } std::string PlanName; raw_string_ostream RSO(PlanName); - ElementCount VF = Range.Start; + ElementCount VF = Range.Start; Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { + for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -8837,7 +8837,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in // the vectorization pipeline. - assert(!OrigLoop->isInnermost()); + assert(!OrigLoop->isInnermost()); assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan @@ -8847,8 +8847,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); - for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); - VF *= 2) + for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); + VF *= 2) Plan->addVF(VF); if (EnableVPlanPredication) { @@ -8866,67 +8866,67 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { return Plan; } -// Adjust the recipes for any inloop reductions. The chain of instructions -// leading from the loop exit instr to the phi need to be converted to -// reductions, with one operand being vector and the other being the scalar -// reduction chain. -void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( - VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { - for (auto &Reduction : CM.getInLoopReductionChains()) { - PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; - const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; - - // ReductionOperations are orders top-down from the phi's use to the - // LoopExitValue. We keep a track of the previous item (the Chain) to tell - // which of the two operands will remain scalar and which will be reduced. - // For minmax the chain will be the select instructions. 
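
[Editorial note] The selects appended at the latch above keep folded-tail lanes out of a reduction: per lane, the value feeding the next phi is the freshly computed reduction value where the header mask is true, and the previous phi value elsewhere. A per-lane sketch with four lanes and an add reduction; the arrays are illustrative stand-ins for the widened values:

#include <array>
#include <cassert>

int main() {
  constexpr int VF = 4;
  std::array<int, VF> Phi = {10, 20, 30, 40};       // reduction phi, per lane
  std::array<int, VF> Red = {11, 22, 33, 44};       // live-out of this iteration
  std::array<bool, VF> Cond = {true, true, false, false}; // header block mask

  // select(Cond, Red, Phi): masked lanes keep their previous partial value.
  std::array<int, VF> Next;
  for (int L = 0; L < VF; ++L)
    Next[L] = Cond[L] ? Red[L] : Phi[L];

  assert(Next[1] == 22 && Next[2] == 30);
  return 0;
}
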
- Instruction *Chain = Phi; - for (Instruction *R : ReductionOperations) { - VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); - RecurKind Kind = RdxDesc.getRecurrenceKind(); - - VPValue *ChainOp = Plan->getVPValue(Chain); - unsigned FirstOpId; - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - assert(isa<VPWidenSelectRecipe>(WidenRecipe) && - "Expected to replace a VPWidenSelectSC"); - FirstOpId = 1; - } else { - assert(isa<VPWidenRecipe>(WidenRecipe) && - "Expected to replace a VPWidenSC"); - FirstOpId = 0; - } - unsigned VecOpId = - R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; - VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); - - auto *CondOp = CM.foldTailByMasking() - ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) - : nullptr; - VPReductionRecipe *RedRecipe = new VPReductionRecipe( - &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); - WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); - Plan->removeVPValueFor(R); - Plan->addVPValue(R, RedRecipe); - WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); - WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); - WidenRecipe->eraseFromParent(); - - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - VPRecipeBase *CompareRecipe = - RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); - assert(isa<VPWidenRecipe>(CompareRecipe) && - "Expected to replace a VPWidenSC"); - assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && - "Expected no remaining users"); - CompareRecipe->eraseFromParent(); - } - Chain = R; - } - } -} - +// Adjust the recipes for any inloop reductions. The chain of instructions +// leading from the loop exit instr to the phi need to be converted to +// reductions, with one operand being vector and the other being the scalar +// reduction chain. +void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( + VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { + for (auto &Reduction : CM.getInLoopReductionChains()) { + PHINode *Phi = Reduction.first; + RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; + + // ReductionOperations are orders top-down from the phi's use to the + // LoopExitValue. We keep a track of the previous item (the Chain) to tell + // which of the two operands will remain scalar and which will be reduced. + // For minmax the chain will be the select instructions. + Instruction *Chain = Phi; + for (Instruction *R : ReductionOperations) { + VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); + RecurKind Kind = RdxDesc.getRecurrenceKind(); + + VPValue *ChainOp = Plan->getVPValue(Chain); + unsigned FirstOpId; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + assert(isa<VPWidenSelectRecipe>(WidenRecipe) && + "Expected to replace a VPWidenSelectSC"); + FirstOpId = 1; + } else { + assert(isa<VPWidenRecipe>(WidenRecipe) && + "Expected to replace a VPWidenSC"); + FirstOpId = 0; + } + unsigned VecOpId = + R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; + VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); + + auto *CondOp = CM.foldTailByMasking() + ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) + : nullptr; + VPReductionRecipe *RedRecipe = new VPReductionRecipe( + &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); + WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + Plan->removeVPValueFor(R); + Plan->addVPValue(R, RedRecipe); + WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); + WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + WidenRecipe->eraseFromParent(); + + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + VPRecipeBase *CompareRecipe = + RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); + assert(isa<VPWidenRecipe>(CompareRecipe) && + "Expected to replace a VPWidenSC"); + assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && + "Expected no remaining users"); + CompareRecipe->eraseFromParent(); + } + Chain = R; + } + } +} + Value* LoopVectorizationPlanner::VPCallbackILV:: getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); @@ -8954,35 +8954,35 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } void VPWidenCallRecipe::execute(VPTransformState &State) { - State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, - *this, State); + State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, + *this, State); } void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), - this, *this, InvariantCond, State); + State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), + this, *this, InvariantCond, State); } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); + State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, - *this, State.UF, State.VF, IsPtrLoopInvariant, + State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, + *this, State.UF, State.VF, IsPtrLoopInvariant, IsIndexLoopInvariant, State); } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), - Trunc); + State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), + Trunc); } void VPWidenPHIRecipe::execute(VPTransformState &State) { - Value *StartV = - getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; - State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); + Value *StartV = + getStartValue() ? 
getStartValue()->getLiveInIRValue() : nullptr; + State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); } void VPBlendRecipe::execute(VPTransformState &State) { @@ -9026,59 +9026,59 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), - getStoredValues(), getMask()); -} - -void VPReductionRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "Reduction being replicated."); - for (unsigned Part = 0; Part < State.UF; ++Part) { - RecurKind Kind = RdxDesc->getRecurrenceKind(); - Value *NewVecOp = State.get(getVecOp(), Part); - if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.get(Cond, Part); - VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( - Kind, VecTy->getElementType()); - Constant *IdenVec = - ConstantVector::getSplat(VecTy->getElementCount(), Iden); - Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); - NewVecOp = Select; - } - Value *NewRed = - createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); - Value *PrevInChain = State.get(getChainOp(), Part); - Value *NextInChain; - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - NextInChain = - createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), - NewRed, PrevInChain); - } else { - NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, - PrevInChain); - } - State.set(this, getUnderlyingInstr(), NextInChain, Part); - } -} - + State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), + getStoredValues(), getMask()); +} + +void VPReductionRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Reduction being replicated."); + for (unsigned Part = 0; Part < State.UF; ++Part) { + RecurKind Kind = RdxDesc->getRecurrenceKind(); + Value *NewVecOp = State.get(getVecOp(), Part); + if (VPValue *Cond = getCondOp()) { + Value *NewCond = State.get(Cond, Part); + VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); + Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + Kind, VecTy->getElementType()); + Constant *IdenVec = + ConstantVector::getSplat(VecTy->getElementCount(), Iden); + Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); + NewVecOp = Select; + } + Value *NewRed = + createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); + Value *PrevInChain = State.get(getChainOp(), Part); + Value *NextInChain; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + NextInChain = + createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), + NewRed, PrevInChain); + } else { + NextInChain = State.Builder.CreateBinOp( + (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, + PrevInChain); + } + State.set(this, getUnderlyingInstr(), NextInChain, Part); + } +} + void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. 
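
[Editorial note] VPReductionRecipe::execute above performs one in-loop reduction step per unrolled part: masked lanes are first replaced with the recurrence identity, the vector is horizontally reduced, and the scalar result is folded into the running chain. A scalar model of one such step for an add reduction; the identity 0 stands in for getRecurrenceIdentity, and min/max recurrences would combine with a min/max instead of the final add:

#include <array>
#include <cassert>

// One part of an in-loop add reduction with a fold-tail mask.
static int reductionStep(int PrevInChain, const std::array<int, 4> &VecOp,
                         const std::array<bool, 4> &Cond) {
  const int Identity = 0; // add identity

  // select(Cond, VecOp, Identity-splat): neutralize masked-off lanes.
  std::array<int, 4> Selected;
  for (int L = 0; L < 4; ++L)
    Selected[L] = Cond[L] ? VecOp[L] : Identity;

  // createTargetReduction: horizontally reduce the vector to a scalar.
  int NewRed = 0;
  for (int V : Selected)
    NewRed += V;

  // Fold into the scalar chain (CreateBinOp in the plain binop case).
  return PrevInChain + NewRed;
}

int main() {
  const std::array<int, 4> VecOp = {1, 2, 3, 4};
  const std::array<bool, 4> Cond = {true, true, true, false};
  assert(reductionStep(100, VecOp, Cond) == 106);
  return 0;
}
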
- assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, - *State.Instance, IsPredicated, State); + assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, + *State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF.isVector()) { - // If we're constructing lane 0, initialize to start from poison. + if (AlsoPack && State.VF.isVector()) { + // If we're constructing lane 0, initialize to start from poison. if (State.Instance->Lane == 0) { - assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); - Value *Poison = PoisonValue::get( - VectorType::get(getUnderlyingValue()->getType(), State.VF)); - State.ValueMap.setVectorValue(getUnderlyingInstr(), - State.Instance->Part, Poison); + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + Value *Poison = PoisonValue::get( + VectorType::get(getUnderlyingValue()->getType(), State.VF)); + State.ValueMap.setVectorValue(getUnderlyingInstr(), + State.Instance->Part, Poison); } - State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(), - *State.Instance); + State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(), + *State.Instance); } return; } @@ -9086,12 +9086,12 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); - assert((!State.VF.isScalable() || IsUniform) && - "Can't scalarize a scalable vector"); + unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); + assert((!State.VF.isScalable() || IsUniform) && + "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, IsPredicated, State); } @@ -9123,8 +9123,8 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); - Instruction *ScalarPredInst = - cast<Instruction>(State.get(getOperand(0), *State.Instance)); + Instruction *ScalarPredInst = + cast<Instruction>(State.get(getOperand(0), *State.Instance)); BasicBlock *PredicatedBB = ScalarPredInst->getParent(); BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); assert(PredicatingBB && "Predicated block has no single predecessor."); @@ -9136,8 +9136,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // also do that packing, thereby "hoisting" the insert-element sequence. // Otherwise, a phi node for the scalar value is needed. 
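
[Editorial note] Outside the single-instance path, the replication above reduces to a pair of nested loops: every unrolled part gets one scalar clone per lane, collapsing to a single clone per part when the instruction is uniform. A sketch of just that iteration shape, counting clones instead of emitting IR; the function name is invented for the sketch:

#include <cstdio>

// How many scalar clones the Part/Lane loops above would generate.
static unsigned replicate(unsigned UF, unsigned VF, bool IsUniform) {
  unsigned Emitted = 0;
  unsigned EndLane = IsUniform ? 1 : VF; // uniform: first lane of each part only
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane) {
      // corresponds to scalarizeInstruction(..., {Part, Lane}, ...)
      ++Emitted;
    }
  return Emitted;
}

int main() {
  std::printf("%u\n", replicate(2, 4, false)); // 8 clones
  std::printf("%u\n", replicate(2, 4, true));  // 2 clones
}
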
unsigned Part = State.Instance->Part; - Instruction *PredInst = - cast<Instruction>(getOperand(0)->getUnderlyingValue()); + Instruction *PredInst = + cast<Instruction>(getOperand(0)->getUnderlyingValue()); if (State.ValueMap.hasVectorValue(PredInst, Part)) { Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); @@ -9148,17 +9148,17 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } else { Type *PredInstType = PredInst->getType(); PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); - Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); + Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); Phi->addIncoming(ScalarPredInst, PredicatedBB); State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); } } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction(&Ingredient, State, - StoredValue ? nullptr : getVPValue(), - getAddr(), StoredValue, getMask()); + VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; + State.ILV->vectorizeMemoryInstruction(&Ingredient, State, + StoredValue ? nullptr : getVPValue(), + getAddr(), StoredValue, getMask()); } // Determine how to lower the scalar epilogue, which depends on 1) optimising @@ -9172,51 +9172,51 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( LoopVectorizationLegality &LVL) { // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. - // (For PGSO, as shouldOptimizeForSize isn't currently accessible from - // LoopAccessInfo (due to code dependency and not being able to reliably get - // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection - // of strides in LoopAccessInfo::analyzeLoop() and vectorize without - // versioning when the vectorization is forced, unlike hasOptSize. So revert - // back to the old way and vectorize with versioning when forced. See D81345.) - if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, - PGSOQueryType::IRPass) && - Hints.getForce() != LoopVectorizeHints::FK_Enabled)) + // (For PGSO, as shouldOptimizeForSize isn't currently accessible from + // LoopAccessInfo (due to code dependency and not being able to reliably get + // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection + // of strides in LoopAccessInfo::analyzeLoop() and vectorize without + // versioning when the vectorization is forced, unlike hasOptSize. So revert + // back to the old way and vectorize with versioning when forced. See D81345.) 
+ if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass) && + Hints.getForce() != LoopVectorizeHints::FK_Enabled)) return CM_ScalarEpilogueNotAllowedOptSize; - // 2) If set, obey the directives - if (PreferPredicateOverEpilogue.getNumOccurrences()) { - switch (PreferPredicateOverEpilogue) { - case PreferPredicateTy::ScalarEpilogue: - return CM_ScalarEpilogueAllowed; - case PreferPredicateTy::PredicateElseScalarEpilogue: - return CM_ScalarEpilogueNotNeededUsePredicate; - case PreferPredicateTy::PredicateOrDontVectorize: - return CM_ScalarEpilogueNotAllowedUsePredicate; - }; - } - - // 3) If set, obey the hints - switch (Hints.getPredicate()) { - case LoopVectorizeHints::FK_Enabled: - return CM_ScalarEpilogueNotNeededUsePredicate; - case LoopVectorizeHints::FK_Disabled: + // 2) If set, obey the directives + if (PreferPredicateOverEpilogue.getNumOccurrences()) { + switch (PreferPredicateOverEpilogue) { + case PreferPredicateTy::ScalarEpilogue: + return CM_ScalarEpilogueAllowed; + case PreferPredicateTy::PredicateElseScalarEpilogue: + return CM_ScalarEpilogueNotNeededUsePredicate; + case PreferPredicateTy::PredicateOrDontVectorize: + return CM_ScalarEpilogueNotAllowedUsePredicate; + }; + } + + // 3) If set, obey the hints + switch (Hints.getPredicate()) { + case LoopVectorizeHints::FK_Enabled: + return CM_ScalarEpilogueNotNeededUsePredicate; + case LoopVectorizeHints::FK_Disabled: return CM_ScalarEpilogueAllowed; - }; + }; - // 4) if the TTI hook indicates this is profitable, request predication. - if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, - LVL.getLAI())) + // 4) if the TTI hook indicates this is profitable, request predication. + if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI())) return CM_ScalarEpilogueNotNeededUsePredicate; return CM_ScalarEpilogueAllowed; } -void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, - unsigned Part) { - set(Def, V, Part); - ILV->setVectorValue(IRDef, Part, V); -} - +void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, + unsigned Part) { + set(Def, V, Part); + ILV->setVectorValue(IRDef, Part, V); +} + // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -9228,7 +9228,7 @@ static bool processLoopInVPlanNativePath( OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { - if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { + if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); return false; } @@ -9247,7 +9247,7 @@ static bool processLoopInVPlanNativePath( LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); // Get user vectorization factor. - ElementCount UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. 
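
[Editorial note] getScalarEpilogueLowering above is a strict precedence ladder: size constraints first (with the PGSO carve-out for forced vectorization that the comment describes), then the -prefer-predicate-over-epilogue directive, then the loop hint, and only then the target hook. A condensed restatement with the inputs flattened into plain values; the enumerator names follow the source, everything else is an invented stand-in:

#include <cassert>

enum ScalarEpilogueLowering {
  CM_ScalarEpilogueAllowed,
  CM_ScalarEpilogueNotAllowedOptSize,
  CM_ScalarEpilogueNotNeededUsePredicate,
  CM_ScalarEpilogueNotAllowedUsePredicate,
};

enum class Directive { Unset, ScalarEpilogue, PredicateElseScalarEpilogue,
                       PredicateOrDontVectorize };
enum class Hint { Undefined, Enabled, Disabled };

static ScalarEpilogueLowering
chooseScalarEpilogue(bool HasOptSizeAttr, bool PGSOSaysOptimizeForSize,
                     bool VectorizationForced, Directive D, Hint H,
                     bool TTIPrefersPredication) {
  // 1) OptSize wins outright; profile-guided size optimization is ignored when
  //    vectorization was forced (the D81345 situation noted above).
  if (HasOptSizeAttr || (PGSOSaysOptimizeForSize && !VectorizationForced))
    return CM_ScalarEpilogueNotAllowedOptSize;
  // 2) Explicit command-line directive.
  switch (D) {
  case Directive::ScalarEpilogue:             return CM_ScalarEpilogueAllowed;
  case Directive::PredicateElseScalarEpilogue:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case Directive::PredicateOrDontVectorize:
    return CM_ScalarEpilogueNotAllowedUsePredicate;
  case Directive::Unset: break;
  }
  // 3) Loop hint.
  if (H == Hint::Enabled)  return CM_ScalarEpilogueNotNeededUsePredicate;
  if (H == Hint::Disabled) return CM_ScalarEpilogueAllowed;
  // 4) Target preference, otherwise the default.
  return TTIPrefersPredication ? CM_ScalarEpilogueNotNeededUsePredicate
                               : CM_ScalarEpilogueAllowed;
}

int main() {
  assert(chooseScalarEpilogue(true, false, true, Directive::Unset, Hint::Enabled,
                              false) == CM_ScalarEpilogueNotAllowedOptSize);
  assert(chooseScalarEpilogue(false, false, false, Directive::Unset,
                              Hint::Undefined, true) ==
         CM_ScalarEpilogueNotNeededUsePredicate);
  return 0;
}
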
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); @@ -9262,7 +9262,7 @@ static bool processLoopInVPlanNativePath( LVP.setBestPlan(VF.Width, 1); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI); + &CM, BFI, PSI); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -9281,7 +9281,7 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) !EnableLoopVectorization) {} bool LoopVectorizePass::processLoop(Loop *L) { - assert((EnableVPlanNativePath || L->isInnermost()) && + assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. Only process inner loops."); #ifndef NDEBUG @@ -9326,7 +9326,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, - &Requirements, &Hints, DB, AC, BFI, PSI); + &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -9343,11 +9343,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { // even evaluating whether vectorization is profitable. Since we cannot modify // the incoming IR, we need to build VPlan upfront in the vectorization // pipeline. - if (!L->isInnermost()) + if (!L->isInnermost()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, ORE, BFI, PSI, Hints); - assert(L->isInnermost() && "Inner loop expected."); + assert(L->isInnermost() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. @@ -9412,7 +9412,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); // Get user vectorization factor and interleave count. - ElementCount UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. @@ -9437,7 +9437,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (VF.Width.isScalar()) { + if (VF.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( "VectorizationNotBeneficial", @@ -9526,8 +9526,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, - BFI, PSI); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, + BFI, PSI); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -9539,51 +9539,51 @@ bool LoopVectorizePass::processLoop(Loop *L) { } else { // If we decided that it is *legal* to vectorize the loop, then do it. - // Consider vectorizing the epilogue too if it's profitable. - VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); - if (EpilogueVF.Width.isVector()) { - - // The first pass vectorizes the main loop and creates a scalar epilogue - // to be vectorized by executing the plan (potentially with a different - // factor) again shortly afterwards. 
- EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, - EpilogueVF.Width.getKnownMinValue(), 1); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, - &LVL, &CM, BFI, PSI); - - LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); - LVP.executePlan(MainILV, DT); - ++LoopsVectorized; - - simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); - formLCSSARecursively(*L, *DT, LI, SE); - - // Second pass vectorizes the epilogue and adjusts the control flow - // edges from the first pass. - LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); - EPI.MainLoopVF = EPI.EpilogueVF; - EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &LVL, &CM, BFI, PSI); - LVP.executePlan(EpilogILV, DT); - ++LoopsEpilogueVectorized; - - if (!MainILV.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; - } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI); - LVP.executePlan(LB, DT); - ++LoopsVectorized; - - // Add metadata to disable runtime unrolling a scalar loop when there are - // no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; - } - + // Consider vectorizing the epilogue too if it's profitable. + VectorizationFactor EpilogueVF = + CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + if (EpilogueVF.Width.isVector()) { + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, + EpilogueVF.Width.getKnownMinValue(), 1); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, + &LVL, &CM, BFI, PSI); + + LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); + LVP.executePlan(MainILV, DT); + ++LoopsVectorized; + + simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); + formLCSSARecursively(*L, *DT, LI, SE); + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); + EPI.MainLoopVF = EPI.EpilogueVF; + EPI.MainLoopUF = EPI.EpilogueUF; + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, + ORE, EPI, &LVL, &CM, BFI, PSI); + LVP.executePlan(EpilogILV, DT); + ++LoopsEpilogueVectorized; + + if (!MainILV.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM, BFI, PSI); + LVP.executePlan(LB, DT); + ++LoopsVectorized; + + // Add metadata to disable runtime unrolling a scalar loop when there are + // no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + if (!LB.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } + // Report the vectorization decision. 
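// [Editor's sketch] The epilogue-vectorization branch above runs two passes:
// the main loop is vectorized at (main VF x IC), and the leftover iterations
// are vectorized again at a smaller epilogue VF, leaving at most
// EpilogueVF - 1 iterations for the scalar remainder. The arithmetic below is
// a self-contained illustration of how a trip count gets split; it is not the
// LLVM implementation.
#include <cstdio>

struct TripSplit {
  unsigned long MainIters;     // executed by the main vector loop
  unsigned long EpilogueIters; // executed by the epilogue vector loop
  unsigned long ScalarIters;   // left for the scalar remainder
};

TripSplit splitTripCount(unsigned long TC, unsigned MainVF, unsigned IC,
                         unsigned EpilogueVF) {
  const unsigned long MainStep = (unsigned long)MainVF * IC;
  TripSplit S;
  S.MainIters = (TC / MainStep) * MainStep;
  const unsigned long Rem = TC - S.MainIters;
  S.EpilogueIters = (Rem / EpilogueVF) * EpilogueVF;
  S.ScalarIters = Rem - S.EpilogueIters;
  return S;
}

int main() {
  // E.g. TC=1000, main VF=8 with IC=2 (step 16), epilogue VF=4: 992
  // iterations in the main vector loop, 8 in the epilogue vector loop,
  // 0 in the scalar remainder.
  TripSplit S = splitTripCount(1000, 8, 2, 4);
  std::printf("%lu %lu %lu\n", S.MainIters, S.EpilogueIters, S.ScalarIters);
  return 0;
}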
ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), @@ -9696,8 +9696,8 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, - TLI, TTI, nullptr, MSSA}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, MSSA}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0b63019791..7cc322d4b6 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,16 +26,16 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" @@ -80,7 +80,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" -#include "llvm/Support/InstructionCost.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -128,10 +128,10 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); -static cl::opt<unsigned> -MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, - cl::desc("Maximum SLP vectorization factor (0=unlimited)")); - +static cl::opt<unsigned> +MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, + cl::desc("Maximum SLP vectorization factor (0=unlimited)")); + static cl::opt<int> MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, cl::desc("Maximum depth of the lookup for consecutive stores.")); @@ -206,12 +206,12 @@ static bool allSameBlock(ArrayRef<Value *> VL) { if (!I0) return false; BasicBlock *BB = I0->getParent(); - for (int I = 1, E = VL.size(); I < E; I++) { - auto *II = dyn_cast<Instruction>(VL[I]); - if (!II) + for (int I = 1, E = VL.size(); I < E; I++) { + auto *II = dyn_cast<Instruction>(VL[I]); + if (!II) return false; - if (BB != II->getParent()) + if (BB != II->getParent()) return false; } return true; @@ -236,16 +236,16 @@ static bool isSplat(ArrayRef<Value *> VL) { return true; } -/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. +/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. 
static bool isCommutative(Instruction *I) { - if (auto *Cmp = dyn_cast<CmpInst>(I)) - return Cmp->isCommutative(); - if (auto *BO = dyn_cast<BinaryOperator>(I)) - return BO->isCommutative(); - // TODO: This should check for generic Instruction::isCommutative(), but - // we need to confirm that the caller code correctly handles Intrinsics - // for example (does not have 2 operands). - return false; + if (auto *Cmp = dyn_cast<CmpInst>(I)) + return Cmp->isCommutative(); + if (auto *BO = dyn_cast<BinaryOperator>(I)) + return BO->isCommutative(); + // TODO: This should check for generic Instruction::isCommutative(), but + // we need to confirm that the caller code correctly handles Intrinsics + // for example (does not have 2 operands). + return false; } /// Checks if the vector of instructions can be represented as a shuffle, like: @@ -257,7 +257,7 @@ static bool isCommutative(Instruction *I) { /// %x3x3 = mul i8 %x3, %x3 /// %y1y1 = mul i8 %y1, %y1 /// %y2y2 = mul i8 %y2, %y2 -/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 +/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 @@ -272,13 +272,13 @@ static bool isCommutative(Instruction *I) { /// %x3 = extractelement <4 x i8> %x, i32 3 /// %y1 = extractelement <4 x i8> %y, i32 1 /// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 +/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 /// %5 = mul <4 x i8> %4, %4 /// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 +/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 /// %7 = extractelement <4 x i8> %5, i32 1 /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 /// %8 = extractelement <4 x i8> %5, i32 2 @@ -292,8 +292,8 @@ static bool isCommutative(Instruction *I) { static Optional<TargetTransformInfo::ShuffleKind> isShuffle(ArrayRef<Value *> VL) { auto *EI0 = cast<ExtractElementInst>(VL[0]); - unsigned Size = - cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); + unsigned Size = + cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); Value *Vec1 = nullptr; Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; @@ -302,7 +302,7 @@ isShuffle(ArrayRef<Value *> VL) { auto *EI = cast<ExtractElementInst>(VL[I]); auto *Vec = EI->getVectorOperand(); // All vector operands must have the same number of vector elements. - if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) + if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) return None; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) @@ -311,7 +311,7 @@ isShuffle(ArrayRef<Value *> VL) { if (Idx->getValue().uge(Size)) continue; unsigned IntIdx = Idx->getValue().getZExtValue(); - // We can extractelement from undef or poison vector. + // We can extractelement from undef or poison vector. if (isa<UndefValue>(Vec)) continue; // For correct shuffling we have to have at most 2 different vector operands @@ -508,7 +508,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } /// \returns the AA location that is being access by the instruction. 
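// [Editor's sketch] isShuffle above classifies a bundle of extractelement
// instructions: if every lane reads from one of at most two equally sized
// source vectors, the bundle can be emitted as a single shuffle; it is a
// "select"-style shuffle when lane i reads element i of either source, and a
// permute otherwise. The plain-data model below (Lane, classifyShuffle) is an
// illustration only and uses its own names, not TTI's ShuffleKind values.
#include <optional>
#include <vector>

enum class SketchShuffleKind { SingleSource, Select, Permute };

struct Lane {
  int SourceVec;  // 0 or 1: which of the two source vectors is read
  unsigned Index; // element index read from that source
};

std::optional<SketchShuffleKind>
classifyShuffle(const std::vector<Lane> &Lanes, unsigned VecWidth) {
  bool UsesSecondSource = false;
  bool IsSelect = true;
  for (unsigned I = 0, E = Lanes.size(); I < E; ++I) {
    if (Lanes[I].Index >= VecWidth || Lanes[I].SourceVec > 1)
      return std::nullopt; // not representable as a two-source shuffle
    UsesSecondSource |= (Lanes[I].SourceVec == 1);
    IsSelect &= (Lanes[I].Index == I); // "select" keeps each lane in place
  }
  if (!UsesSecondSource)
    return SketchShuffleKind::SingleSource;
  return IsSelect ? SketchShuffleKind::Select : SketchShuffleKind::Permute;
}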
-static MemoryLocation getLocation(Instruction *I, AAResults *AA) { +static MemoryLocation getLocation(Instruction *I, AAResults *AA) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -529,15 +529,15 @@ static bool isSimple(Instruction *I) { namespace llvm { -static void inversePermutation(ArrayRef<unsigned> Indices, - SmallVectorImpl<int> &Mask) { - Mask.clear(); - const unsigned E = Indices.size(); - Mask.resize(E, E + 1); - for (unsigned I = 0; I < E; ++I) - Mask[Indices[I]] = I; -} - +static void inversePermutation(ArrayRef<unsigned> Indices, + SmallVectorImpl<int> &Mask) { + Mask.clear(); + const unsigned E = Indices.size(); + Mask.resize(E, E + 1); + for (unsigned I = 0; I < E; ++I) + Mask[Indices[I]] = I; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -552,10 +552,10 @@ public: using StoreList = SmallVector<StoreInst *, 8>; using ExtraValueToDebugLocsMap = MapVector<Value *, SmallVector<Instruction *, 2>>; - using OrdersType = SmallVector<unsigned, 4>; + using OrdersType = SmallVector<unsigned, 4>; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, - TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, + TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), @@ -589,11 +589,11 @@ public: /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - InstructionCost getSpillCost() const; + InstructionCost getSpillCost() const; /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - InstructionCost getTreeCost(); + InstructionCost getTreeCost(); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -630,14 +630,14 @@ public: /// \returns The best order of instructions for vectorization. Optional<ArrayRef<unsigned>> bestOrder() const { - assert(llvm::all_of( - NumOpsWantToKeepOrder, - [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { - return D.getFirst().size() == - VectorizableTree[0]->Scalars.size(); - }) && - "All orders must have the same size as number of instructions in " - "tree node."); + assert(llvm::all_of( + NumOpsWantToKeepOrder, + [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { + return D.getFirst().size() == + VectorizableTree[0]->Scalars.size(); + }) && + "All orders must have the same size as number of instructions in " + "tree node."); auto I = std::max_element( NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, @@ -651,81 +651,81 @@ public: return makeArrayRef(I->getFirst()); } - /// Builds the correct order for root instructions. - /// If some leaves have the same instructions to be vectorized, we may - /// incorrectly evaluate the best order for the root node (it is built for the - /// vector of instructions without repeated instructions and, thus, has less - /// elements than the root node). This function builds the correct order for - /// the root node. - /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves - /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first - /// leaf, it will be shrink to \<a, b\>. 
If instructions in this leaf should - /// be reordered, the best order will be \<1, 0\>. We need to extend this - /// order for the root node. For the root node this order should look like - /// \<3, 0, 1, 2\>. This function extends the order for the reused - /// instructions. - void findRootOrder(OrdersType &Order) { - // If the leaf has the same number of instructions to vectorize as the root - // - order must be set already. - unsigned RootSize = VectorizableTree[0]->Scalars.size(); - if (Order.size() == RootSize) - return; - SmallVector<unsigned, 4> RealOrder(Order.size()); - std::swap(Order, RealOrder); - SmallVector<int, 4> Mask; - inversePermutation(RealOrder, Mask); - Order.assign(Mask.begin(), Mask.end()); - // The leaf has less number of instructions - need to find the true order of - // the root. - // Scan the nodes starting from the leaf back to the root. - const TreeEntry *PNode = VectorizableTree.back().get(); - SmallVector<const TreeEntry *, 4> Nodes(1, PNode); - SmallPtrSet<const TreeEntry *, 4> Visited; - while (!Nodes.empty() && Order.size() != RootSize) { - const TreeEntry *PNode = Nodes.pop_back_val(); - if (!Visited.insert(PNode).second) - continue; - const TreeEntry &Node = *PNode; - for (const EdgeInfo &EI : Node.UserTreeIndices) - if (EI.UserTE) - Nodes.push_back(EI.UserTE); - if (Node.ReuseShuffleIndices.empty()) - continue; - // Build the order for the parent node. - OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); - SmallVector<unsigned, 4> OrderCounter(Order.size(), 0); - // The algorithm of the order extension is: - // 1. Calculate the number of the same instructions for the order. - // 2. Calculate the index of the new order: total number of instructions - // with order less than the order of the current instruction + reuse - // number of the current instruction. - // 3. The new order is just the index of the instruction in the original - // vector of the instructions. - for (unsigned I : Node.ReuseShuffleIndices) - ++OrderCounter[Order[I]]; - SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0); - for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { - unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; - unsigned OrderIdx = Order[ReusedIdx]; - unsigned NewIdx = 0; - for (unsigned J = 0; J < OrderIdx; ++J) - NewIdx += OrderCounter[J]; - NewIdx += CurrentCounter[OrderIdx]; - ++CurrentCounter[OrderIdx]; - assert(NewOrder[NewIdx] == RootSize && - "The order index should not be written already."); - NewOrder[NewIdx] = I; - } - std::swap(Order, NewOrder); - } - assert(Order.size() == RootSize && - "Root node is expected or the size of the order must be the same as " - "the number of elements in the root node."); - assert(llvm::all_of(Order, - [RootSize](unsigned Val) { return Val != RootSize; }) && - "All indices must be initialized"); - } - + /// Builds the correct order for root instructions. + /// If some leaves have the same instructions to be vectorized, we may + /// incorrectly evaluate the best order for the root node (it is built for the + /// vector of instructions without repeated instructions and, thus, has less + /// elements than the root node). This function builds the correct order for + /// the root node. + /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves + /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first + /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should + /// be reordered, the best order will be \<1, 0\>. 
We need to extend this + /// order for the root node. For the root node this order should look like + /// \<3, 0, 1, 2\>. This function extends the order for the reused + /// instructions. + void findRootOrder(OrdersType &Order) { + // If the leaf has the same number of instructions to vectorize as the root + // - order must be set already. + unsigned RootSize = VectorizableTree[0]->Scalars.size(); + if (Order.size() == RootSize) + return; + SmallVector<unsigned, 4> RealOrder(Order.size()); + std::swap(Order, RealOrder); + SmallVector<int, 4> Mask; + inversePermutation(RealOrder, Mask); + Order.assign(Mask.begin(), Mask.end()); + // The leaf has less number of instructions - need to find the true order of + // the root. + // Scan the nodes starting from the leaf back to the root. + const TreeEntry *PNode = VectorizableTree.back().get(); + SmallVector<const TreeEntry *, 4> Nodes(1, PNode); + SmallPtrSet<const TreeEntry *, 4> Visited; + while (!Nodes.empty() && Order.size() != RootSize) { + const TreeEntry *PNode = Nodes.pop_back_val(); + if (!Visited.insert(PNode).second) + continue; + const TreeEntry &Node = *PNode; + for (const EdgeInfo &EI : Node.UserTreeIndices) + if (EI.UserTE) + Nodes.push_back(EI.UserTE); + if (Node.ReuseShuffleIndices.empty()) + continue; + // Build the order for the parent node. + OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); + SmallVector<unsigned, 4> OrderCounter(Order.size(), 0); + // The algorithm of the order extension is: + // 1. Calculate the number of the same instructions for the order. + // 2. Calculate the index of the new order: total number of instructions + // with order less than the order of the current instruction + reuse + // number of the current instruction. + // 3. The new order is just the index of the instruction in the original + // vector of the instructions. + for (unsigned I : Node.ReuseShuffleIndices) + ++OrderCounter[Order[I]]; + SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0); + for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { + unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; + unsigned OrderIdx = Order[ReusedIdx]; + unsigned NewIdx = 0; + for (unsigned J = 0; J < OrderIdx; ++J) + NewIdx += OrderCounter[J]; + NewIdx += CurrentCounter[OrderIdx]; + ++CurrentCounter[OrderIdx]; + assert(NewOrder[NewIdx] == RootSize && + "The order index should not be written already."); + NewOrder[NewIdx] = I; + } + std::swap(Order, NewOrder); + } + assert(Order.size() == RootSize && + "Root node is expected or the size of the order must be the same as " + "the number of elements in the root node."); + assert(llvm::all_of(Order, + [RootSize](unsigned Val) { return Val != RootSize; }) && + "All indices must be initialized"); + } + /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded @@ -747,12 +747,12 @@ public: return MinVecRegSize; } - unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { - unsigned MaxVF = MaxVFOption.getNumOccurrences() ? - MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); - return MaxVF ? MaxVF : UINT_MAX; - } - + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { + unsigned MaxVF = MaxVFOption.getNumOccurrences() ? + MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); + return MaxVF ? MaxVF : UINT_MAX; + } + /// Check if homogeneous aggregate is isomorphic to some VectorType. 
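// [Editor's sketch] findRootOrder above widens an order computed for a
// deduplicated leaf (e.g. <1, 0> over two unique scalars) to the size of the
// root node: it counts, per order slot, how many root lanes reuse that slot,
// then assigns consecutive positions in encounter order. extendOrderToRoot
// below is a standalone restatement of that counting scheme, not the LLVM
// code itself.
#include <vector>

std::vector<unsigned>
extendOrderToRoot(const std::vector<unsigned> &Order,
                  const std::vector<unsigned> &ReuseIndices) {
  const unsigned RootSize = ReuseIndices.size();
  std::vector<unsigned> NewOrder(RootSize, RootSize);
  // 1) How many root lanes map to each order slot of the deduplicated leaf.
  std::vector<unsigned> OrderCounter(Order.size(), 0);
  for (unsigned ReusedIdx : ReuseIndices)
    ++OrderCounter[Order[ReusedIdx]];
  // 2)+3) Place each root lane after all lanes of smaller order slots, in
  // the order the reuses are encountered.
  std::vector<unsigned> CurrentCounter(Order.size(), 0);
  for (unsigned I = 0; I < RootSize; ++I) {
    unsigned OrderIdx = Order[ReuseIndices[I]];
    unsigned NewIdx = CurrentCounter[OrderIdx]++;
    for (unsigned J = 0; J < OrderIdx; ++J)
      NewIdx += OrderCounter[J];
    NewOrder[NewIdx] = I;
  }
  return NewOrder;
}

// For the documented example the deduplicated leaf order is <1, 0> and the
// root lanes reuse the unique scalars as <0, 0, 0, 1>; the extended order
// comes out as <3, 0, 1, 2>, matching the comment above.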
/// Accepts homogeneous multidimensional aggregate of scalars/vectors like /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, @@ -772,7 +772,7 @@ public: /// effectively impossible for the backend to undo. /// TODO: If load combining is allowed in the IR optimizer, this analysis /// may not be necessary. - bool isLoadCombineReductionCandidate(RecurKind RdxKind) const; + bool isLoadCombineReductionCandidate(RecurKind RdxKind) const; /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values /// can be load combined in the backend. Load combining may not be allowed in @@ -987,14 +987,14 @@ public: std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}}; for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { Value *V = Values[Idx].first; - if (isa<Constant>(V)) { - // Since this is a function pass, it doesn't make semantic sense to - // walk the users of a subclass of Constant. The users could be in - // another function, or even another module that happens to be in - // the same LLVMContext. - continue; - } - + if (isa<Constant>(V)) { + // Since this is a function pass, it doesn't make semantic sense to + // walk the users of a subclass of Constant. The users could be in + // another function, or even another module that happens to be in + // the same LLVMContext. + continue; + } + // Calculate the absolute lane, using the minimum relative lane of LHS // and RHS as base and Idx as the offset. int Ln = std::min(LHS.second, RHS.second) + Idx; @@ -1503,7 +1503,7 @@ private: bool areAllUsersVectorized(Instruction *I) const; /// \returns the cost of the vectorizable entry. - InstructionCost getEntryCost(TreeEntry *E); + InstructionCost getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, @@ -1525,21 +1525,21 @@ private: /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - InstructionCost - getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const; + InstructionCost + getGatherCost(FixedVectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - InstructionCost getGatherCost(ArrayRef<Value *> VL) const; + InstructionCost getGatherCost(ArrayRef<Value *> VL) const; /// Set the Builder insert point to one after the last instruction in /// the bundle void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. - Value *gather(ArrayRef<Value *> VL); + Value *gather(ArrayRef<Value *> VL); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. @@ -1573,17 +1573,17 @@ private: /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; - /// Do we need to gather this sequence or vectorize it - /// (either with vector instruction or with scatter/gather - /// intrinsics for store/load)? - enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; + /// Do we need to gather this sequence or vectorize it + /// (either with vector instruction or with scatter/gather + /// intrinsics for store/load)? 
+ enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; EntryState State; /// Does this sequence require some shuffling? SmallVector<int, 4> ReuseShuffleIndices; /// Does this entry require reordering? - SmallVector<unsigned, 4> ReorderIndices; + SmallVector<unsigned, 4> ReorderIndices; /// Points back to the VectorizableTree. /// @@ -1724,9 +1724,9 @@ private: case Vectorize: dbgs() << "Vectorize\n"; break; - case ScatterVectorize: - dbgs() << "ScatterVectorize\n"; - break; + case ScatterVectorize: + dbgs() << "ScatterVectorize\n"; + break; case NeedToGather: dbgs() << "NeedToGather\n"; break; @@ -1748,7 +1748,7 @@ private: dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) - dbgs() << "Empty"; + dbgs() << "Empty"; else for (unsigned ReuseIdx : ReuseShuffleIndices) dbgs() << ReuseIdx << ", "; @@ -1765,55 +1765,55 @@ private: #endif }; -#ifndef NDEBUG - void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost, - InstructionCost VecCost, - InstructionCost ScalarCost) const { - dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); - dbgs() << "SLP: Costs:\n"; - dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; - dbgs() << "SLP: VectorCost = " << VecCost << "\n"; - dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; - dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " << - ReuseShuffleCost + VecCost - ScalarCost << "\n"; - } -#endif - +#ifndef NDEBUG + void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost, + InstructionCost VecCost, + InstructionCost ScalarCost) const { + dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); + dbgs() << "SLP: Costs:\n"; + dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; + dbgs() << "SLP: VectorCost = " << VecCost << "\n"; + dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; + dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " << + ReuseShuffleCost + VecCost - ScalarCost << "\n"; + } +#endif + /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef<unsigned> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { - TreeEntry::EntryState EntryState = - Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; - return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, - ReuseShuffleIndices, ReorderIndices); - } - - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, - TreeEntry::EntryState EntryState, - Optional<ScheduleData *> Bundle, - const InstructionsState &S, - const EdgeInfo &UserTreeIdx, - ArrayRef<unsigned> ReuseShuffleIndices = None, - ArrayRef<unsigned> ReorderIndices = None) { - assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || - (Bundle && EntryState != TreeEntry::NeedToGather)) && - "Need to vectorize gather entry?"); + TreeEntry::EntryState EntryState = + Bundle ? 
TreeEntry::Vectorize : TreeEntry::NeedToGather; + return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, + ReuseShuffleIndices, ReorderIndices); + } + + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, + TreeEntry::EntryState EntryState, + Optional<ScheduleData *> Bundle, + const InstructionsState &S, + const EdgeInfo &UserTreeIdx, + ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<unsigned> ReorderIndices = None) { + assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || + (Bundle && EntryState != TreeEntry::NeedToGather)) && + "Need to vectorize gather entry?"); VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->State = EntryState; + Last->State = EntryState; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); - Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); + Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); Last->setOperations(S); - if (Last->State != TreeEntry::NeedToGather) { - for (Value *V : VL) { - assert(!getTreeEntry(V) && "Scalar already in tree!"); - ScalarToTreeEntry[V] = Last; + if (Last->State != TreeEntry::NeedToGather) { + for (Value *V : VL) { + assert(!getTreeEntry(V) && "Scalar already in tree!"); + ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. unsigned Lane = 0; @@ -1849,10 +1849,10 @@ private: } #endif - TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } + TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } const TreeEntry *getTreeEntry(Value *V) const { - return ScalarToTreeEntry.lookup(V); + return ScalarToTreeEntry.lookup(V); } /// Maps a specific scalar to its tree entry. @@ -2374,7 +2374,7 @@ private: ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; - AAResults *AA; + AAResults *AA; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; @@ -2473,9 +2473,9 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } for (auto V : Entry->Scalars) { OS << *V; - if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { - return EU.Scalar == V; - })) + if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { + return EU.Scalar == V; + })) OS << " <extract>"; OS << "\n"; } @@ -2507,17 +2507,17 @@ BoUpSLP::~BoUpSLP() { "trying to erase instruction with users."); Pair.getFirst()->eraseFromParent(); } -#ifdef EXPENSIVE_CHECKS - // If we could guarantee that this call is not extremely slow, we could - // remove the ifdef limitation (see PR47712). +#ifdef EXPENSIVE_CHECKS + // If we could guarantee that this call is not extremely slow, we could + // remove the ifdef limitation (see PR47712). assert(!verifyFunction(*F, &dbgs())); -#endif +#endif } void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { for (auto *V : AV) { if (auto *I = dyn_cast<Instruction>(V)) - eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); + eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); }; } @@ -2742,11 +2742,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto *PH = cast<PHINode>(VL0); // Check for terminator values (e.g. invoke). 
- for (Value *V : VL) - for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { + for (Value *V : VL) + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { Instruction *Term = dyn_cast<Instruction>( - cast<PHINode>(V)->getIncomingValueForBlock( - PH->getIncomingBlock(I))); + cast<PHINode>(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); @@ -2763,13 +2763,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Keeps the reordered operands to avoid code duplication. SmallVector<ValueList, 2> OperandsVec; - for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { ValueList Operands; // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock( - PH->getIncomingBlock(I))); - TE->setOperand(I, Operands); + for (Value *V : VL) + Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); + TE->setOperand(I, Operands); OperandsVec.push_back(Operands); } for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) @@ -2803,9 +2803,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; + ReuseShuffleIndicies, CurrentOrder); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -2884,21 +2884,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Need to reorder. TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } - // Vectorizing non-consecutive loads with `llvm.masked.gather`. - TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(PointerOps, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); - return; + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + return; } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); @@ -3033,8 +3033,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
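// [Editor's sketch] The load-bundle handling above distinguishes three
// outcomes: consecutive pointers become one wide load, consecutive-but-
// jumbled pointers become a wide load plus a reordering shuffle, and other
// bundles are now emitted as a masked gather (TreeEntry::ScatterVectorize)
// rather than falling back to gathering scalars. classifyLoadBundle below
// restates that decision over raw byte addresses; it is not the SLP code and
// ignores the additional legality checks the real pass performs.
#include <algorithm>
#include <cstdint>
#include <vector>

enum class LoadBundleKind { WideLoad, WideLoadThenShuffle, MaskedGather };

LoadBundleKind classifyLoadBundle(const std::vector<uint64_t> &Addrs,
                                  uint64_t EltSize) {
  std::vector<uint64_t> Sorted(Addrs);
  std::sort(Sorted.begin(), Sorted.end());
  // Consecutive means the sorted addresses advance by exactly one element.
  for (size_t I = 1; I < Sorted.size(); ++I)
    if (Sorted[I] - Sorted[I - 1] != EltSize)
      return LoadBundleKind::MaskedGather;
  // Already in program order? Then a single wide load suffices; otherwise a
  // reordering shuffle is needed on top of the wide load.
  return std::is_sorted(Addrs.begin(), Addrs.end())
             ? LoadBundleKind::WideLoad
             : LoadBundleKind::WideLoadThenShuffle;
}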
- for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -3102,16 +3102,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); - // Avoid types that are padded when being allocated as scalars, while - // being packed together in a vector (such as i1). - if (DL->getTypeSizeInBits(ScalarTy) != - DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); - return; - } + // Avoid types that are padded when being allocated as scalars, while + // being packed together in a vector (such as i1). + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); + return; + } // Make sure all stores in the bundle are simple - we can't vectorize // atomic or volatile stores. SmallVector<Value *, 4> PointerOps(VL.size()); @@ -3163,12 +3163,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } else { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3187,7 +3187,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); VFShape Shape = VFShape::get( - *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), + *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); @@ -3324,7 +3324,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { N *= AT->getNumElements(); EltTy = AT->getElementType(); } else { - auto *VT = cast<FixedVectorType>(EltTy); + auto *VT = cast<FixedVectorType>(EltTy); N *= VT->getNumElements(); EltTy = VT->getElementType(); } @@ -3362,7 +3362,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) return false; } else { - NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); + NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); } if (NElts != VL.size()) @@ -3406,26 +3406,26 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, } bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { - return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) { + return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) { return ScalarToTreeEntry.count(U) > 0; }); } -static std::pair<InstructionCost, InstructionCost> -getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, - TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { +static 
std::pair<InstructionCost, InstructionCost> +getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, + TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. - IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount()); - auto IntrinsicCost = + IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount()); + auto IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); - auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), - false /*HasGlobalPred*/); + auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( + VecTy->getNumElements())), + false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); - auto LibCost = IntrinsicCost; + auto LibCost = IntrinsicCost; if (!CI->isNoBuiltin() && VecFunc) { // Calculate the cost of the vector library call. SmallVector<Type *, 4> VecTys; @@ -3440,7 +3440,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } -InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { +InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef<Value*> VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); @@ -3459,7 +3459,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - InstructionCost ReuseShuffleCost = 0; + InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); @@ -3475,8 +3475,8 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { allSameType(VL) && allSameBlock(VL)) { Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { - InstructionCost Cost = - TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); + InstructionCost Cost = + TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); for (auto *V : VL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this @@ -3495,9 +3495,9 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } - assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && - "Unhandled state"); + assert((E->State == TreeEntry::Vectorize || + E->State == TreeEntry::ScatterVectorize) && + "Unhandled state"); assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = @@ -3536,37 +3536,37 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - InstructionCost DeadCost = ReuseShuffleCost; + InstructionCost DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. DeadCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - for (unsigned I = 0, E = VL.size(); I < E; ++I) { - Instruction *EI = cast<Instruction>(VL[I]); + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + Instruction *EI = cast<Instruction>(VL[I]); // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. 
- if (areAllUsersVectorized(EI)) { + if (areAllUsersVectorized(EI)) { // Take credit for instruction that will become dead. - if (EI->hasOneUse()) { - Instruction *Ext = EI->user_back(); + if (EI->hasOneUse()) { + Instruction *Ext = EI->user_back(); if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && all_of(Ext->users(), [](User *U) { return isa<GetElementPtrInst>(U); })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. DeadCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, I); + Ext->getOpcode(), Ext->getType(), VecTy, I); // Add back the cost of s|zext which is subtracted separately. DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); + Ext->getOpcode(), Ext->getType(), EI->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); continue; } } DeadCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } return DeadCost; @@ -3584,78 +3584,78 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - InstructionCost ScalarEltCost = - TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, - TTI::getCastContextHint(VL0), CostKind, VL0); + InstructionCost ScalarEltCost = + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, + TTI::getCastContextHint(VL0), CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. - InstructionCost ScalarCost = VL.size() * ScalarEltCost; + InstructionCost ScalarCost = VL.size() * ScalarEltCost; auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); - InstructionCost VecCost = 0; + InstructionCost VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = - ReuseShuffleCost + - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); + VecCost = + ReuseShuffleCost + + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, + TTI::getCastContextHint(VL0), CostKind, VL0); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return VecCost - ScalarCost; } case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - InstructionCost ScalarEltCost = - TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), - CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); + InstructionCost ScalarEltCost = + TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), + CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - - // Check if all entries in VL are either compares or selects with compares - // as condition that have the same predicates. 
- CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; - bool First = true; - for (auto *V : VL) { - CmpInst::Predicate CurrentPred; - auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); - if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && - !match(V, MatchCmp)) || - (!First && VecPred != CurrentPred)) { - VecPred = CmpInst::BAD_ICMP_PREDICATE; - break; - } - First = false; - VecPred = CurrentPred; - } - - InstructionCost VecCost = TTI->getCmpSelInstrCost( - E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); - // Check if it is possible and profitable to use min/max for selects in - // VL. - // - auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); - if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { - IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, - {VecTy, VecTy}); - InstructionCost IntrinsicCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be dead - // and we can adjust the cost by removing their cost. - if (IntrinsicAndUse.second) - IntrinsicCost -= - TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); - VecCost = std::min(VecCost, IntrinsicCost); - } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; + + // Check if all entries in VL are either compares or selects with compares + // as condition that have the same predicates. + CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; + bool First = true; + for (auto *V : VL) { + CmpInst::Predicate CurrentPred; + auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); + if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && + !match(V, MatchCmp)) || + (!First && VecPred != CurrentPred)) { + VecPred = CmpInst::BAD_ICMP_PREDICATE; + break; + } + First = false; + VecPred = CurrentPred; + } + + InstructionCost VecCost = TTI->getCmpSelInstrCost( + E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); + // Check if it is possible and profitable to use min/max for selects in + // VL. + // + auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); + if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { + IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, + {VecTy, VecTy}); + InstructionCost IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + // If the selects are the only uses of the compares, they will be dead + // and we can adjust the cost by removing their cost. 
+ if (IntrinsicAndUse.second) + IntrinsicCost -= + TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + VecCost = std::min(VecCost, IntrinsicCost); + } + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -3715,17 +3715,17 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { } SmallVector<const Value *, 4> Operands(VL0->operand_values()); - InstructionCost ScalarEltCost = - TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, - Op2VK, Op1VP, Op2VP, Operands, VL0); + InstructionCost ScalarEltCost = + TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, + Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecCost = - TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, - Op2VK, Op1VP, Op2VP, Operands, VL0); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecCost = + TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, + Op2VK, Op1VP, Op2VP, Operands, VL0); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3734,42 +3734,42 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; - InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( - Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); + InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( + Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecCost = TTI->getArithmeticInstrCost( - Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecCost = TTI->getArithmeticInstrCost( + Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. 
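// [Editor's sketch] Each per-opcode case of getEntryCost, above and below,
// follows the same shape: cost one scalar element, multiply by the number of
// scalars to get ScalarCost, cost the single wide vector operation (plus any
// reuse/reorder shuffles), and return the signed delta
// ReuseShuffleCost + VecCost - ScalarCost, where a negative result means the
// vector form is cheaper. The toy numbers below only illustrate that formula;
// they are not taken from any real cost model.
#include <cstdio>

long long entryCostDelta(long long ScalarEltCost, unsigned NumScalars,
                         long long VecOpCost, long long ReuseShuffleCost) {
  long long ScalarCost = ScalarEltCost * (long long)NumScalars;
  return ReuseShuffleCost + VecOpCost - ScalarCost;
}

int main() {
  // Four scalar loads at cost 1 each vs. one vector load at cost 1 and no
  // extra shuffles: the delta is -3, i.e. vectorization looks profitable.
  std::printf("%lld\n", entryCostDelta(/*ScalarEltCost=*/1, /*NumScalars=*/4,
                                       /*VecOpCost=*/1,
                                       /*ReuseShuffleCost=*/0));
  return 0;
}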
Align alignment = cast<LoadInst>(VL0)->getAlign(); - InstructionCost ScalarEltCost = TTI->getMemoryOpCost( - Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0); + InstructionCost ScalarEltCost = TTI->getMemoryOpCost( + Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecLdCost; - if (E->State == TreeEntry::Vectorize) { - VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, - CostKind, VL0); - } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); - VecLdCost = TTI->getGatherScatterOpCost( - Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), - /*VariableMask=*/false, alignment, CostKind, VL0); - } + InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecLdCost; + if (E->State == TreeEntry::Vectorize) { + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + CostKind, VL0); + } else { + assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); + VecLdCost = TTI->getGatherScatterOpCost( + Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), + /*VariableMask=*/false, alignment, CostKind, VL0); + } if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); return ReuseShuffleCost + VecLdCost - ScalarLdCost; } case Instruction::Store: { @@ -3778,19 +3778,19 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { auto *SI = cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); Align Alignment = SI->getAlign(); - InstructionCost ScalarEltCost = TTI->getMemoryOpCost( - Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); + InstructionCost ScalarEltCost = TTI->getMemoryOpCost( + Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecStCost = TTI->getMemoryOpCost( - Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); + InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecStCost = TTI->getMemoryOpCost( + Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); if (IsReorder) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecStCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3798,17 +3798,17 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. 
- IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1); - InstructionCost ScalarEltCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1); + InstructionCost ScalarEltCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); - InstructionCost VecCallCost = - std::min(VecCallCosts.first, VecCallCosts.second); + InstructionCost VecCallCost = + std::min(VecCallCosts.first, VecCallCosts.second); LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" @@ -3823,7 +3823,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - InstructionCost ScalarCost = 0; + InstructionCost ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { Instruction *I = cast<Instruction>(VL[Idx]); @@ -3841,7 +3841,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. - InstructionCost VecCost = 0; + InstructionCost VecCost = 0; if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, @@ -3852,12 +3852,12 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size()); auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, - TTI::CastContextHint::None, CostKind); + TTI::CastContextHint::None, CostKind); VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, - TTI::CastContextHint::None, CostKind); + TTI::CastContextHint::None, CostKind); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } default: @@ -3895,13 +3895,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI) { // Look past the root to find a source value. Arbitrarily follow the // path through operand 0 of any 'or'. Also, peek through optional - // shift-left-by-multiple-of-8-bits. + // shift-left-by-multiple-of-8-bits. Value *ZextLoad = Root; - const APInt *ShAmtC; + const APInt *ShAmtC; while (!isa<ConstantExpr>(ZextLoad) && (match(ZextLoad, m_Or(m_Value(), m_Value())) || - (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && - ShAmtC->urem(8) == 0))) + (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && + ShAmtC->urem(8) == 0))) ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); // Check if the input is an extended load of the required or/shift expression. 
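// [Editor's sketch] isLoadCombineCandidateImpl above walks up through 'or'
// and shift-left-by-a-multiple-of-8 operations looking for a zero-extended
// load: that is exactly the shape of manual byte-assembly code, which a
// backend with load combining can fold into one wide load. The function
// below is an ordinary C++ example of such a pattern (little-endian byte
// order); it is not SLP code, just the kind of IR the check recognizes.
#include <cstdint>

uint32_t loadLE32(const uint8_t *P) {
  // Four byte loads, zero-extended, shifted by multiples of 8, and or'd
  // together: a load-combining backend turns this into a single 32-bit load.
  return (uint32_t)P[0] | ((uint32_t)P[1] << 8) | ((uint32_t)P[2] << 16) |
         ((uint32_t)P[3] << 24);
}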
@@ -3925,8 +3925,8 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, return true; } -bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { - if (RdxKind != RecurKind::Or) +bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { + if (RdxKind != RecurKind::Or) return false; unsigned NumElts = VectorizableTree[0]->Scalars.size(); @@ -3967,35 +3967,35 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { return true; } -InstructionCost BoUpSLP::getSpillCost() const { +InstructionCost BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); - InstructionCost Cost = 0; + InstructionCost Cost = 0; SmallPtrSet<Instruction*, 4> LiveValues; Instruction *PrevInst = nullptr; - // The entries in VectorizableTree are not necessarily ordered by their - // position in basic blocks. Collect them and order them by dominance so later - // instructions are guaranteed to be visited first. For instructions in - // different basic blocks, we only scan to the beginning of the block, so - // their order does not matter, as long as all instructions in a basic block - // are grouped together. Using dominance ensures a deterministic order. - SmallVector<Instruction *, 16> OrderedScalars; + // The entries in VectorizableTree are not necessarily ordered by their + // position in basic blocks. Collect them and order them by dominance so later + // instructions are guaranteed to be visited first. For instructions in + // different basic blocks, we only scan to the beginning of the block, so + // their order does not matter, as long as all instructions in a basic block + // are grouped together. Using dominance ensures a deterministic order. 
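// [Editor's sketch] The comment above explains why getSpillCost first orders
// the tree's scalars by dominance, so that its bottom-up walk visits later
// instructions first and stays deterministic; the llvm::stable_sort that
// implements it follows just below. In the standalone sketch here an integer
// program position stands in for the dominance relation; only the sorting
// idea is illustrated, not the LLVM data structures.
#include <algorithm>
#include <vector>

struct SketchInst {
  int Position; // stand-in for "comes earlier in the function"
};

void orderForBottomUpWalk(std::vector<SketchInst> &Scalars) {
  // "B dominates A" becomes "B comes before A": sort descending by position,
  // keeping the original relative order of ties (stable).
  std::stable_sort(Scalars.begin(), Scalars.end(),
                   [](const SketchInst &A, const SketchInst &B) {
                     return B.Position < A.Position;
                   });
}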
+ SmallVector<Instruction *, 16> OrderedScalars; for (const auto &TEPtr : VectorizableTree) { Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); if (!Inst) continue; - OrderedScalars.push_back(Inst); - } - llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) { - return DT->dominates(B, A); - }); + OrderedScalars.push_back(Inst); + } + llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) { + return DT->dominates(B, A); + }); - for (Instruction *Inst : OrderedScalars) { + for (Instruction *Inst : OrderedScalars) { if (!PrevInst) { PrevInst = Inst; continue; @@ -4049,8 +4049,8 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } -InstructionCost BoUpSLP::getTreeCost() { - InstructionCost Cost = 0; +InstructionCost BoUpSLP::getTreeCost() { + InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -4080,16 +4080,16 @@ InstructionCost BoUpSLP::getTreeCost() { })) continue; - InstructionCost C = getEntryCost(&TE); - Cost += C; + InstructionCost C = getEntryCost(&TE); + Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); + << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); } SmallPtrSet<Value *, 16> ExtractCostCalculated; - InstructionCost ExtractCost = 0; + InstructionCost ExtractCost = 0; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!ExtractCostCalculated.insert(EU.Scalar).second) @@ -4119,13 +4119,13 @@ InstructionCost BoUpSLP::getTreeCost() { } } - InstructionCost SpillCost = getSpillCost(); + InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; -#ifndef NDEBUG - SmallString<256> Str; +#ifndef NDEBUG + SmallString<256> Str; { - raw_svector_ostream OS(Str); + raw_svector_ostream OS(Str); OS << "SLP: Spill Cost = " << SpillCost << ".\n" << "SLP: Extract Cost = " << ExtractCost << ".\n" << "SLP: Total Cost = " << Cost << ".\n"; @@ -4133,28 +4133,28 @@ InstructionCost BoUpSLP::getTreeCost() { LLVM_DEBUG(dbgs() << Str); if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); -#endif +#endif return Cost; } -InstructionCost -BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const { +InstructionCost +BoUpSLP::getGatherCost(FixedVectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices) const { unsigned NumElts = Ty->getNumElements(); APInt DemandedElts = APInt::getNullValue(NumElts); - for (unsigned I = 0; I < NumElts; ++I) - if (!ShuffledIndices.count(I)) - DemandedElts.setBit(I); - InstructionCost Cost = - TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, - /*Extract*/ false); + for (unsigned I = 0; I < NumElts; ++I) + if (!ShuffledIndices.count(I)) + DemandedElts.setBit(I); + InstructionCost Cost = + TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, + /*Extract*/ false); if (!ShuffledIndices.empty()) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } -InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { +InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -4196,10 +4196,10 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { // should be in this block. 
auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { - auto *I = cast<Instruction>(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { + auto *I = cast<Instruction>(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -4252,30 +4252,30 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::gather(ArrayRef<Value *> VL) { - Value *Val0 = - isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; - FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); - Value *Vec = PoisonValue::get(VecTy); - unsigned InsIndex = 0; - for (Value *Val : VL) { - Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++)); - auto *InsElt = dyn_cast<InsertElementInst>(Vec); - if (!InsElt) - continue; - GatherSeq.insert(InsElt); - CSEBlocks.insert(InsElt->getParent()); - // Add to our 'need-to-extract' list. - if (TreeEntry *Entry = getTreeEntry(Val)) { - // Find which lane we need to extract. - unsigned FoundLane = std::distance(Entry->Scalars.begin(), - find(Entry->Scalars, Val)); - assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane"); - if (!Entry->ReuseShuffleIndices.empty()) { - FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), - find(Entry->ReuseShuffleIndices, FoundLane)); - } - ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane)); +Value *BoUpSLP::gather(ArrayRef<Value *> VL) { + Value *Val0 = + isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; + FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); + Value *Vec = PoisonValue::get(VecTy); + unsigned InsIndex = 0; + for (Value *Val : VL) { + Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++)); + auto *InsElt = dyn_cast<InsertElementInst>(Vec); + if (!InsElt) + continue; + GatherSeq.insert(InsElt); + CSEBlocks.insert(InsElt->getParent()); + // Add to our 'need-to-extract' list. + if (TreeEntry *Entry = getTreeEntry(Val)) { + // Find which lane we need to extract. 
+ unsigned FoundLane = std::distance(Entry->Scalars.begin(), + find(Entry->Scalars, Val)); + assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane"); + if (!Entry->ReuseShuffleIndices.empty()) { + FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), + find(Entry->ReuseShuffleIndices, FoundLane)); + } + ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane)); } } @@ -4299,7 +4299,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { for (int Idx : E->ReuseShuffleIndices) if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UniqueIdxs); + V = Builder.CreateShuffleVector(V, UniqueIdxs); } } return V; @@ -4327,15 +4327,15 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { VL = UniqueValues; } - Value *Vec = gather(VL); + Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { - Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle"); - if (auto *I = dyn_cast<Instruction>(Vec)) { + Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); } } - return Vec; + return Vec; } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { @@ -4349,28 +4349,28 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); - Value *Vec = gather(E->Scalars); + Value *Vec = gather(E->Scalars); if (NeedToShuffleReuses) { - Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast<Instruction>(Vec)) { + Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); } } - E->VectorizedValue = Vec; - return Vec; + E->VectorizedValue = Vec; + return Vec; } - assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && - "Unhandled state"); + assert((E->State == TreeEntry::Vectorize || + E->State == TreeEntry::ScatterVectorize) && + "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); - Instruction *VL0 = E->getMainOp(); - Type *ScalarTy = VL0->getType(); - if (auto *Store = dyn_cast<StoreInst>(VL0)) - ScalarTy = Store->getValueOperand()->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); + Instruction *VL0 = E->getMainOp(); + Type *ScalarTy = VL0->getType(); + if (auto *Store = dyn_cast<StoreInst>(VL0)) + ScalarTy = Store->getValueOperand()->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { auto *PH = cast<PHINode>(VL0); @@ -4378,9 +4378,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; // PHINodes may have multiple entries from the same block. 
We want to @@ -4413,33 +4413,33 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); + V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. if (E->ReorderIndices.empty()) Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - auto *LI = cast<LoadInst>(E->getSingleOperand(0)); + auto *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); - auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); + auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); if (!E->ReorderIndices.empty()) { SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); - NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle"); + NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. - NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices, - "shuffle"); + NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices, + "shuffle"); } E->VectorizedValue = NewV; return NewV; @@ -4467,9 +4467,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4489,9 +4489,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4509,9 +4509,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateSelect(Cond, True, False); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4532,9 +4532,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4575,9 +4575,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, 
E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4592,40 +4592,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); - Instruction *NewLI; + Instruction *NewLI; unsigned AS = LI->getPointerAddressSpace(); - Value *PO = LI->getPointerOperand(); - if (E->State == TreeEntry::Vectorize) { - - Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); - - // The pointer operand uses an in-tree scalar so we add the new BitCast - // to ExternalUses list to make sure that an extract will be generated - // in the future. - if (getTreeEntry(PO)) - ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0); - - NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); - } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); - Value *VecPtr = vectorizeTree(E->getOperand(0)); - // Use the minimum alignment of the gathered loads. - Align CommonAlignment = LI->getAlign(); - for (Value *V : E->Scalars) - CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); - NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment); - } - Value *V = propagateMetadata(NewLI, E->Scalars); - + Value *PO = LI->getPointerOperand(); + if (E->State == TreeEntry::Vectorize) { + + Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); + + // The pointer operand uses an in-tree scalar so we add the new BitCast + // to ExternalUses list to make sure that an extract will be generated + // in the future. + if (getTreeEntry(PO)) + ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0); + + NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); + } else { + assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); + Value *VecPtr = vectorizeTree(E->getOperand(0)); + // Use the minimum alignment of the gathered loads. + Align CommonAlignment = LI->getAlign(); + for (Value *V : E->Scalars) + CommonAlignment = + commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment); + } + Value *V = propagateMetadata(NewLI, E->Scalars); + if (IsReorder) { SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); - V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); + V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. 
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; @@ -4643,7 +4643,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (IsReorder) { SmallVector<int, 4> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); - VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf"); + VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf"); } Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( @@ -4658,9 +4658,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); Value *V = propagateMetadata(ST, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4697,9 +4697,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4739,10 +4739,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Function *CF; if (!UseIntrinsic) { - VFShape Shape = - VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), - false /*HasGlobalPred*/); + VFShape Shape = + VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( + VecTy->getNumElements())), + false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())}; @@ -4760,9 +4760,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4827,9 +4827,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4894,8 +4894,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(E->State != TreeEntry::NeedToGather && - "Extracting from a gather list"); + assert(E->State != TreeEntry::NeedToGather && + "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -5053,8 +5053,8 @@ void BoUpSLP::optimizeGatherSequence() { // instructions into different buckets based on the insert lane. 
SmallVector<Instruction *, 16> Visited; for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { - assert(*I && - (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && + assert(*I && + (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"); BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: @@ -5164,7 +5164,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // cancelScheduling). while (!Bundle->isReady() && !ReadyInsts.empty()) { - ScheduleData *pickedSD = ReadyInsts.pop_back_val(); + ScheduleData *pickedSD = ReadyInsts.pop_back_val(); if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { schedule(pickedSD, ReadyInsts); @@ -5308,9 +5308,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, if (I->mayReadOrWriteMemory() && (!isa<IntrinsicInst>(I) || - (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect && - cast<IntrinsicInst>(I)->getIntrinsicID() != - Intrinsic::pseudoprobe))) { + (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect && + cast<IntrinsicInst>(I)->getIntrinsicID() != + Intrinsic::pseudoprobe))) { // Update the linked list of memory accessing instructions. if (CurrentLoadStore) { CurrentLoadStore->NextLoadStore = SD; @@ -5337,7 +5337,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, WorkList.push_back(SD); while (!WorkList.empty()) { - ScheduleData *SD = WorkList.pop_back_val(); + ScheduleData *SD = WorkList.pop_back_val(); ScheduleData *BundleMember = SD; while (BundleMember) { @@ -5534,15 +5534,15 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { } unsigned BoUpSLP::getVectorElementSize(Value *V) { - // If V is a store, just return the width of the stored value (or value - // truncated just before storing) without traversing the expression tree. - // This is the common case. - if (auto *Store = dyn_cast<StoreInst>(V)) { - if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) - return DL->getTypeSizeInBits(Trunc->getSrcTy()); - else - return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); - } + // If V is a store, just return the width of the stored value (or value + // truncated just before storing) without traversing the expression tree. + // This is the common case. 
+ if (auto *Store = dyn_cast<StoreInst>(V)) { + if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) + return DL->getTypeSizeInBits(Trunc->getSrcTy()); + else + return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + } auto E = InstrElementSize.find(V); if (E != InstrElementSize.end()) @@ -5891,7 +5891,7 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AAResults *AA_, + TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { @@ -5991,11 +5991,11 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(); - LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); using namespace ore; @@ -6068,7 +6068,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, // If a vector register can't hold 1 element, we are done. unsigned MaxVecRegSize = R.getMaxVecRegSize(); - unsigned EltSize = R.getVectorElementSize(Operands[0]); + unsigned EltSize = R.getVectorElementSize(Operands[0]); if (MaxVecRegSize % EltSize != 0) continue; @@ -6119,7 +6119,7 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { continue; if (!isValidElementType(SI->getValueOperand()->getType())) continue; - Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); + Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); } // Ignore getelementptr instructions that have more than one index, a @@ -6183,7 +6183,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); - MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); + MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) @@ -6195,7 +6195,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool Changed = false; bool CandidateFound = false; - InstructionCost MinCost = SLPCostThreshold.getValue(); + InstructionCost MinCost = SLPCostThreshold.getValue(); bool CompensateUseCost = !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) { @@ -6251,7 +6251,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, continue; R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(); CandidateFound = true; if (CompensateUseCost) { // TODO: Use TTI's getScalarizationOverhead for sequence of inserts @@ -6261,7 +6261,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // part should also switch to same interface. 
// For example, the following case is projected code after SLP: // %4 = extractelement <4 x i64> %3, i32 0 - // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0 + // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0 // %5 = extractelement <4 x i64> %3, i32 1 // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1 // %6 = extractelement <4 x i64> %3, i32 2 @@ -6281,7 +6281,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // Switching to the TTI interface might help a bit. // Alternative solution could be pattern-match to detect a no-op or // shuffle. - InstructionCost UserCost = 0; + InstructionCost UserCost = 0; for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]); if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) @@ -6376,16 +6376,16 @@ namespace { /// Model horizontal reductions. /// -/// A horizontal reduction is a tree of reduction instructions that has values -/// that can be put into a vector as its leaves. For example: +/// A horizontal reduction is a tree of reduction instructions that has values +/// that can be put into a vector as its leaves. For example: /// /// mul mul mul mul /// \ / \ / /// + + /// \ / /// + -/// This tree has "mul" as its leaf values and "+" as its reduction -/// instructions. A reduction can feed into a store or a binary operation +/// This tree has "mul" as its leaf values and "+" as its reduction +/// instructions. A reduction can feed into a store or a binary operation /// feeding a phi. /// ... /// \ / @@ -6403,345 +6403,345 @@ namespace { class HorizontalReduction { using ReductionOpsType = SmallVector<Value *, 16>; using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; - ReductionOpsListType ReductionOps; + ReductionOpsListType ReductionOps; SmallVector<Value *, 32> ReducedVals; // Use map vector to make stable output. MapVector<Instruction *, Value *> ExtraArgs; - WeakTrackingVH ReductionRoot; - /// The type of reduction operation. - RecurKind RdxKind; - - /// Checks if instruction is associative and can be vectorized. - static bool isVectorizable(RecurKind Kind, Instruction *I) { - if (Kind == RecurKind::None) - return false; - if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind)) - return true; - - if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { - // FP min/max are associative except for NaN and -0.0. We do not - // have to rule out -0.0 here because the intrinsic semantics do not - // specify a fixed result for it. - return I->getFastMathFlags().noNaNs(); - } - - return I->isAssociative(); - } - - /// Checks if the ParentStackElem.first should be marked as a reduction - /// operation with an extra argument or as extra argument itself. - void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, - Value *ExtraArg) { - if (ExtraArgs.count(ParentStackElem.first)) { - ExtraArgs[ParentStackElem.first] = nullptr; - // We ran into something like: - // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. - // The whole ParentStackElem.first should be considered as an extra value - // in this case. - // Do not perform analysis of remaining operands of ParentStackElem.first - // instruction, this whole instruction is an extra argument. - RecurKind ParentRdxKind = getRdxKind(ParentStackElem.first); - ParentStackElem.second = getNumberOfOperands(ParentRdxKind); - } else { - // We ran into something like: - // ParentStackElem.first += ... + ExtraArg + ... 
- ExtraArgs[ParentStackElem.first] = ExtraArg; - } - } - - /// Creates reduction operation with the current opcode. - static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, - Value *RHS, const Twine &Name) { - unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); - switch (Kind) { - case RecurKind::Add: - case RecurKind::Mul: - case RecurKind::Or: - case RecurKind::And: - case RecurKind::Xor: - case RecurKind::FAdd: - case RecurKind::FMul: - return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, - Name); - case RecurKind::FMax: - return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); - case RecurKind::FMin: - return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); - - case RecurKind::SMax: { - Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - case RecurKind::SMin: { - Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - case RecurKind::UMax: { - Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - case RecurKind::UMin: { - Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - default: - llvm_unreachable("Unknown reduction operation."); - } - } - - /// Creates reduction operation with the current opcode with the IR flags - /// from \p ReductionOps. - static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, - const ReductionOpsListType &ReductionOps) { - Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); - if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { - if (auto *Sel = dyn_cast<SelectInst>(Op)) - propagateIRFlags(Sel->getCondition(), ReductionOps[0]); - propagateIRFlags(Op, ReductionOps[1]); - return Op; - } - propagateIRFlags(Op, ReductionOps[0]); - return Op; - } - /// Creates reduction operation with the current opcode with the IR flags - /// from \p I. 
- static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, Instruction *I) { - Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); - if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { - if (auto *Sel = dyn_cast<SelectInst>(Op)) { - propagateIRFlags(Sel->getCondition(), - cast<SelectInst>(I)->getCondition()); - } - } - propagateIRFlags(Op, I); - return Op; - } - - static RecurKind getRdxKind(Instruction *I) { - assert(I && "Expected instruction for reduction matching"); - TargetTransformInfo::ReductionFlags RdxFlags; - if (match(I, m_Add(m_Value(), m_Value()))) - return RecurKind::Add; - if (match(I, m_Mul(m_Value(), m_Value()))) - return RecurKind::Mul; - if (match(I, m_And(m_Value(), m_Value()))) - return RecurKind::And; - if (match(I, m_Or(m_Value(), m_Value()))) - return RecurKind::Or; - if (match(I, m_Xor(m_Value(), m_Value()))) - return RecurKind::Xor; - if (match(I, m_FAdd(m_Value(), m_Value()))) - return RecurKind::FAdd; - if (match(I, m_FMul(m_Value(), m_Value()))) - return RecurKind::FMul; - - if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) - return RecurKind::FMax; - if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) - return RecurKind::FMin; - - if (match(I, m_SMax(m_Value(), m_Value()))) - return RecurKind::SMax; - if (match(I, m_SMin(m_Value(), m_Value()))) - return RecurKind::SMin; - if (match(I, m_UMax(m_Value(), m_Value()))) - return RecurKind::UMax; - if (match(I, m_UMin(m_Value(), m_Value()))) - return RecurKind::UMin; - - if (auto *Select = dyn_cast<SelectInst>(I)) { - // Try harder: look for min/max pattern based on instructions producing - // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). - // During the intermediate stages of SLP, it's very common to have - // pattern like this (since optimizeGatherSequence is run only once - // at the end): - // %1 = extractelement <2 x i32> %a, i32 0 - // %2 = extractelement <2 x i32> %a, i32 1 - // %cond = icmp sgt i32 %1, %2 - // %3 = extractelement <2 x i32> %a, i32 0 - // %4 = extractelement <2 x i32> %a, i32 1 - // %select = select i1 %cond, i32 %3, i32 %4 - CmpInst::Predicate Pred; - Instruction *L1; - Instruction *L2; - - Value *LHS = Select->getTrueValue(); - Value *RHS = Select->getFalseValue(); - Value *Cond = Select->getCondition(); - - // TODO: Support inverse predicates. 
- if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { - if (!isa<ExtractElementInst>(RHS) || - !L2->isIdenticalTo(cast<Instruction>(RHS))) - return RecurKind::None; - } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { - if (!isa<ExtractElementInst>(LHS) || - !L1->isIdenticalTo(cast<Instruction>(LHS))) - return RecurKind::None; - } else { - if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) - return RecurKind::None; - if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || - !L1->isIdenticalTo(cast<Instruction>(LHS)) || - !L2->isIdenticalTo(cast<Instruction>(RHS))) - return RecurKind::None; - } - - TargetTransformInfo::ReductionFlags RdxFlags; - switch (Pred) { - default: - return RecurKind::None; - case CmpInst::ICMP_SGT: - case CmpInst::ICMP_SGE: - return RecurKind::SMax; - case CmpInst::ICMP_SLT: - case CmpInst::ICMP_SLE: - return RecurKind::SMin; - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_UGE: - return RecurKind::UMax; - case CmpInst::ICMP_ULT: - case CmpInst::ICMP_ULE: - return RecurKind::UMin; - } - } - return RecurKind::None; - } - - /// Return true if this operation is a cmp+select idiom. - static bool isCmpSel(RecurKind Kind) { - return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind); - } - - /// Get the index of the first operand. - static unsigned getFirstOperandIndex(RecurKind Kind) { - // We allow calling this before 'Kind' is set, so handle that specially. - if (Kind == RecurKind::None) - return 0; - return isCmpSel(Kind) ? 1 : 0; - } - - /// Total number of operands in the reduction operation. - static unsigned getNumberOfOperands(RecurKind Kind) { - return isCmpSel(Kind) ? 3 : 2; - } - - /// Checks if the instruction is in basic block \p BB. - /// For a min/max reduction check that both compare and select are in \p BB. - static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB, - bool IsRedOp) { - if (IsRedOp && isCmpSel(Kind)) { - auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); - return I->getParent() == BB && Cmp && Cmp->getParent() == BB; - } - return I->getParent() == BB; - } - - /// Expected number of uses for reduction operations/reduced values. - static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I, - bool IsReductionOp) { - // SelectInst must be used twice while the condition op must have single - // use only. - if (isCmpSel(Kind)) - return I->hasNUses(2) && - (!IsReductionOp || - cast<SelectInst>(I)->getCondition()->hasOneUse()); - - // Arithmetic reduction operation must be used once only. - return I->hasOneUse(); - } - - /// Initializes the list of reduction operations. - void initReductionOps(RecurKind Kind) { - if (isCmpSel(Kind)) - ReductionOps.assign(2, ReductionOpsType()); - else - ReductionOps.assign(1, ReductionOpsType()); - } - - /// Add all reduction operations for the reduction instruction \p I. 
- void addReductionOps(RecurKind Kind, Instruction *I) { - assert(Kind != RecurKind::None && "Expected reduction operation."); - if (isCmpSel(Kind)) { - ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); - ReductionOps[1].emplace_back(I); - } else { - ReductionOps[0].emplace_back(I); - } - } - - static Value *getLHS(RecurKind Kind, Instruction *I) { - if (Kind == RecurKind::None) - return nullptr; - return I->getOperand(getFirstOperandIndex(Kind)); - } - static Value *getRHS(RecurKind Kind, Instruction *I) { - if (Kind == RecurKind::None) - return nullptr; - return I->getOperand(getFirstOperandIndex(Kind) + 1); - } - + WeakTrackingVH ReductionRoot; + /// The type of reduction operation. + RecurKind RdxKind; + + /// Checks if instruction is associative and can be vectorized. + static bool isVectorizable(RecurKind Kind, Instruction *I) { + if (Kind == RecurKind::None) + return false; + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind)) + return true; + + if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { + // FP min/max are associative except for NaN and -0.0. We do not + // have to rule out -0.0 here because the intrinsic semantics do not + // specify a fixed result for it. + return I->getFastMathFlags().noNaNs(); + } + + return I->isAssociative(); + } + + /// Checks if the ParentStackElem.first should be marked as a reduction + /// operation with an extra argument or as extra argument itself. + void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, + Value *ExtraArg) { + if (ExtraArgs.count(ParentStackElem.first)) { + ExtraArgs[ParentStackElem.first] = nullptr; + // We ran into something like: + // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. + // The whole ParentStackElem.first should be considered as an extra value + // in this case. + // Do not perform analysis of remaining operands of ParentStackElem.first + // instruction, this whole instruction is an extra argument. + RecurKind ParentRdxKind = getRdxKind(ParentStackElem.first); + ParentStackElem.second = getNumberOfOperands(ParentRdxKind); + } else { + // We ran into something like: + // ParentStackElem.first += ... + ExtraArg + ... + ExtraArgs[ParentStackElem.first] = ExtraArg; + } + } + + /// Creates reduction operation with the current opcode. 
+ static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, + Value *RHS, const Twine &Name) { + unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); + switch (Kind) { + case RecurKind::Add: + case RecurKind::Mul: + case RecurKind::Or: + case RecurKind::And: + case RecurKind::Xor: + case RecurKind::FAdd: + case RecurKind::FMul: + return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, + Name); + case RecurKind::FMax: + return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); + case RecurKind::FMin: + return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); + + case RecurKind::SMax: { + Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + case RecurKind::SMin: { + Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + case RecurKind::UMax: { + Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + case RecurKind::UMin: { + Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + default: + llvm_unreachable("Unknown reduction operation."); + } + } + + /// Creates reduction operation with the current opcode with the IR flags + /// from \p ReductionOps. + static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, + Value *RHS, const Twine &Name, + const ReductionOpsListType &ReductionOps) { + Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { + if (auto *Sel = dyn_cast<SelectInst>(Op)) + propagateIRFlags(Sel->getCondition(), ReductionOps[0]); + propagateIRFlags(Op, ReductionOps[1]); + return Op; + } + propagateIRFlags(Op, ReductionOps[0]); + return Op; + } + /// Creates reduction operation with the current opcode with the IR flags + /// from \p I. 
+ static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, + Value *RHS, const Twine &Name, Instruction *I) { + Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { + if (auto *Sel = dyn_cast<SelectInst>(Op)) { + propagateIRFlags(Sel->getCondition(), + cast<SelectInst>(I)->getCondition()); + } + } + propagateIRFlags(Op, I); + return Op; + } + + static RecurKind getRdxKind(Instruction *I) { + assert(I && "Expected instruction for reduction matching"); + TargetTransformInfo::ReductionFlags RdxFlags; + if (match(I, m_Add(m_Value(), m_Value()))) + return RecurKind::Add; + if (match(I, m_Mul(m_Value(), m_Value()))) + return RecurKind::Mul; + if (match(I, m_And(m_Value(), m_Value()))) + return RecurKind::And; + if (match(I, m_Or(m_Value(), m_Value()))) + return RecurKind::Or; + if (match(I, m_Xor(m_Value(), m_Value()))) + return RecurKind::Xor; + if (match(I, m_FAdd(m_Value(), m_Value()))) + return RecurKind::FAdd; + if (match(I, m_FMul(m_Value(), m_Value()))) + return RecurKind::FMul; + + if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) + return RecurKind::FMax; + if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) + return RecurKind::FMin; + + if (match(I, m_SMax(m_Value(), m_Value()))) + return RecurKind::SMax; + if (match(I, m_SMin(m_Value(), m_Value()))) + return RecurKind::SMin; + if (match(I, m_UMax(m_Value(), m_Value()))) + return RecurKind::UMax; + if (match(I, m_UMin(m_Value(), m_Value()))) + return RecurKind::UMin; + + if (auto *Select = dyn_cast<SelectInst>(I)) { + // Try harder: look for min/max pattern based on instructions producing + // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). + // During the intermediate stages of SLP, it's very common to have + // pattern like this (since optimizeGatherSequence is run only once + // at the end): + // %1 = extractelement <2 x i32> %a, i32 0 + // %2 = extractelement <2 x i32> %a, i32 1 + // %cond = icmp sgt i32 %1, %2 + // %3 = extractelement <2 x i32> %a, i32 0 + // %4 = extractelement <2 x i32> %a, i32 1 + // %select = select i1 %cond, i32 %3, i32 %4 + CmpInst::Predicate Pred; + Instruction *L1; + Instruction *L2; + + Value *LHS = Select->getTrueValue(); + Value *RHS = Select->getFalseValue(); + Value *Cond = Select->getCondition(); + + // TODO: Support inverse predicates. 
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { + if (!isa<ExtractElementInst>(RHS) || + !L2->isIdenticalTo(cast<Instruction>(RHS))) + return RecurKind::None; + } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { + if (!isa<ExtractElementInst>(LHS) || + !L1->isIdenticalTo(cast<Instruction>(LHS))) + return RecurKind::None; + } else { + if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) + return RecurKind::None; + if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || + !L1->isIdenticalTo(cast<Instruction>(LHS)) || + !L2->isIdenticalTo(cast<Instruction>(RHS))) + return RecurKind::None; + } + + TargetTransformInfo::ReductionFlags RdxFlags; + switch (Pred) { + default: + return RecurKind::None; + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SGE: + return RecurKind::SMax; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SLE: + return RecurKind::SMin; + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + return RecurKind::UMax; + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + return RecurKind::UMin; + } + } + return RecurKind::None; + } + + /// Return true if this operation is a cmp+select idiom. + static bool isCmpSel(RecurKind Kind) { + return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind); + } + + /// Get the index of the first operand. + static unsigned getFirstOperandIndex(RecurKind Kind) { + // We allow calling this before 'Kind' is set, so handle that specially. + if (Kind == RecurKind::None) + return 0; + return isCmpSel(Kind) ? 1 : 0; + } + + /// Total number of operands in the reduction operation. + static unsigned getNumberOfOperands(RecurKind Kind) { + return isCmpSel(Kind) ? 3 : 2; + } + + /// Checks if the instruction is in basic block \p BB. + /// For a min/max reduction check that both compare and select are in \p BB. + static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB, + bool IsRedOp) { + if (IsRedOp && isCmpSel(Kind)) { + auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); + return I->getParent() == BB && Cmp && Cmp->getParent() == BB; + } + return I->getParent() == BB; + } + + /// Expected number of uses for reduction operations/reduced values. + static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I, + bool IsReductionOp) { + // SelectInst must be used twice while the condition op must have single + // use only. + if (isCmpSel(Kind)) + return I->hasNUses(2) && + (!IsReductionOp || + cast<SelectInst>(I)->getCondition()->hasOneUse()); + + // Arithmetic reduction operation must be used once only. + return I->hasOneUse(); + } + + /// Initializes the list of reduction operations. + void initReductionOps(RecurKind Kind) { + if (isCmpSel(Kind)) + ReductionOps.assign(2, ReductionOpsType()); + else + ReductionOps.assign(1, ReductionOpsType()); + } + + /// Add all reduction operations for the reduction instruction \p I. 
+ void addReductionOps(RecurKind Kind, Instruction *I) { + assert(Kind != RecurKind::None && "Expected reduction operation."); + if (isCmpSel(Kind)) { + ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); + ReductionOps[1].emplace_back(I); + } else { + ReductionOps[0].emplace_back(I); + } + } + + static Value *getLHS(RecurKind Kind, Instruction *I) { + if (Kind == RecurKind::None) + return nullptr; + return I->getOperand(getFirstOperandIndex(Kind)); + } + static Value *getRHS(RecurKind Kind, Instruction *I) { + if (Kind == RecurKind::None) + return nullptr; + return I->getOperand(getFirstOperandIndex(Kind) + 1); + } + public: HorizontalReduction() = default; /// Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, Instruction *B) { assert((!Phi || is_contained(Phi->operands(), B)) && - "Phi needs to use the binary operator"); + "Phi needs to use the binary operator"); - RdxKind = getRdxKind(B); + RdxKind = getRdxKind(B); // We could have a initial reductions that is not an add. // r *= v1 + v2 + v3 + v4 // In such a case start looking for a tree rooted in the first '+'. if (Phi) { - if (getLHS(RdxKind, B) == Phi) { + if (getLHS(RdxKind, B) == Phi) { Phi = nullptr; - B = dyn_cast<Instruction>(getRHS(RdxKind, B)); - if (!B) - return false; - RdxKind = getRdxKind(B); - } else if (getRHS(RdxKind, B) == Phi) { + B = dyn_cast<Instruction>(getRHS(RdxKind, B)); + if (!B) + return false; + RdxKind = getRdxKind(B); + } else if (getRHS(RdxKind, B) == Phi) { Phi = nullptr; - B = dyn_cast<Instruction>(getLHS(RdxKind, B)); - if (!B) - return false; - RdxKind = getRdxKind(B); + B = dyn_cast<Instruction>(getLHS(RdxKind, B)); + if (!B) + return false; + RdxKind = getRdxKind(B); } } - if (!isVectorizable(RdxKind, B)) + if (!isVectorizable(RdxKind, B)) return false; - // Analyze "regular" integer/FP types for reductions - no target-specific - // types or pointers. + // Analyze "regular" integer/FP types for reductions - no target-specific + // types or pointers. Type *Ty = B->getType(); - if (!isValidElementType(Ty) || Ty->isPointerTy()) + if (!isValidElementType(Ty) || Ty->isPointerTy()) return false; ReductionRoot = B; - // The opcode for leaf values that we perform a reduction on. - // For example: load(x) + load(y) + load(z) + fptoui(w) - // The leaf opcode for 'w' does not match, so we don't include it as a - // potential candidate for the reduction. - unsigned LeafOpcode = 0; - + // The opcode for leaf values that we perform a reduction on. + // For example: load(x) + load(y) + load(z) + fptoui(w) + // The leaf opcode for 'w' does not match, so we don't include it as a + // potential candidate for the reduction. + unsigned LeafOpcode = 0; + // Post order traverse the reduction tree starting at B. We only handle true // trees containing only binary operators. SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; - Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind))); - initReductionOps(RdxKind); + Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind))); + initReductionOps(RdxKind); while (!Stack.empty()) { Instruction *TreeN = Stack.back().first; - unsigned EdgeToVisit = Stack.back().second++; - const RecurKind TreeRdxKind = getRdxKind(TreeN); - bool IsReducedValue = TreeRdxKind != RdxKind; + unsigned EdgeToVisit = Stack.back().second++; + const RecurKind TreeRdxKind = getRdxKind(TreeN); + bool IsReducedValue = TreeRdxKind != RdxKind; - // Postorder visit. 
- if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) { + // Postorder visit. + if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) { if (IsReducedValue) ReducedVals.push_back(TreeN); else { @@ -6759,7 +6759,7 @@ public: markExtraArg(Stack[Stack.size() - 2], TreeN); ExtraArgs.erase(TreeN); } else - addReductionOps(RdxKind, TreeN); + addReductionOps(RdxKind, TreeN); } // Retract. Stack.pop_back(); @@ -6767,72 +6767,72 @@ public: } // Visit left or right. - Value *EdgeVal = TreeN->getOperand(EdgeToVisit); - auto *I = dyn_cast<Instruction>(EdgeVal); - if (!I) { - // Edge value is not a reduction instruction or a leaf instruction. - // (It may be a constant, function argument, or something else.) - markExtraArg(Stack.back(), EdgeVal); - continue; - } - RecurKind EdgeRdxKind = getRdxKind(I); - // Continue analysis if the next operand is a reduction operation or - // (possibly) a leaf value. If the leaf value opcode is not set, - // the first met operation != reduction operation is considered as the - // leaf opcode. - // Only handle trees in the current basic block. - // Each tree node needs to have minimal number of users except for the - // ultimate reduction. - const bool IsRdxInst = EdgeRdxKind == RdxKind; - if (I != Phi && I != B && - hasSameParent(RdxKind, I, B->getParent(), IsRdxInst) && - hasRequiredNumberOfUses(RdxKind, I, IsRdxInst) && - (!LeafOpcode || LeafOpcode == I->getOpcode() || IsRdxInst)) { - if (IsRdxInst) { - // We need to be able to reassociate the reduction operations. - if (!isVectorizable(EdgeRdxKind, I)) { + Value *EdgeVal = TreeN->getOperand(EdgeToVisit); + auto *I = dyn_cast<Instruction>(EdgeVal); + if (!I) { + // Edge value is not a reduction instruction or a leaf instruction. + // (It may be a constant, function argument, or something else.) + markExtraArg(Stack.back(), EdgeVal); + continue; + } + RecurKind EdgeRdxKind = getRdxKind(I); + // Continue analysis if the next operand is a reduction operation or + // (possibly) a leaf value. If the leaf value opcode is not set, + // the first met operation != reduction operation is considered as the + // leaf opcode. + // Only handle trees in the current basic block. + // Each tree node needs to have minimal number of users except for the + // ultimate reduction. + const bool IsRdxInst = EdgeRdxKind == RdxKind; + if (I != Phi && I != B && + hasSameParent(RdxKind, I, B->getParent(), IsRdxInst) && + hasRequiredNumberOfUses(RdxKind, I, IsRdxInst) && + (!LeafOpcode || LeafOpcode == I->getOpcode() || IsRdxInst)) { + if (IsRdxInst) { + // We need to be able to reassociate the reduction operations. + if (!isVectorizable(EdgeRdxKind, I)) { // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } - } else if (!LeafOpcode) { - LeafOpcode = I->getOpcode(); + } else if (!LeafOpcode) { + LeafOpcode = I->getOpcode(); } - Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind))); - continue; + Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind))); + continue; } - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), I); + // I is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), I); } return true; } - /// Attempt to vectorize the tree found by matchAssociativeReduction. + /// Attempt to vectorize the tree found by matchAssociativeReduction. 
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { - // If there are a sufficient number of reduction values, reduce - // to a nearby power-of-2. We can safely generate oversized + // If there are a sufficient number of reduction values, reduce + // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); if (NumReducedVals < 4) return false; - // Intersect the fast-math-flags from all reduction operations. - FastMathFlags RdxFMF; - RdxFMF.set(); - for (ReductionOpsType &RdxOp : ReductionOps) { - for (Value *RdxVal : RdxOp) { - if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) - RdxFMF &= FPMO->getFastMathFlags(); - } - } + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (ReductionOpsType &RdxOp : ReductionOps) { + for (Value *RdxVal : RdxOp) { + if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) + RdxFMF &= FPMO->getFastMathFlags(); + } + } IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); - Builder.setFastMathFlags(RdxFMF); + Builder.setFastMathFlags(RdxFMF); BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; - // The same extra argument may be used several times, so log each attempt + // The same extra argument may be used several times, so log each attempt // to use it. - for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { + for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } @@ -6852,48 +6852,48 @@ public: // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; SmallVector<Value *, 16> IgnoreList; - for (ReductionOpsType &RdxOp : ReductionOps) - IgnoreList.append(RdxOp.begin(), RdxOp.end()); - - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - if (NumReducedVals > ReduxWidth) { - // In the loop below, we are building a tree based on a window of - // 'ReduxWidth' values. - // If the operands of those values have common traits (compare predicate, - // constant operand, etc), then we want to group those together to - // minimize the cost of the reduction. - - // TODO: This should be extended to count common operands for - // compares and binops. - - // Step 1: Count the number of times each compare predicate occurs. - SmallDenseMap<unsigned, unsigned> PredCountMap; - for (Value *RdxVal : ReducedVals) { - CmpInst::Predicate Pred; - if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) - ++PredCountMap[Pred]; - } - // Step 2: Sort the values so the most common predicates come first. - stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { - CmpInst::Predicate PredA, PredB; - if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && - match(B, m_Cmp(PredB, m_Value(), m_Value()))) { - return PredCountMap[PredA] > PredCountMap[PredB]; - } - return false; - }); - } - - Value *VectorizedTree = nullptr; - unsigned i = 0; + for (ReductionOpsType &RdxOp : ReductionOps) + IgnoreList.append(RdxOp.begin(), RdxOp.end()); + + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + if (NumReducedVals > ReduxWidth) { + // In the loop below, we are building a tree based on a window of + // 'ReduxWidth' values. + // If the operands of those values have common traits (compare predicate, + // constant operand, etc), then we want to group those together to + // minimize the cost of the reduction. 
+ + // TODO: This should be extended to count common operands for + // compares and binops. + + // Step 1: Count the number of times each compare predicate occurs. + SmallDenseMap<unsigned, unsigned> PredCountMap; + for (Value *RdxVal : ReducedVals) { + CmpInst::Predicate Pred; + if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) + ++PredCountMap[Pred]; + } + // Step 2: Sort the values so the most common predicates come first. + stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { + CmpInst::Predicate PredA, PredB; + if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && + match(B, m_Cmp(PredB, m_Value(), m_Value()))) { + return PredCountMap[PredA] > PredCountMap[PredB]; + } + return false; + }); + } + + Value *VectorizedTree = nullptr; + unsigned i = 0; while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { - ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); + ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); Optional<ArrayRef<unsigned>> Order = V.bestOrder(); - if (Order) { - assert(Order->size() == VL.size() && - "Order size must be the same as number of vectorized " - "instructions."); + if (Order) { + assert(Order->size() == VL.size() && + "Order size must be the same as number of vectorized " + "instructions."); // TODO: reorder tree nodes without tree rebuilding. SmallVector<Value *, 4> ReorderedOps(VL.size()); llvm::transform(*Order, ReorderedOps.begin(), @@ -6902,66 +6902,66 @@ public: } if (V.isTreeTinyAndNotFullyVectorizable()) break; - if (V.isLoadCombineReductionCandidate(RdxKind)) + if (V.isLoadCombineReductionCandidate(RdxKind)) break; V.computeMinimumValueSizes(); // Estimate cost. - InstructionCost TreeCost = V.getTreeCost(); - InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth); - InstructionCost Cost = TreeCost + ReductionCost; - if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return false; - } + InstructionCost TreeCost = V.getTreeCost(); + InstructionCost ReductionCost = + getReductionCost(TTI, ReducedVals[i], ReduxWidth); + InstructionCost Cost = TreeCost + ReductionCost; + if (!Cost.isValid()) { + LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + return false; + } if (Cost >= -SLPCostThreshold) { - V.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", - cast<Instruction>(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " << ore::NV("Cost", Cost) - << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); - }); - break; + V.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", + cast<Instruction>(VL[0])) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + break; } LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". 
(HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", - cast<Instruction>(VL[0])) - << "Vectorized horizontal reduction with cost " - << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize()); + return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", + cast<Instruction>(VL[0])) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize()); }); // Vectorize a tree. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); - // Emit a reduction. If the root is a select (min/max idiom), the insert + // Emit a reduction. If the root is a select (min/max idiom), the insert // point is the compare condition of that select. Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); - if (isCmpSel(RdxKind)) + if (isCmpSel(RdxKind)) Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); else Builder.SetInsertPoint(RdxRootInst); Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - - if (!VectorizedTree) { - // Initialize the final value in the reduction. - VectorizedTree = ReducedSubTree; - } else { - // Update the final value in the reduction. + + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = ReducedSubTree; + } else { + // Update the final value in the reduction. Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - ReducedSubTree, "op.rdx", ReductionOps); - } + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + ReducedSubTree, "op.rdx", ReductionOps); + } i += ReduxWidth; ReduxWidth = PowerOf2Floor(NumReducedVals - i); } @@ -6971,15 +6971,15 @@ public: for (; i < NumReducedVals; ++i) { auto *I = cast<Instruction>(ReducedVals[i]); Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = - createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); + VectorizedTree = + createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); } for (auto &Pair : ExternallyUsedValues) { // Add each externally used value to the final reduction. for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - Pair.first, "op.extra", I); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + Pair.first, "op.extra", I); } } @@ -6987,7 +6987,7 @@ public: // select, we also have to RAUW for the compare instruction feeding the // reduction root. That's because the original compare may have extra uses // besides the final select of the reduction. - if (isCmpSel(RdxKind)) { + if (isCmpSel(RdxKind)) { if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) { Instruction *ScalarCmp = getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot)); @@ -7003,68 +7003,68 @@ public: return VectorizedTree != nullptr; } - unsigned numReductionValues() const { return ReducedVals.size(); } + unsigned numReductionValues() const { return ReducedVals.size(); } private: /// Calculate the cost of a reduction. 
- InstructionCost getReductionCost(TargetTransformInfo *TTI, - Value *FirstReducedVal, - unsigned ReduxWidth) { + InstructionCost getReductionCost(TargetTransformInfo *TTI, + Value *FirstReducedVal, + unsigned ReduxWidth) { Type *ScalarTy = FirstReducedVal->getType(); - FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); - InstructionCost VectorCost, ScalarCost; - switch (RdxKind) { - case RecurKind::Add: - case RecurKind::Mul: - case RecurKind::Or: - case RecurKind::And: - case RecurKind::Xor: - case RecurKind::FAdd: - case RecurKind::FMul: { - unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, - /*IsPairwiseForm=*/false); - ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); + FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); + InstructionCost VectorCost, ScalarCost; + switch (RdxKind) { + case RecurKind::Add: + case RecurKind::Mul: + case RecurKind::Or: + case RecurKind::And: + case RecurKind::Xor: + case RecurKind::FAdd: + case RecurKind::FMul: { + unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); + VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, + /*IsPairwiseForm=*/false); + ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); break; - } - case RecurKind::FMax: - case RecurKind::FMin: { - auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); - VectorCost = - TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*pairwise=*/false, /*unsigned=*/false); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy)); + } + case RecurKind::FMax: + case RecurKind::FMin: { + auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); + VectorCost = + TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*pairwise=*/false, /*unsigned=*/false); + ScalarCost = + TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy)); break; } - case RecurKind::SMax: - case RecurKind::SMin: - case RecurKind::UMax: - case RecurKind::UMin: { - auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); - bool IsUnsigned = - RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; - VectorCost = - TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*IsPairwiseForm=*/false, IsUnsigned); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + + case RecurKind::SMax: + case RecurKind::SMin: + case RecurKind::UMax: + case RecurKind::UMin: { + auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); + bool IsUnsigned = + RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; + VectorCost = + TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*IsPairwiseForm=*/false, IsUnsigned); + ScalarCost = + TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, CmpInst::makeCmpResultType(ScalarTy)); break; - } - default: + } + default: llvm_unreachable("Expected arithmetic or min/max reduction operation"); } - // Scalar cost is repeated for N-1 elements. - ScalarCost *= (ReduxWidth - 1); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost + // Scalar cost is repeated for N-1 elements. 
+ ScalarCost *= (ReduxWidth - 1); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost << " for reduction that starts with " << *FirstReducedVal - << " (It is a splitting reduction)\n"); - return VectorCost - ScalarCost; + << " (It is a splitting reduction)\n"); + return VectorCost - ScalarCost; } /// Emit a horizontal reduction of the vectorized value. @@ -7074,142 +7074,142 @@ private: assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, - ReductionOps.back()); - } -}; - -} // end anonymous namespace - -static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { - if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) - return cast<FixedVectorType>(IE->getType())->getNumElements(); - - unsigned AggregateSize = 1; - auto *IV = cast<InsertValueInst>(InsertInst); - Type *CurrentType = IV->getType(); - do { - if (auto *ST = dyn_cast<StructType>(CurrentType)) { - for (auto *Elt : ST->elements()) - if (Elt != ST->getElementType(0)) // check homogeneity - return None; - AggregateSize *= ST->getNumElements(); - CurrentType = ST->getElementType(0); - } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { - AggregateSize *= AT->getNumElements(); - CurrentType = AT->getElementType(); - } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) { - AggregateSize *= VT->getNumElements(); - return AggregateSize; - } else if (CurrentType->isSingleValueType()) { - return AggregateSize; - } else { - return None; - } - } while (true); -} - -static Optional<unsigned> getOperandIndex(Instruction *InsertInst, - unsigned OperandOffset) { - unsigned OperandIndex = OperandOffset; - if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { - if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { - auto *VT = cast<FixedVectorType>(IE->getType()); - OperandIndex *= VT->getNumElements(); - OperandIndex += CI->getZExtValue(); - return OperandIndex; - } - return None; - } - - auto *IV = cast<InsertValueInst>(InsertInst); - Type *CurrentType = IV->getType(); - for (unsigned int Index : IV->indices()) { - if (auto *ST = dyn_cast<StructType>(CurrentType)) { - OperandIndex *= ST->getNumElements(); - CurrentType = ST->getElementType(Index); - } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { - OperandIndex *= AT->getNumElements(); - CurrentType = AT->getElementType(); - } else { - return None; - } - OperandIndex += Index; - } - return OperandIndex; -} - -static bool findBuildAggregate_rec(Instruction *LastInsertInst, - TargetTransformInfo *TTI, - SmallVectorImpl<Value *> &BuildVectorOpds, - SmallVectorImpl<Value *> &InsertElts, - unsigned OperandOffset) { - do { - Value *InsertedOperand = LastInsertInst->getOperand(1); - Optional<unsigned> OperandIndex = - getOperandIndex(LastInsertInst, OperandOffset); - if (!OperandIndex) - return false; - if (isa<InsertElementInst>(InsertedOperand) || - isa<InsertValueInst>(InsertedOperand)) { - if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, - BuildVectorOpds, InsertElts, *OperandIndex)) - return false; - } else { - BuildVectorOpds[*OperandIndex] = InsertedOperand; - InsertElts[*OperandIndex] = LastInsertInst; - } - if (isa<UndefValue>(LastInsertInst->getOperand(0))) - return true; - LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); - } while (LastInsertInst != nullptr && - (isa<InsertValueInst>(LastInsertInst) || - isa<InsertElementInst>(LastInsertInst)) && - LastInsertInst->hasOneUse()); 
- return false; -} - + return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, + ReductionOps.back()); + } +}; + +} // end anonymous namespace + +static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { + if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) + return cast<FixedVectorType>(IE->getType())->getNumElements(); + + unsigned AggregateSize = 1; + auto *IV = cast<InsertValueInst>(InsertInst); + Type *CurrentType = IV->getType(); + do { + if (auto *ST = dyn_cast<StructType>(CurrentType)) { + for (auto *Elt : ST->elements()) + if (Elt != ST->getElementType(0)) // check homogeneity + return None; + AggregateSize *= ST->getNumElements(); + CurrentType = ST->getElementType(0); + } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { + AggregateSize *= AT->getNumElements(); + CurrentType = AT->getElementType(); + } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) { + AggregateSize *= VT->getNumElements(); + return AggregateSize; + } else if (CurrentType->isSingleValueType()) { + return AggregateSize; + } else { + return None; + } + } while (true); +} + +static Optional<unsigned> getOperandIndex(Instruction *InsertInst, + unsigned OperandOffset) { + unsigned OperandIndex = OperandOffset; + if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + auto *VT = cast<FixedVectorType>(IE->getType()); + OperandIndex *= VT->getNumElements(); + OperandIndex += CI->getZExtValue(); + return OperandIndex; + } + return None; + } + + auto *IV = cast<InsertValueInst>(InsertInst); + Type *CurrentType = IV->getType(); + for (unsigned int Index : IV->indices()) { + if (auto *ST = dyn_cast<StructType>(CurrentType)) { + OperandIndex *= ST->getNumElements(); + CurrentType = ST->getElementType(Index); + } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { + OperandIndex *= AT->getNumElements(); + CurrentType = AT->getElementType(); + } else { + return None; + } + OperandIndex += Index; + } + return OperandIndex; +} + +static bool findBuildAggregate_rec(Instruction *LastInsertInst, + TargetTransformInfo *TTI, + SmallVectorImpl<Value *> &BuildVectorOpds, + SmallVectorImpl<Value *> &InsertElts, + unsigned OperandOffset) { + do { + Value *InsertedOperand = LastInsertInst->getOperand(1); + Optional<unsigned> OperandIndex = + getOperandIndex(LastInsertInst, OperandOffset); + if (!OperandIndex) + return false; + if (isa<InsertElementInst>(InsertedOperand) || + isa<InsertValueInst>(InsertedOperand)) { + if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, + BuildVectorOpds, InsertElts, *OperandIndex)) + return false; + } else { + BuildVectorOpds[*OperandIndex] = InsertedOperand; + InsertElts[*OperandIndex] = LastInsertInst; + } + if (isa<UndefValue>(LastInsertInst->getOperand(0))) + return true; + LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); + } while (LastInsertInst != nullptr && + (isa<InsertValueInst>(LastInsertInst) || + isa<InsertElementInst>(LastInsertInst)) && + LastInsertInst->hasOneUse()); + return false; +} + /// Recognize construction of vectors like -/// %ra = insertelement <4 x float> poison, float %s0, i32 0 +/// %ra = insertelement <4 x float> poison, float %s0, i32 0 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 /// starting from the last insertelement or insertvalue instruction. 
/// -/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, +/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, /// {{float, float}, {float, float}}, [2 x {float, float}] and so on. /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. /// /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. -static bool findBuildAggregate(Instruction *LastInsertInst, - TargetTransformInfo *TTI, +static bool findBuildAggregate(Instruction *LastInsertInst, + TargetTransformInfo *TTI, SmallVectorImpl<Value *> &BuildVectorOpds, SmallVectorImpl<Value *> &InsertElts) { - + assert((isa<InsertElementInst>(LastInsertInst) || isa<InsertValueInst>(LastInsertInst)) && "Expected insertelement or insertvalue instruction!"); - - assert((BuildVectorOpds.empty() && InsertElts.empty()) && - "Expected empty result vectors!"); - - Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); - if (!AggregateSize) - return false; - BuildVectorOpds.resize(*AggregateSize); - InsertElts.resize(*AggregateSize); - - if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, - 0)) { - llvm::erase_value(BuildVectorOpds, nullptr); - llvm::erase_value(InsertElts, nullptr); - if (BuildVectorOpds.size() >= 2) - return true; - } - - return false; + + assert((BuildVectorOpds.empty() && InsertElts.empty()) && + "Expected empty result vectors!"); + + Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); + if (!AggregateSize) + return false; + BuildVectorOpds.resize(*AggregateSize); + InsertElts.resize(*AggregateSize); + + if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, + 0)) { + llvm::erase_value(BuildVectorOpds, nullptr); + llvm::erase_value(InsertElts, nullptr); + if (BuildVectorOpds.size() >= 2) + return true; + } + + return false; } static bool PhiTypeSorterFunc(Value *V, Value *V2) { @@ -7267,16 +7267,16 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, return nullptr; } -static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { - if (match(I, m_BinOp(m_Value(V0), m_Value(V1)))) - return true; - if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1)))) - return true; - if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)))) - return true; - return false; -} - +static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { + if (match(I, m_BinOp(m_Value(V0), m_Value(V1)))) + return true; + if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1)))) + return true; + if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)))) + return true; + return false; +} + /// Attempt to reduce a horizontal reduction. 
/// If it is legal to match a horizontal reduction feeding the phi node \a P /// with reduction operators \a Root (or one of its operands) in a basic block @@ -7316,10 +7316,10 @@ static bool tryToVectorizeHorReductionOrInstOperands( Instruction *Inst; unsigned Level; std::tie(Inst, Level) = Stack.pop_back_val(); - Value *B0, *B1; - bool IsBinop = matchRdxBop(Inst, B0, B1); - bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); - if (IsBinop || IsSelect) { + Value *B0, *B1; + bool IsBinop = matchRdxBop(Inst, B0, B1); + bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); + if (IsBinop || IsSelect) { HorizontalReduction HorRdx; if (HorRdx.matchAssociativeReduction(P, Inst)) { if (HorRdx.tryToReduce(R, TTI)) { @@ -7330,10 +7330,10 @@ static bool tryToVectorizeHorReductionOrInstOperands( continue; } } - if (P && IsBinop) { - Inst = dyn_cast<Instruction>(B0); + if (P && IsBinop) { + Inst = dyn_cast<Instruction>(B0); if (Inst == P) - Inst = dyn_cast<Instruction>(B1); + Inst = dyn_cast<Instruction>(B1); if (!Inst) { // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. @@ -7366,7 +7366,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI) { - auto *I = dyn_cast_or_null<Instruction>(V); + auto *I = dyn_cast_or_null<Instruction>(V); if (!I) return false; @@ -7388,7 +7388,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<Value *, 16> BuildVectorInsts; - if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); @@ -7475,7 +7475,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Look for the next elements with the same type. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; while (SameTypeIt != E && - (*SameTypeIt)->getType() == (*IncIt)->getType()) { + (*SameTypeIt)->getType() == (*IncIt)->getType()) { VisitedInstrs.insert(*SameTypeIt); ++SameTypeIt; } @@ -7507,17 +7507,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { SmallVector<Instruction *, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - // Skip instructions with scalable type. The num of elements is unknown at - // compile-time for scalable type. - if (isa<ScalableVectorType>(it->getType())) - continue; - + // Skip instructions with scalable type. The num of elements is unknown at + // compile-time for scalable type. + if (isa<ScalableVectorType>(it->getType())) + continue; + // Skip instructions marked for the deletion. if (R.isDeleted(&*it)) continue; // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { - if (it->use_empty() && KeyNodes.contains(&*it) && + if (it->use_empty() && KeyNodes.contains(&*it) && vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. @@ -7534,29 +7534,29 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Try to vectorize reductions that use PHINodes. 
if (PHINode *P = dyn_cast<PHINode>(it)) { // Check that the PHI is a reduction PHI. - if (P->getNumIncomingValues() == 2) { - // Try to match and vectorize a horizontal reduction. - if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, - TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } - } - // Try to vectorize the incoming values of the PHI, to catch reductions - // that feed into PHIs. - for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) { - // Skip if the incoming block is the current BB for now. Also, bypass - // unreachable IR for efficiency and to avoid crashing. - // TODO: Collect the skipped incoming values and try to vectorize them - // after processing BB. - if (BB == P->getIncomingBlock(I) || - !DT->isReachableFromEntry(P->getIncomingBlock(I))) - continue; - - Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), - P->getIncomingBlock(I), R, TTI); + if (P->getNumIncomingValues() == 2) { + // Try to match and vectorize a horizontal reduction. + if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, + TTI)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + continue; + } + } + // Try to vectorize the incoming values of the PHI, to catch reductions + // that feed into PHIs. + for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) { + // Skip if the incoming block is the current BB for now. Also, bypass + // unreachable IR for efficiency and to avoid crashing. + // TODO: Collect the skipped incoming values and try to vectorize them + // after processing BB. + if (BB == P->getIncomingBlock(I) || + !DT->isReachableFromEntry(P->getIncomingBlock(I))) + continue; + + Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), + P->getIncomingBlock(I), R, TTI); } continue; } @@ -7620,7 +7620,7 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { unsigned MaxElts = MaxVecRegSize / EltSize; for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { auto Len = std::min<unsigned>(BE - BI, MaxElts); - ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len); + ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. Note that we use a // SetVector here to preserve program order. If the index computations diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h index 8737016760..dd33853d34 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -61,19 +61,19 @@ class VPRecipeBuilder { /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a /// recipe that takes an additional VPInstruction for the mask. - VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan); + VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. - VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi, - VPlan &Plan) const; + VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi, + VPlan &Plan) const; /// Optimize the special case where the operand of \p I is a constant integer /// induction variable. 
VPWidenIntOrFpInductionRecipe * - tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, - VPlan &Plan) const; + tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, + VPlan &Plan) const; /// Handle non-loop phi nodes. Currently all such phi nodes are turned into /// a sequence of select instructions as the vectorizer currently performs diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp index b26399e0ae..e65b4ea4a7 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp @@ -20,10 +20,10 @@ #include "VPlanDominatorTree.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" -#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -58,69 +58,69 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { return OS; } -VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) - : SubclassID(SC), UnderlyingVal(UV), Def(Def) { - if (Def) - Def->addDefinedValue(this); -} - -VPValue::~VPValue() { - assert(Users.empty() && "trying to delete a VPValue with remaining users"); - if (Def) - Def->removeDefinedValue(this); -} - +VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) + : SubclassID(SC), UnderlyingVal(UV), Def(Def) { + if (Def) + Def->addDefinedValue(this); +} + +VPValue::~VPValue() { + assert(Users.empty() && "trying to delete a VPValue with remaining users"); + if (Def) + Def->removeDefinedValue(this); +} + void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { - if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def)) - R->print(OS, "", SlotTracker); + if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def)) + R->print(OS, "", SlotTracker); else printAsOperand(OS, SlotTracker); } -void VPValue::dump() const { - const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def); - VPSlotTracker SlotTracker( - (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); - print(dbgs(), SlotTracker); - dbgs() << "\n"; -} - -void VPDef::dump() const { - const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this); - VPSlotTracker SlotTracker( - (Instr && Instr->getParent()) ? 
Instr->getParent()->getPlan() : nullptr); - print(dbgs(), "", SlotTracker); - dbgs() << "\n"; -} - -VPUser *VPRecipeBase::toVPUser() { - if (auto *U = dyn_cast<VPInstruction>(this)) - return U; - if (auto *U = dyn_cast<VPWidenRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPWidenCallRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPWidenSelectRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPWidenGEPRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPBlendRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPInterleaveRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPReplicateRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPBranchOnMaskRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPWidenMemoryInstructionRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPReductionRecipe>(this)) - return U; - if (auto *U = dyn_cast<VPPredInstPHIRecipe>(this)) - return U; - return nullptr; -} - +void VPValue::dump() const { + const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def); + VPSlotTracker SlotTracker( + (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); + print(dbgs(), SlotTracker); + dbgs() << "\n"; +} + +void VPDef::dump() const { + const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this); + VPSlotTracker SlotTracker( + (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); + print(dbgs(), "", SlotTracker); + dbgs() << "\n"; +} + +VPUser *VPRecipeBase::toVPUser() { + if (auto *U = dyn_cast<VPInstruction>(this)) + return U; + if (auto *U = dyn_cast<VPWidenRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenCallRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenSelectRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenGEPRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPBlendRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPInterleaveRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPReplicateRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPBranchOnMaskRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenMemoryInstructionRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPReductionRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPPredInstPHIRecipe>(this)) + return U; + return nullptr; +} + // Get the top-most entry block of \p Start. This is the entry block of the // containing VPlan. 
This function is templated to support both const and non-const blocks template <typename T> static T *getPlanEntry(T *Start) { @@ -200,43 +200,43 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { } void VPBlockBase::deleteCFG(VPBlockBase *Entry) { - SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry)); + SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry)); for (VPBlockBase *Block : Blocks) delete Block; } -VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { - iterator It = begin(); - while (It != end() && (isa<VPWidenPHIRecipe>(&*It) || - isa<VPWidenIntOrFpInductionRecipe>(&*It) || - isa<VPPredInstPHIRecipe>(&*It) || - isa<VPWidenCanonicalIVRecipe>(&*It))) - It++; - return It; -} - -Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { - if (!Def->getDef() && OrigLoop->isLoopInvariant(Def->getLiveInIRValue())) - return Def->getLiveInIRValue(); - - if (hasScalarValue(Def, Instance)) - return Data.PerPartScalars[Def][Instance.Part][Instance.Lane]; - - if (hasVectorValue(Def, Instance.Part)) { - assert(Data.PerPartOutput.count(Def)); - auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; - if (!VecPart->getType()->isVectorTy()) { - assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar"); - return VecPart; - } - // TODO: Cache created scalar values. - return Builder.CreateExtractElement(VecPart, - Builder.getInt32(Instance.Lane)); - } - return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); -} - +VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { + iterator It = begin(); + while (It != end() && (isa<VPWidenPHIRecipe>(&*It) || + isa<VPWidenIntOrFpInductionRecipe>(&*It) || + isa<VPPredInstPHIRecipe>(&*It) || + isa<VPWidenCanonicalIVRecipe>(&*It))) + It++; + return It; +} + +Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { + if (!Def->getDef() && OrigLoop->isLoopInvariant(Def->getLiveInIRValue())) + return Def->getLiveInIRValue(); + + if (hasScalarValue(Def, Instance)) + return Data.PerPartScalars[Def][Instance.Part][Instance.Lane]; + + if (hasVectorValue(Def, Instance.Part)) { + assert(Data.PerPartOutput.count(Def)); + auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; + if (!VecPart->getType()->isVectorTy()) { + assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar"); + return VecPart; + } + // TODO: Cache created scalar values. + return Builder.CreateExtractElement(VecPart, + Builder.getInt32(Instance.Lane)); + } + return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); +} + BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. @@ -354,24 +354,24 @@ void VPBasicBlock::execute(VPTransformState *State) { LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); } -void VPBasicBlock::dropAllReferences(VPValue *NewValue) { - for (VPRecipeBase &R : Recipes) { - for (auto *Def : R.definedValues()) - Def->replaceAllUsesWith(NewValue); - - if (auto *User = R.toVPUser()) - for (unsigned I = 0, E = User->getNumOperands(); I != E; I++) - User->setOperand(I, NewValue); - } -} - -void VPRegionBlock::dropAllReferences(VPValue *NewValue) { - for (VPBlockBase *Block : depth_first(Entry)) - // Drop all references in VPBasicBlocks and replace all uses with - // DummyValue. 
- Block->dropAllReferences(NewValue); -} - +void VPBasicBlock::dropAllReferences(VPValue *NewValue) { + for (VPRecipeBase &R : Recipes) { + for (auto *Def : R.definedValues()) + Def->replaceAllUsesWith(NewValue); + + if (auto *User = R.toVPUser()) + for (unsigned I = 0, E = User->getNumOperands(); I != E; I++) + User->setOperand(I, NewValue); + } +} + +void VPRegionBlock::dropAllReferences(VPValue *NewValue) { + for (VPBlockBase *Block : depth_first(Entry)) + // Drop all references in VPBasicBlocks and replace all uses with + // DummyValue. + Block->dropAllReferences(NewValue); +} + void VPRegionBlock::execute(VPTransformState *State) { ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); @@ -405,9 +405,9 @@ void VPRegionBlock::execute(VPTransformState *State) { for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { State->Instance->Part = Part; - assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); - for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; - ++Lane) { + assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; + ++Lane) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. for (VPBlockBase *Block : RPOT) { @@ -453,14 +453,14 @@ void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { insertAfter(InsertPos); } -void VPRecipeBase::moveBefore(VPBasicBlock &BB, - iplist<VPRecipeBase>::iterator I) { - assert(I == BB.end() || I->getParent() == &BB); - removeFromParent(); - Parent = &BB; - BB.getRecipeList().insert(I, this); -} - +void VPRecipeBase::moveBefore(VPBasicBlock &BB, + iplist<VPRecipeBase>::iterator I) { + assert(I == BB.end() || I->getParent() == &BB); + removeFromParent(); + Parent = &BB; + BB.getRecipeList().insert(I, this); +} + void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; @@ -498,14 +498,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, case VPInstruction::ActiveLaneMask: { // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), {Part, 0}); - // Get the original loop tripcount. - Value *ScalarTC = State.TripCount; + // Get the original loop tripcount. 
+ Value *ScalarTC = State.TripCount; auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); Instruction *Call = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); + Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, + {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); State.set(this, Call, Part); break; } @@ -520,14 +520,14 @@ void VPInstruction::execute(VPTransformState &State) { generateInstruction(State, Part); } -void VPInstruction::dump() const { - VPSlotTracker SlotTracker(getParent()->getPlan()); - print(dbgs(), "", SlotTracker); -} - +void VPInstruction::dump() const { + VPSlotTracker SlotTracker(getParent()->getPlan()); + print(dbgs(), "", SlotTracker); +} + void VPInstruction::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "EMIT "; + O << "EMIT "; if (hasResult()) { printAsOperand(O, SlotTracker); @@ -573,7 +573,7 @@ void VPlan::execute(VPTransformState *State) { "trip.count.minus.1"); auto VF = State->VF; Value *VTCMO = - VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); + VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) State->set(BackedgeTakenCount, VTCMO, Part); } @@ -778,7 +778,7 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { // Dump the block predicate. const VPValue *Pred = BasicBlock->getPredicate(); if (Pred) { - OS << " +\n" << Indent << " \"BlockPredicate: \""; + OS << " +\n" << Indent << " \"BlockPredicate: \""; if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) { PredI->printAsOperand(OS, SlotTracker); OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) @@ -788,7 +788,7 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { } for (const VPRecipeBase &Recipe : *BasicBlock) { - OS << " +\n" << Indent << "\""; + OS << " +\n" << Indent << "\""; Recipe.print(OS, Indent, SlotTracker); OS << "\\l\""; } @@ -827,7 +827,7 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { dumpEdges(Region); } -void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { +void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { std::string IngredientString; raw_string_ostream RSO(IngredientString); if (auto *Inst = dyn_cast<Instruction>(V)) { @@ -850,45 +850,45 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-CALL "; - - auto *CI = cast<CallInst>(getUnderlyingInstr()); - if (CI->getType()->isVoidTy()) - O << "void "; - else { - printAsOperand(O, SlotTracker); - O << " = "; - } - - O << "call @" << CI->getCalledFunction()->getName() << "("; - printOperands(O, SlotTracker); - O << ")"; + O << "WIDEN-CALL "; + + auto *CI = cast<CallInst>(getUnderlyingInstr()); + if (CI->getType()->isVoidTy()) + O << "void "; + else { + printAsOperand(O, SlotTracker); + O << " = "; + } + + O << "call @" << CI->getCalledFunction()->getName() << "("; + printOperands(O, SlotTracker); + O << ")"; } void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-SELECT "; - printAsOperand(O, SlotTracker); - O << " = select "; - 
getOperand(0)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(1)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(2)->printAsOperand(O, SlotTracker); - O << (InvariantCond ? " (condition is loop invariant)" : ""); + O << "WIDEN-SELECT "; + printAsOperand(O, SlotTracker); + O << " = select "; + getOperand(0)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(1)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(2)->printAsOperand(O, SlotTracker); + O << (InvariantCond ? " (condition is loop invariant)" : ""); } void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN "; - printAsOperand(O, SlotTracker); - O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; - printOperands(O, SlotTracker); + O << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; + printOperands(O, SlotTracker); } void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-INDUCTION"; + O << "WIDEN-INDUCTION"; if (Trunc) { O << "\\l\""; O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; @@ -899,26 +899,26 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-GEP "; + O << "WIDEN-GEP "; O << (IsPtrLoopInvariant ? "Inv" : "Var"); size_t IndicesNumber = IsIndexLoopInvariant.size(); for (size_t I = 0; I < IndicesNumber; ++I) O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; - - O << " "; - printAsOperand(O, SlotTracker); - O << " = getelementptr "; - printOperands(O, SlotTracker); + + O << " "; + printAsOperand(O, SlotTracker); + O << " = getelementptr "; + printOperands(O, SlotTracker); } void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN-PHI " << VPlanIngredient(Phi); + O << "WIDEN-PHI " << VPlanIngredient(Phi); } void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "BLEND "; + O << "BLEND "; Phi->printAsOperand(O, false); O << " ="; if (getNumIncomingValues() == 1) { @@ -936,75 +936,75 @@ void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, } } -void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << "REDUCE "; - printAsOperand(O, SlotTracker); - O << " = "; - getChainOp()->printAsOperand(O, SlotTracker); - O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) - << " ("; - getVecOp()->printAsOperand(O, SlotTracker); - if (getCondOp()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} - +void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) + << " ("; + getVecOp()->printAsOperand(O, SlotTracker); + if (getCondOp()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; +} + void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << (IsUniform ? 
"CLONE " : "REPLICATE "); - - if (!getUnderlyingInstr()->getType()->isVoidTy()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; - printOperands(O, SlotTracker); - + O << (IsUniform ? "CLONE " : "REPLICATE "); + + if (!getUnderlyingInstr()->getType()->isVoidTy()) { + printAsOperand(O, SlotTracker); + O << " = "; + } + O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; + printOperands(O, SlotTracker); + if (AlsoPack) O << " (S->V)"; } void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "PHI-PREDICATED-INSTRUCTION "; - printOperands(O, SlotTracker); + O << "PHI-PREDICATED-INSTRUCTION "; + printOperands(O, SlotTracker); } void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "WIDEN "; - - if (!isStore()) { - getVPValue()->printAsOperand(O, SlotTracker); - O << " = "; + O << "WIDEN "; + + if (!isStore()) { + getVPValue()->printAsOperand(O, SlotTracker); + O << " = "; } - O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; - - printOperands(O, SlotTracker); + O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; + + printOperands(O, SlotTracker); } void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - ElementCount VF = State.VF; - assert(!VF.isScalable() && "the code following assumes non scalables ECs"); - Value *VStart = VF.isScalar() + ElementCount VF = State.VF; + assert(!VF.isScalable() && "the code following assumes non scalables ECs"); + Value *VStart = VF.isScalar() ? CanonicalIV - : Builder.CreateVectorSplat(VF.getKnownMinValue(), - CanonicalIV, "broadcast"); + : Builder.CreateVectorSplat(VF.getKnownMinValue(), + CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { SmallVector<Constant *, 8> Indices; - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - Indices.push_back( - ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + Indices.push_back( + ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); // If VF == 1, there is only one iteration in the loop above, thus the // element pushed back into Indices is ConstantInt::get(STy, Part) - Constant *VStep = - VF.isScalar() ? Indices.back() : ConstantVector::get(Indices); + Constant *VStep = + VF.isScalar() ? Indices.back() : ConstantVector::get(Indices); // Add the consecutive indices to the vector value. 
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); State.set(getVPValue(), CanonicalVectorIV, Part); @@ -1013,7 +1013,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "EMIT "; + O << "EMIT "; getVPValue()->printAsOperand(O, SlotTracker); O << " = WIDEN-CANONICAL-INDUCTION"; } @@ -1021,18 +1021,18 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { - for (unsigned J = 0; J < getNumUsers();) { - VPUser *User = Users[J]; - unsigned NumUsers = getNumUsers(); + for (unsigned J = 0; J < getNumUsers();) { + VPUser *User = Users[J]; + unsigned NumUsers = getNumUsers(); for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) if (User->getOperand(I) == this) User->setOperand(I, New); - // If a user got removed after updating the current user, the next user to - // update will be moved to the current position, so we only need to - // increment the index if the number of users did not change. - if (NumUsers == getNumUsers()) - J++; - } + // If a user got removed after updating the current user, the next user to + // update will be moved to the current position, so we only need to + // increment the index if the number of users did not change. + if (NumUsers == getNumUsers()) + J++; + } } void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { @@ -1050,12 +1050,12 @@ void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { OS << "vp<%" << Tracker.getSlot(this) << ">"; } -void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { - interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { - Op->printAsOperand(O, SlotTracker); - }); -} - +void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { + interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { + Op->printAsOperand(O, SlotTracker); + }); +} + void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, InterleavedAccessInfo &IAI) { @@ -1122,8 +1122,8 @@ void VPSlotTracker::assignSlots(const VPRegionBlock *Region) { void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) { for (const VPRecipeBase &Recipe : *VPBB) { - for (VPValue *Def : Recipe.definedValues()) - assignSlot(Def); + for (VPValue *Def : Recipe.definedValues()) + assignSlot(Def); } } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h index 2cce127cd4..eec59ef006 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h @@ -53,7 +53,7 @@ class DominatorTree; class InnerLoopVectorizer; class LoopInfo; class raw_ostream; -class RecurrenceDescriptor; +class RecurrenceDescriptor; class Value; class VPBasicBlock; class VPRegionBlock; @@ -65,22 +65,22 @@ class VPlanSlp; /// [1, 9) = {1, 2, 4, 8} struct VFRange { // A power of 2. - const ElementCount Start; + const ElementCount Start; // Need not be a power of 2. If End <= Start range is empty. 
- ElementCount End; - - bool isEmpty() const { - return End.getKnownMinValue() <= Start.getKnownMinValue(); - } - - VFRange(const ElementCount &Start, const ElementCount &End) - : Start(Start), End(End) { - assert(Start.isScalable() == End.isScalable() && - "Both Start and End should have the same scalable flag"); - assert(isPowerOf2_32(Start.getKnownMinValue()) && - "Expected Start to be a power of 2"); - } + ElementCount End; + + bool isEmpty() const { + return End.getKnownMinValue() <= Start.getKnownMinValue(); + } + + VFRange(const ElementCount &Start, const ElementCount &End) + : Start(Start), End(End) { + assert(Start.isScalable() == End.isScalable() && + "Both Start and End should have the same scalable flag"); + assert(isPowerOf2_32(Start.getKnownMinValue()) && + "Expected Start to be a power of 2"); + } }; using VPlanPtr = std::unique_ptr<VPlan>; @@ -125,7 +125,7 @@ private: /// The vectorization factor. Each entry in the scalar map contains UF x VF /// scalar values. - ElementCount VF; + ElementCount VF; /// The vector and scalar map storage. We use std::map and not DenseMap /// because insertions to DenseMap invalidate its iterators. @@ -136,7 +136,7 @@ private: public: /// Construct an empty map with the given unroll and vectorization factors. - VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} + VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} /// \return True if the map has any vector entry for \p Key. bool hasAnyVectorValue(Value *Key) const { @@ -161,14 +161,14 @@ public: /// \return True if the map has a scalar entry for \p Key and \p Instance. bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF.getKnownMinValue() && - "Queried Scalar Lane is too large."); - + assert(Instance.Lane < VF.getKnownMinValue() && + "Queried Scalar Lane is too large."); + if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF.getKnownMinValue() && + assert(Entry[Instance.Part].size() == VF.getKnownMinValue() && "ScalarParts has wrong dimensions."); return Entry[Instance.Part][Instance.Lane] != nullptr; } @@ -207,7 +207,7 @@ public: // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF.getKnownMinValue(), nullptr); + Entry[Part].resize(VF.getKnownMinValue(), nullptr); ScalarMapStorage[Key] = Entry; } ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; @@ -246,15 +246,15 @@ struct VPCallback { /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. 
struct VPTransformState { - VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI, - DominatorTree *DT, IRBuilder<> &Builder, - VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, - VPCallback &Callback) - : VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT), - Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} + VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI, + DominatorTree *DT, IRBuilder<> &Builder, + VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, + VPCallback &Callback) + : VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT), + Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. - ElementCount VF; + ElementCount VF; unsigned UF; /// Hold the indices to generate specific scalar instructions. Null indicates @@ -269,9 +269,9 @@ struct VPTransformState { typedef SmallVector<Value *, 2> PerPartValuesTy; DenseMap<VPValue *, PerPartValuesTy> PerPartOutput; - - using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>; - DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars; + + using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>; + DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars; } Data; /// Get the generated Value for a given VPValue and a given Part. Note that @@ -288,23 +288,23 @@ struct VPTransformState { } /// Get the generated Value for a given VPValue and given Part and Lane. - Value *get(VPValue *Def, const VPIteration &Instance); - - bool hasVectorValue(VPValue *Def, unsigned Part) { - auto I = Data.PerPartOutput.find(Def); - return I != Data.PerPartOutput.end() && Part < I->second.size() && - I->second[Part]; - } - - bool hasScalarValue(VPValue *Def, VPIteration Instance) { - auto I = Data.PerPartScalars.find(Def); - if (I == Data.PerPartScalars.end()) - return false; - return Instance.Part < I->second.size() && - Instance.Lane < I->second[Instance.Part].size() && - I->second[Instance.Part][Instance.Lane]; - } - + Value *get(VPValue *Def, const VPIteration &Instance); + + bool hasVectorValue(VPValue *Def, unsigned Part) { + auto I = Data.PerPartOutput.find(Def); + return I != Data.PerPartOutput.end() && Part < I->second.size() && + I->second[Part]; + } + + bool hasScalarValue(VPValue *Def, VPIteration Instance) { + auto I = Data.PerPartScalars.find(Def); + if (I == Data.PerPartScalars.end()) + return false; + return Instance.Part < I->second.size() && + Instance.Lane < I->second[Instance.Part].size() && + I->second[Instance.Part][Instance.Lane]; + } + /// Set the generated Value for a given VPValue and a given Part. 
void set(VPValue *Def, Value *V, unsigned Part) { if (!Data.PerPartOutput.count(Def)) { @@ -313,19 +313,19 @@ struct VPTransformState { } Data.PerPartOutput[Def][Part] = V; } - void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part); - - void set(VPValue *Def, Value *V, const VPIteration &Instance) { - auto Iter = Data.PerPartScalars.insert({Def, {}}); - auto &PerPartVec = Iter.first->second; - while (PerPartVec.size() <= Instance.Part) - PerPartVec.emplace_back(); - auto &Scalars = PerPartVec[Instance.Part]; - while (Scalars.size() <= Instance.Lane) - Scalars.push_back(nullptr); - Scalars[Instance.Lane] = V; - } - + void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part); + + void set(VPValue *Def, Value *V, const VPIteration &Instance) { + auto Iter = Data.PerPartScalars.insert({Def, {}}); + auto &PerPartVec = Iter.first->second; + while (PerPartVec.size() <= Instance.Part) + PerPartVec.emplace_back(); + auto &Scalars = PerPartVec[Instance.Part]; + while (Scalars.size() <= Instance.Lane) + Scalars.push_back(nullptr); + Scalars[Instance.Lane] = V; + } + /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. struct CFGState { @@ -351,9 +351,9 @@ struct VPTransformState { CFGState() = default; } CFG; - /// Hold a pointer to the original loop. - Loop *OrigLoop; - + /// Hold a pointer to the original loop. + Loop *OrigLoop; + /// Hold a pointer to LoopInfo to register new basic blocks in the loop. LoopInfo *LI; @@ -427,14 +427,14 @@ class VPBlockBase { /// Remove \p Predecessor from the predecessors of this block. void removePredecessor(VPBlockBase *Predecessor) { - auto Pos = find(Predecessors, Predecessor); + auto Pos = find(Predecessors, Predecessor); assert(Pos && "Predecessor does not exist"); Predecessors.erase(Pos); } /// Remove \p Successor from the successors of this block. void removeSuccessor(VPBlockBase *Successor) { - auto Pos = find(Successors, Successor); + auto Pos = find(Successors, Successor); assert(Pos && "Successor does not exist"); Successors.erase(Pos); } @@ -627,19 +627,19 @@ public: // hoisted into a VPBlockBase. return true; } - - /// Replace all operands of VPUsers in the block with \p NewValue and also - /// replaces all uses of VPValues defined in the block with NewValue. - virtual void dropAllReferences(VPValue *NewValue) = 0; + + /// Replace all operands of VPUsers in the block with \p NewValue and also + /// replaces all uses of VPValues defined in the block with NewValue. + virtual void dropAllReferences(VPValue *NewValue) = 0; }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR -/// instructions. VPRecipeBase owns the the VPValues it defines through VPDef -/// and is responsible for deleting its defined values. Single-value -/// VPRecipeBases that also inherit from VPValue must make sure to inherit from -/// VPRecipeBase before VPValue. -class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, - public VPDef { +/// instructions. VPRecipeBase owns the the VPValues it defines through VPDef +/// and is responsible for deleting its defined values. Single-value +/// VPRecipeBases that also inherit from VPValue must make sure to inherit from +/// VPRecipeBase before VPValue. 
+class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, + public VPDef { friend VPBasicBlock; friend class VPBlockUtils; @@ -648,7 +648,7 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, VPBasicBlock *Parent = nullptr; public: - VPRecipeBase(const unsigned char SC) : VPDef(SC) {} + VPRecipeBase(const unsigned char SC) : VPDef(SC) {} virtual ~VPRecipeBase() = default; /// \return the VPBasicBlock which this VPRecipe belongs to. @@ -671,11 +671,11 @@ public: /// the VPBasicBlock that MovePos lives in, right after MovePos. void moveAfter(VPRecipeBase *MovePos); - /// Unlink this recipe and insert into BB before I. - /// - /// \pre I is a valid iterator into BB. - void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I); - + /// Unlink this recipe and insert into BB before I. + /// + /// \pre I is a valid iterator into BB. + void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I); + /// This method unlinks 'this' from the containing basic block, but does not /// delete it. void removeFromParent(); @@ -684,46 +684,46 @@ public: /// /// \returns an iterator pointing to the element after the erased one iplist<VPRecipeBase>::iterator eraseFromParent(); - - /// Returns a pointer to a VPUser, if the recipe inherits from VPUser or - /// nullptr otherwise. - VPUser *toVPUser(); - - /// Returns the underlying instruction, if the recipe is a VPValue or nullptr - /// otherwise. - Instruction *getUnderlyingInstr() { - return cast<Instruction>(getVPValue()->getUnderlyingValue()); - } - const Instruction *getUnderlyingInstr() const { - return cast<Instruction>(getVPValue()->getUnderlyingValue()); - } - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - // All VPDefs are also VPRecipeBases. - return true; - } + + /// Returns a pointer to a VPUser, if the recipe inherits from VPUser or + /// nullptr otherwise. + VPUser *toVPUser(); + + /// Returns the underlying instruction, if the recipe is a VPValue or nullptr + /// otherwise. + Instruction *getUnderlyingInstr() { + return cast<Instruction>(getVPValue()->getUnderlyingValue()); + } + const Instruction *getUnderlyingInstr() const { + return cast<Instruction>(getVPValue()->getUnderlyingValue()); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + // All VPDefs are also VPRecipeBases. 
+ return true; + } }; -inline bool VPUser::classof(const VPDef *Def) { - return Def->getVPDefID() == VPRecipeBase::VPInstructionSC || - Def->getVPDefID() == VPRecipeBase::VPWidenSC || - Def->getVPDefID() == VPRecipeBase::VPWidenCallSC || - Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC || - Def->getVPDefID() == VPRecipeBase::VPWidenGEPSC || - Def->getVPDefID() == VPRecipeBase::VPBlendSC || - Def->getVPDefID() == VPRecipeBase::VPInterleaveSC || - Def->getVPDefID() == VPRecipeBase::VPReplicateSC || - Def->getVPDefID() == VPRecipeBase::VPReductionSC || - Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC || - Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; -} - +inline bool VPUser::classof(const VPDef *Def) { + return Def->getVPDefID() == VPRecipeBase::VPInstructionSC || + Def->getVPDefID() == VPRecipeBase::VPWidenSC || + Def->getVPDefID() == VPRecipeBase::VPWidenCallSC || + Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC || + Def->getVPDefID() == VPRecipeBase::VPWidenGEPSC || + Def->getVPDefID() == VPRecipeBase::VPBlendSC || + Def->getVPDefID() == VPRecipeBase::VPInterleaveSC || + Def->getVPDefID() == VPRecipeBase::VPReplicateSC || + Def->getVPDefID() == VPRecipeBase::VPReductionSC || + Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC || + Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; +} + /// This is a concrete Recipe that models a single VPlan-level instruction. /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. -class VPInstruction : public VPRecipeBase, public VPUser, public VPValue { +class VPInstruction : public VPRecipeBase, public VPUser, public VPValue { friend class VPlanSlp; public: @@ -749,22 +749,22 @@ protected: public: VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} - - VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { - for (auto *I : Operands) - addOperand(I->getVPValue()); - } - + : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands), + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} + + VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) + : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}), + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { + for (auto *I : Operands) + addOperand(I->getVPValue()); + } + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVInstructionSC; + return V->getVPValueID() == VPValue::VPVInstructionSC; } VPInstruction *clone() const { @@ -773,8 +773,8 @@ public: } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *R) { - return R->getVPDefID() == VPRecipeBase::VPInstructionSC; + static inline bool classof(const VPDef *R) { + return R->getVPDefID() == VPRecipeBase::VPInstructionSC; } unsigned getOpcode() const { return Opcode; } @@ -784,12 +784,12 @@ public: /// provided. 
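For readers following the VPInstruction API in the hunk above, a minimal usage sketch built only from the constructors and accessors shown there; makeVPAdd is a hypothetical helper, not part of the patch:

  static VPInstruction *makeVPAdd(VPValue *A, VPValue *B) {
    // The initializer_list constructor shown above records A and B as
    // operands, so the new VPInstruction is a VPUser of both and is itself a
    // VPValue that later recipes can use as an operand.
    auto *Add = new VPInstruction(Instruction::Add, {A, B});
    assert(Add->getOpcode() == Instruction::Add && "opcode is stored verbatim");
    return Add;
  }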
void execute(VPTransformState &State) override; - /// Print the VPInstruction to \p O. + /// Print the VPInstruction to \p O. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; - /// Print the VPInstruction to dbgs() (for debugging). - void dump() const; + /// Print the VPInstruction to dbgs() (for debugging). + void dump() const; /// Return true if this instruction may modify memory. bool mayWriteToMemory() const { @@ -823,22 +823,22 @@ public: /// VPWidenRecipe is a recipe for producing a copy of vector type its /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. -class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser { +class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser { public: template <typename IterT> VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) - : VPRecipeBase(VPRecipeBase::VPWidenSC), - VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {} + : VPRecipeBase(VPRecipeBase::VPWidenSC), + VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {} ~VPWidenRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenSC; - } - static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVWidenSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenSC; } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVWidenSC; + } /// Produce widened copies of all Ingredients. void execute(VPTransformState &State) override; @@ -849,19 +849,19 @@ public: }; /// A recipe for widening Call instructions. -class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue { public: template <typename IterT> VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments) - : VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments), - VPValue(VPValue::VPVWidenCallSC, &I, this) {} + : VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments), + VPValue(VPValue::VPVWidenCallSC, &I, this) {} ~VPWidenCallRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenCallSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenCallSC; } /// Produce a widened version of the call instruction. @@ -873,7 +873,7 @@ public: }; /// A recipe for widening select instructions. -class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue { /// Is the condition of the select loop invariant? 
bool InvariantCond; @@ -882,15 +882,15 @@ public: template <typename IterT> VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands, bool InvariantCond) - : VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands), - VPValue(VPValue::VPVWidenSelectSC, &I, this), + : VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands), + VPValue(VPValue::VPVWidenSelectSC, &I, this), InvariantCond(InvariantCond) {} ~VPWidenSelectRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenSelectSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenSelectSC; } /// Produce a widened version of the select instruction. @@ -902,24 +902,24 @@ public: }; /// A recipe for handling GEP instructions. -class VPWidenGEPRecipe : public VPRecipeBase, - public VPUser, - public VPValue { +class VPWidenGEPRecipe : public VPRecipeBase, + public VPUser, + public VPValue { bool IsPtrLoopInvariant; SmallBitVector IsIndexLoopInvariant; public: template <typename IterT> - VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands) - : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), - VPValue(VPWidenGEPSC, GEP, this), - IsIndexLoopInvariant(GEP->getNumIndices(), false) {} - - template <typename IterT> + VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands) + : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), + VPValue(VPWidenGEPSC, GEP, this), + IsIndexLoopInvariant(GEP->getNumIndices(), false) {} + + template <typename IterT> VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands, Loop *OrigLoop) - : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), - VPValue(VPValue::VPVWidenGEPSC, GEP, this), + : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), + VPValue(VPValue::VPVWidenGEPSC, GEP, this), IsIndexLoopInvariant(GEP->getNumIndices(), false) { IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); for (auto Index : enumerate(GEP->indices())) @@ -929,8 +929,8 @@ public: ~VPWidenGEPRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenGEPSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenGEPSC; } /// Generate the gep nodes. @@ -943,25 +943,25 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. -class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser { PHINode *IV; TruncInst *Trunc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, - TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV), - Trunc(Trunc) { - if (Trunc) - new VPValue(Trunc, this); - else - new VPValue(IV, this); - } + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + TruncInst *Trunc = nullptr) + : VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV), + Trunc(Trunc) { + if (Trunc) + new VPValue(Trunc, this); + else + new VPValue(IV, this); + } ~VPWidenIntOrFpInductionRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. 
- static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC; } /// Generate the vectorized and scalarized versions of the phi node as @@ -971,38 +971,38 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; - - /// Returns the start value of the induction. - VPValue *getStartValue() { return getOperand(0); } + + /// Returns the start value of the induction. + VPValue *getStartValue() { return getOperand(0); } }; /// A recipe for handling all phi nodes except for integer and FP inductions. -/// For reduction PHIs, RdxDesc must point to the corresponding recurrence -/// descriptor and the start value is the first operand of the recipe. -class VPWidenPHIRecipe : public VPRecipeBase, public VPUser { +/// For reduction PHIs, RdxDesc must point to the corresponding recurrence +/// descriptor and the start value is the first operand of the recipe. +class VPWidenPHIRecipe : public VPRecipeBase, public VPUser { PHINode *Phi; - /// Descriptor for a reduction PHI. - RecurrenceDescriptor *RdxDesc = nullptr; - + /// Descriptor for a reduction PHI. + RecurrenceDescriptor *RdxDesc = nullptr; + public: - /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p - /// RdxDesc. - VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start) - : VPWidenPHIRecipe(Phi) { - this->RdxDesc = &RdxDesc; - addOperand(&Start); - } - - /// Create a VPWidenPHIRecipe for \p Phi - VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) { - new VPValue(Phi, this); - } + /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p + /// RdxDesc. + VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start) + : VPWidenPHIRecipe(Phi) { + this->RdxDesc = &RdxDesc; + addOperand(&Start); + } + + /// Create a VPWidenPHIRecipe for \p Phi + VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) { + new VPValue(Phi, this); + } ~VPWidenPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenPHISC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenPHISC; } /// Generate the phi/select nodes. @@ -1011,25 +1011,25 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; - - /// Returns the start value of the phi, if it is a reduction. - VPValue *getStartValue() { - return getNumOperands() == 0 ? nullptr : getOperand(0); - } + + /// Returns the start value of the phi, if it is a reduction. + VPValue *getStartValue() { + return getNumOperands() == 0 ? nullptr : getOperand(0); + } }; /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. -class VPBlendRecipe : public VPRecipeBase, public VPUser { +class VPBlendRecipe : public VPRecipeBase, public VPUser { PHINode *Phi; -public: +public: /// The blend operation is a User of the incoming values and of their /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value /// might be incoming with a full mask for which there is no VPValue. 
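The [I0, M0, I1, M1, ...] ordering documented here pairs each incoming value with the mask guarding it; a small sketch of walking that layout with the accessors defined just below (inspectBlend is a hypothetical helper, assuming a blend built as described above):

  static void inspectBlend(const VPBlendRecipe &Blend) {
    for (unsigned I = 0, E = Blend.getNumIncomingValues(); I != E; ++I) {
      VPValue *Incoming = Blend.getIncomingValue(I); // operand 2*I
      // A lone incoming value has a full mask with no VPValue operand.
      VPValue *Mask =
          Blend.getNumOperands() == 1 ? nullptr : Blend.getMask(I); // 2*I + 1
      (void)Incoming;
      (void)Mask;
    }
  }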
VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands) - : VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) { - new VPValue(Phi, this); + : VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) { + new VPValue(Phi, this); assert(Operands.size() > 0 && ((Operands.size() == 1) || (Operands.size() % 2 == 0)) && "Expected either a single incoming value or a positive even number " @@ -1037,19 +1037,19 @@ public: } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPBlendSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPBlendSC; } /// Return the number of incoming values, taking into account that a single /// incoming value has no mask. - unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; } + unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; } /// Return incoming value number \p Idx. - VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); } + VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); } /// Return mask number \p Idx. - VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); } + VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); } /// Generate the phi/select nodes. void execute(VPTransformState &State) override; @@ -1060,60 +1060,60 @@ public: }; /// VPInterleaveRecipe is a recipe for transforming an interleave group of load -/// or stores into one wide load/store and shuffles. The first operand of a -/// VPInterleave recipe is the address, followed by the stored values, followed -/// by an optional mask. -class VPInterleaveRecipe : public VPRecipeBase, public VPUser { +/// or stores into one wide load/store and shuffles. The first operand of a +/// VPInterleave recipe is the address, followed by the stored values, followed +/// by an optional mask. +class VPInterleaveRecipe : public VPRecipeBase, public VPUser { const InterleaveGroup<Instruction> *IG; - bool HasMask = false; - + bool HasMask = false; + public: VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, - ArrayRef<VPValue *> StoredValues, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) { - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *I = IG->getMember(i)) { - if (I->getType()->isVoidTy()) - continue; - new VPValue(I, this); - } - - for (auto *SV : StoredValues) - addOperand(SV); - if (Mask) { - HasMask = true; - addOperand(Mask); - } + ArrayRef<VPValue *> StoredValues, VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) { + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *I = IG->getMember(i)) { + if (I->getType()->isVoidTy()) + continue; + new VPValue(I, this); + } + + for (auto *SV : StoredValues) + addOperand(SV); + if (Mask) { + HasMask = true; + addOperand(Mask); + } } ~VPInterleaveRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPInterleaveSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPInterleaveSC; } /// Return the address accessed by this recipe. VPValue *getAddr() const { - return getOperand(0); // Address is the 1st, mandatory operand. + return getOperand(0); // Address is the 1st, mandatory operand. } /// Return the mask used by this recipe. 
Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { // Mask is optional and therefore the last, currently 2nd operand. - return HasMask ? getOperand(getNumOperands() - 1) : nullptr; - } - - /// Return the VPValues stored by this interleave group. If it is a load - /// interleave group, return an empty ArrayRef. - ArrayRef<VPValue *> getStoredValues() const { - // The first operand is the address, followed by the stored values, followed - // by an optional mask. - return ArrayRef<VPValue *>(op_begin(), getNumOperands()) - .slice(1, getNumOperands() - (HasMask ? 2 : 1)); - } - + return HasMask ? getOperand(getNumOperands() - 1) : nullptr; + } + + /// Return the VPValues stored by this interleave group. If it is a load + /// interleave group, return an empty ArrayRef. + ArrayRef<VPValue *> getStoredValues() const { + // The first operand is the address, followed by the stored values, followed + // by an optional mask. + return ArrayRef<VPValue *>(op_begin(), getNumOperands()) + .slice(1, getNumOperands() - (HasMask ? 2 : 1)); + } + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; @@ -1124,61 +1124,61 @@ public: const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; } }; -/// A recipe to represent inloop reduction operations, performing a reduction on -/// a vector operand into a scalar value, and adding the result to a chain. -/// The Operands are {ChainOp, VecOp, [Condition]}. -class VPReductionRecipe : public VPRecipeBase, public VPUser, public VPValue { - /// The recurrence decriptor for the reduction in question. - RecurrenceDescriptor *RdxDesc; - /// Fast math flags to use for the resulting reduction operation. - bool NoNaN; - /// Pointer to the TTI, needed to create the target reduction - const TargetTransformInfo *TTI; - -public: - VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, - VPValue *VecOp, VPValue *CondOp, bool NoNaN, - const TargetTransformInfo *TTI) - : VPRecipeBase(VPRecipeBase::VPReductionSC), VPUser({ChainOp, VecOp}), - VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), NoNaN(NoNaN), - TTI(TTI) { - if (CondOp) - addOperand(CondOp); - } - - ~VPReductionRecipe() override = default; - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVReductionSC; - } - - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPReductionSC; - } - - /// Generate the reduction in the loop - void execute(VPTransformState &State) override; - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; - - /// The VPValue of the scalar Chain being accumulated. - VPValue *getChainOp() const { return getOperand(0); } - /// The VPValue of the vector value to be reduced. - VPValue *getVecOp() const { return getOperand(1); } - /// The VPValue of the condition for the block. - VPValue *getCondOp() const { - return getNumOperands() > 2 ? getOperand(2) : nullptr; - } -}; - +/// A recipe to represent inloop reduction operations, performing a reduction on +/// a vector operand into a scalar value, and adding the result to a chain. +/// The Operands are {ChainOp, VecOp, [Condition]}. +class VPReductionRecipe : public VPRecipeBase, public VPUser, public VPValue { + /// The recurrence decriptor for the reduction in question. 
+ RecurrenceDescriptor *RdxDesc; + /// Fast math flags to use for the resulting reduction operation. + bool NoNaN; + /// Pointer to the TTI, needed to create the target reduction + const TargetTransformInfo *TTI; + +public: + VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, + VPValue *VecOp, VPValue *CondOp, bool NoNaN, + const TargetTransformInfo *TTI) + : VPRecipeBase(VPRecipeBase::VPReductionSC), VPUser({ChainOp, VecOp}), + VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), NoNaN(NoNaN), + TTI(TTI) { + if (CondOp) + addOperand(CondOp); + } + + ~VPReductionRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVReductionSC; + } + + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPReductionSC; + } + + /// Generate the reduction in the loop + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + + /// The VPValue of the scalar Chain being accumulated. + VPValue *getChainOp() const { return getOperand(0); } + /// The VPValue of the vector value to be reduced. + VPValue *getVecOp() const { return getOperand(1); } + /// The VPValue of the condition for the block. + VPValue *getCondOp() const { + return getNumOperands() > 2 ? getOperand(2) : nullptr; + } +}; + /// VPReplicateRecipe replicates a given instruction producing multiple scalar /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be /// uniform only one copy, per lane zero, will be generated. -class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue { /// Indicator if only a single replica per lane is needed. bool IsUniform; @@ -1192,9 +1192,9 @@ public: template <typename IterT> VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands, bool IsUniform, bool IsPredicated = false) - : VPRecipeBase(VPReplicateSC), VPUser(Operands), - VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform), - IsPredicated(IsPredicated) { + : VPRecipeBase(VPReplicateSC), VPUser(Operands), + VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform), + IsPredicated(IsPredicated) { // Retain the previous behavior of predicateInstructions(), where an // insert-element of a predicated instruction got hoisted into the // predicated basic block iff it was its only user. This is achieved by @@ -1206,14 +1206,14 @@ public: ~VPReplicateRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPReplicateSC; - } - - static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVReplicateSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPReplicateSC; } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVReplicateSC; + } + /// Generate replicas of the desired Ingredient. Replicas will be generated /// for all parts and lanes unless a specific part and lane are specified in /// the \p State. @@ -1224,21 +1224,21 @@ public: /// Print the recipe. 
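To make the {ChainOp, VecOp, [Condition]} operand layout of VPReductionRecipe concrete, a short sketch using only the accessors shown a few lines above; isConditionalReduction is a hypothetical helper, not part of the patch:

  static bool isConditionalReduction(const VPReductionRecipe &R) {
    assert(R.getChainOp() == R.getOperand(0) && "chain is always operand 0");
    assert(R.getVecOp() == R.getOperand(1) && "vector operand is operand 1");
    // The condition, when present, is appended as a third operand in the
    // constructor above, so getCondOp() is non-null only for masked reductions.
    return R.getCondOp() != nullptr;
  }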
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; - - bool isUniform() const { return IsUniform; } + + bool isUniform() const { return IsUniform; } }; /// A recipe for generating conditional branches on the bits of a mask. -class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser { +class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser { public: VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) { if (BlockInMask) // nullptr means all-one mask. - addOperand(BlockInMask); + addOperand(BlockInMask); } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC; } /// Generate the extraction of the appropriate bit from the block mask and the @@ -1250,7 +1250,7 @@ public: VPSlotTracker &SlotTracker) const override { O << " +\n" << Indent << "\"BRANCH-ON-MASK "; if (VPValue *Mask = getMask()) - Mask->printAsOperand(O, SlotTracker); + Mask->printAsOperand(O, SlotTracker); else O << " All-One"; O << "\\l\""; @@ -1259,9 +1259,9 @@ public: /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - assert(getNumOperands() <= 1 && "should have either 0 or 1 operands"); + assert(getNumOperands() <= 1 && "should have either 0 or 1 operands"); // Mask is optional. - return getNumOperands() == 1 ? getOperand(0) : nullptr; + return getNumOperands() == 1 ? getOperand(0) : nullptr; } }; @@ -1270,20 +1270,20 @@ public: /// order to merge values that are set under such a branch and feed their uses. /// The phi nodes can be scalar or vector depending on the users of the value. /// This recipe works in concert with VPBranchOnMaskRecipe. -class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser { +class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser { public: /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi /// nodes after merging back from a Branch-on-Mask. - VPPredInstPHIRecipe(VPValue *PredV) - : VPRecipeBase(VPPredInstPHISC), VPUser(PredV) { - new VPValue(PredV->getUnderlyingValue(), this); - } + VPPredInstPHIRecipe(VPValue *PredV) + : VPRecipeBase(VPPredInstPHISC), VPUser(PredV) { + new VPValue(PredV->getUnderlyingValue(), this); + } ~VPPredInstPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPPredInstPHISC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPPredInstPHISC; } /// Generates phi nodes for live-outs as needed to retain SSA form. @@ -1300,59 +1300,59 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase, - public VPUser { - Instruction &Ingredient; +class VPWidenMemoryInstructionRecipe : public VPRecipeBase, + public VPUser { + Instruction &Ingredient; void setMask(VPValue *Mask) { if (!Mask) return; - addOperand(Mask); + addOperand(Mask); } bool isMasked() const { - return isStore() ? getNumOperands() == 3 : getNumOperands() == 2; + return isStore() ? 
getNumOperands() == 3 : getNumOperands() == 2; } public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}), - Ingredient(Load) { - new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); + : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}), + Ingredient(Load) { + new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); setMask(Mask); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}), - Ingredient(Store) { + : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}), + Ingredient(Store) { setMask(Mask); } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; } /// Return the address accessed by this recipe. VPValue *getAddr() const { - return getOperand(0); // Address is the 1st, mandatory operand. + return getOperand(0); // Address is the 1st, mandatory operand. } /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { // Mask is optional and therefore the last operand. - return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; + return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; } - /// Returns true if this recipe is a store. - bool isStore() const { return isa<StoreInst>(Ingredient); } - + /// Returns true if this recipe is a store. + bool isStore() const { return isa<StoreInst>(Ingredient); } + /// Return the address accessed by this recipe. VPValue *getStoredValue() const { - assert(isStore() && "Stored value only available for store instructions"); - return getOperand(1); // Stored value is the 2nd, mandatory operand. + assert(isStore() && "Stored value only available for store instructions"); + return getOperand(1); // Stored value is the 2nd, mandatory operand. } /// Generate the wide load/store. @@ -1365,16 +1365,16 @@ public: /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase { -public: - VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) { - new VPValue(nullptr, this); - } +public: + VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) { + new VPValue(nullptr, this); + } ~VPWidenCanonicalIVRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; } /// Generate a canonical vector induction variable of the vector loop, with @@ -1461,11 +1461,11 @@ public: /// this VPBasicBlock, thereby "executing" the VPlan. void execute(struct VPTransformState *State) override; - /// Return the position of the first non-phi node recipe in the block. - iterator getFirstNonPhi(); - - void dropAllReferences(VPValue *NewValue) override; - + /// Return the position of the first non-phi node recipe in the block. 
+ iterator getFirstNonPhi(); + + void dropAllReferences(VPValue *NewValue) override; + private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. @@ -1506,11 +1506,11 @@ public: IsReplicator(IsReplicator) {} ~VPRegionBlock() override { - if (Entry) { - VPValue DummyValue; - Entry->dropAllReferences(&DummyValue); + if (Entry) { + VPValue DummyValue; + Entry->dropAllReferences(&DummyValue); deleteCFG(Entry); - } + } } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1555,8 +1555,8 @@ public: /// The method which generates the output IR instructions that correspond to /// this VPRegionBlock, thereby "executing" the VPlan. void execute(struct VPTransformState *State) override; - - void dropAllReferences(VPValue *NewValue) override; + + void dropAllReferences(VPValue *NewValue) override; }; //===----------------------------------------------------------------------===// @@ -1694,7 +1694,7 @@ class VPlan { VPBlockBase *Entry; /// Holds the VFs applicable to this VPlan. - SmallSetVector<ElementCount, 2> VFs; + SmallSetVector<ElementCount, 2> VFs; /// Holds the name of the VPlan, for printing. std::string Name; @@ -1714,10 +1714,10 @@ class VPlan { /// VPlan. Value2VPValueTy Value2VPValue; - /// Contains all VPValues that been allocated by addVPValue directly and need - /// to be free when the plan's destructor is called. - SmallVector<VPValue *, 16> VPValuesToFree; - + /// Contains all VPValues that been allocated by addVPValue directly and need + /// to be free when the plan's destructor is called. + SmallVector<VPValue *, 16> VPValuesToFree; + /// Holds the VPLoopInfo analysis for this VPlan. VPLoopInfo VPLInfo; @@ -1731,15 +1731,15 @@ public: } ~VPlan() { - if (Entry) { - VPValue DummyValue; - for (VPBlockBase *Block : depth_first(Entry)) - Block->dropAllReferences(&DummyValue); - + if (Entry) { + VPValue DummyValue; + for (VPBlockBase *Block : depth_first(Entry)) + Block->dropAllReferences(&DummyValue); + VPBlockBase::deleteCFG(Entry); - } - for (VPValue *VPV : VPValuesToFree) - delete VPV; + } + for (VPValue *VPV : VPValuesToFree) + delete VPV; if (BackedgeTakenCount) delete BackedgeTakenCount; for (VPValue *Def : VPExternalDefs) @@ -1767,9 +1767,9 @@ public: return BackedgeTakenCount; } - void addVF(ElementCount VF) { VFs.insert(VF); } + void addVF(ElementCount VF) { VFs.insert(VF); } - bool hasVF(ElementCount VF) { return VFs.count(VF); } + bool hasVF(ElementCount VF) { return VFs.count(VF); } const std::string &getName() const { return Name; } @@ -1789,17 +1789,17 @@ public: void addVPValue(Value *V) { assert(V && "Trying to add a null Value to VPlan"); assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); - VPValue *VPV = new VPValue(V); - Value2VPValue[V] = VPV; - VPValuesToFree.push_back(VPV); - } - - void addVPValue(Value *V, VPValue *VPV) { - assert(V && "Trying to add a null Value to VPlan"); - assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); - Value2VPValue[V] = VPV; + VPValue *VPV = new VPValue(V); + Value2VPValue[V] = VPV; + VPValuesToFree.push_back(VPV); } + void addVPValue(Value *V, VPValue *VPV) { + assert(V && "Trying to add a null Value to VPlan"); + assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); + Value2VPValue[V] = VPV; + } + VPValue *getVPValue(Value *V) { assert(V && "Trying to get the VPValue of a null Value"); assert(Value2VPValue.count(V) && "Value does not exist in VPlan"); @@ -1813,8 +1813,8 @@ 
public: return getVPValue(V); } - void removeVPValueFor(Value *V) { Value2VPValue.erase(V); } - + void removeVPValueFor(Value *V) { Value2VPValue.erase(V); } + /// Return the VPLoopInfo analysis for this VPlan. VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } @@ -1892,13 +1892,13 @@ private: void dump(); - static void printAsIngredient(raw_ostream &O, const Value *V); + static void printAsIngredient(raw_ostream &O, const Value *V); }; struct VPlanIngredient { - const Value *V; + const Value *V; - VPlanIngredient(const Value *V) : V(V) {} + VPlanIngredient(const Value *V) : V(V) {} }; inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { @@ -2048,7 +2048,7 @@ public: /// \returns nullptr if doesn't have such group. InterleaveGroup<VPInstruction> * getInterleaveGroup(VPInstruction *Instr) const { - return InterleaveGroupMap.lookup(Instr); + return InterleaveGroupMap.lookup(Instr); } }; @@ -2132,7 +2132,7 @@ class VPlanSlp { public: VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} - ~VPlanSlp() = default; + ~VPlanSlp() = default; /// Tries to build an SLP tree rooted at \p Operands and returns a /// VPInstruction combining \p Operands, if they can be combined. diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp index ac3b3505dc..7da23508b7 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -191,7 +191,7 @@ void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) { // Generate edge predicates and append them to the block predicate. RPO is // necessary since the predecessor blocks' block predicate needs to be set // before the current block's block predicate can be computed. - for (VPBlockBase *Block : RPOT) { + for (VPBlockBase *Block : RPOT) { // TODO: Handle nested regions once we start generating the same. assert(!isa<VPRegionBlock>(Block) && "Nested region not expected"); createOrPropagatePredicates(Block, Region); @@ -208,7 +208,7 @@ void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) { ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry()); VPBlockBase *PrevBlock = nullptr; - for (VPBlockBase *CurrBlock : RPOT) { + for (VPBlockBase *CurrBlock : RPOT) { // TODO: Handle nested regions once we start generating the same. assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected"); diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp index 6f21bf4429..5b8145ff62 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -124,7 +124,7 @@ bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const { for (auto &I : *Parent) { auto *VPI = cast<VPInstruction>(&I); if (VPI->getOpcode() == Instruction::Load && - llvm::is_contained(Operands, VPI)) + llvm::is_contained(Operands, VPI)) LoadsSeen++; if (LoadsSeen == Operands.size()) @@ -161,8 +161,8 @@ static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values, unsigned OperandIndex) { SmallVector<VPValue *, 4> Operands; for (VPValue *V : Values) { - // Currently we only support VPInstructions. - auto *U = cast<VPInstruction>(V); + // Currently we only support VPInstructions. 
+ auto *U = cast<VPInstruction>(V); Operands.push_back(U->getOperand(OperandIndex)); } return Operands; @@ -223,20 +223,20 @@ static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, /// Traverses and compares operands of V1 and V2 to MaxLevel. static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, VPInterleavedAccessInfo &IAI) { - auto *I1 = dyn_cast<VPInstruction>(V1); - auto *I2 = dyn_cast<VPInstruction>(V2); - // Currently we only support VPInstructions. - if (!I1 || !I2) + auto *I1 = dyn_cast<VPInstruction>(V1); + auto *I2 = dyn_cast<VPInstruction>(V2); + // Currently we only support VPInstructions. + if (!I1 || !I2) return 0; if (MaxLevel == 0) - return (unsigned)areConsecutiveOrMatch(I1, I2, IAI); + return (unsigned)areConsecutiveOrMatch(I1, I2, IAI); unsigned Score = 0; - for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I) - for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J) - Score += - getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI); + for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I) + for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J) + Score += + getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI); return Score; } @@ -466,8 +466,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { auto *VPI = new VPInstruction(Opcode, CombinedOperands); VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); - LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " - << *cast<VPInstruction>(Values[0]) << "\n"); + LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " + << *cast<VPInstruction>(Values[0]) << "\n"); addCombined(Values, VPI); return VPI; } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1a54603faf..6773dc5a61 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -48,8 +48,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPInstruction *VPInst = cast<VPInstruction>(Ingredient); Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue()); if (DeadInstructions.count(Inst)) { - VPValue DummyValue; - VPInst->replaceAllUsesWith(&DummyValue); + VPValue DummyValue; + VPInst->replaceAllUsesWith(&DummyValue); Ingredient->eraseFromParent(); continue; } @@ -68,8 +68,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( InductionDescriptor II = Inductions.lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || II.getKind() == InductionDescriptor::IK_FpInduction) { - VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start); + VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start); } else NewRecipe = new VPWidenPHIRecipe(Phi); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { @@ -80,11 +80,11 @@ void VPlanTransforms::VPInstructionsToVPRecipes( new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands())); NewRecipe->insertBefore(Ingredient); - if (NewRecipe->getNumDefinedValues() == 1) - VPInst->replaceAllUsesWith(NewRecipe->getVPValue()); - else - assert(NewRecipe->getNumDefinedValues() == 0 && - "Only recpies with zero or one defined values expected"); + if (NewRecipe->getNumDefinedValues() == 1) + VPInst->replaceAllUsesWith(NewRecipe->getVPValue()); + else 
+ assert(NewRecipe->getNumDefinedValues() == 0 && + "Only recpies with zero or one defined values expected"); Ingredient->eraseFromParent(); } } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h index ed572ca366..b43c8398b6 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h @@ -10,9 +10,9 @@ /// This file contains the declarations of the entities induced by Vectorization /// Plans, e.g. the instructions the VPlan intends to generate if executed. /// VPlan models the following entities: -/// VPValue VPUser VPDef -/// | | -/// VPInstruction +/// VPValue VPUser VPDef +/// | | +/// VPInstruction /// These are documented in docs/VectorizationPlan.rst. /// //===----------------------------------------------------------------------===// @@ -21,9 +21,9 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/TinyPtrVector.h" #include "llvm/ADT/iterator_range.h" namespace llvm { @@ -31,11 +31,11 @@ namespace llvm { // Forward declarations. class raw_ostream; class Value; -class VPDef; +class VPDef; class VPSlotTracker; class VPUser; -class VPRecipeBase; -class VPWidenMemoryInstructionRecipe; +class VPRecipeBase; +class VPWidenMemoryInstructionRecipe; // This is the base class of the VPlan Def/Use graph, used for modeling the data // flow into, within and out of the VPlan. VPValues can stand for live-ins @@ -43,14 +43,14 @@ class VPWidenMemoryInstructionRecipe; // and live-outs which the VPlan will need to fix accordingly. class VPValue { friend class VPBuilder; - friend class VPDef; - friend class VPInstruction; + friend class VPDef; + friend class VPInstruction; friend struct VPlanTransforms; friend class VPBasicBlock; friend class VPInterleavedAccessInfo; friend class VPSlotTracker; - friend class VPRecipeBase; - friend class VPWidenMemoryInstructionRecipe; + friend class VPRecipeBase; + friend class VPWidenMemoryInstructionRecipe; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -60,12 +60,12 @@ protected: // Hold the underlying Value, if any, attached to this VPValue. Value *UnderlyingVal; - /// Pointer to the VPDef that defines this VPValue. If it is nullptr, the - /// VPValue is not defined by any recipe modeled in VPlan. - VPDef *Def; - - VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr); + /// Pointer to the VPDef that defines this VPValue. If it is nullptr, the + /// VPValue is not defined by any recipe modeled in VPlan. + VPDef *Def; + VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr); + // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to // the front-end and back-end of VPlan so that the middle-end is as // independent as possible of the underlying IR. We grant access to the @@ -80,33 +80,33 @@ protected: } public: - /// Return the underlying Value attached to this VPValue. - Value *getUnderlyingValue() { return UnderlyingVal; } - const Value *getUnderlyingValue() const { return UnderlyingVal; } - + /// Return the underlying Value attached to this VPValue. 
+ Value *getUnderlyingValue() { return UnderlyingVal; } + const Value *getUnderlyingValue() const { return UnderlyingVal; } + /// An enumeration for keeping track of the concrete subclass of VPValue that /// are actually instantiated. Values of this enumeration are kept in the /// SubclassID field of the VPValue objects. They are used for concrete /// type identification. - enum { - VPValueSC, - VPVInstructionSC, - VPVMemoryInstructionSC, - VPVReductionSC, - VPVReplicateSC, - VPVWidenSC, - VPVWidenCallSC, - VPVWidenGEPSC, - VPVWidenSelectSC, - }; - - VPValue(Value *UV = nullptr, VPDef *Def = nullptr) - : VPValue(VPValueSC, UV, Def) {} + enum { + VPValueSC, + VPVInstructionSC, + VPVMemoryInstructionSC, + VPVReductionSC, + VPVReplicateSC, + VPVWidenSC, + VPVWidenCallSC, + VPVWidenGEPSC, + VPVWidenSelectSC, + }; + + VPValue(Value *UV = nullptr, VPDef *Def = nullptr) + : VPValue(VPValueSC, UV, Def) {} VPValue(const VPValue &) = delete; VPValue &operator=(const VPValue &) = delete; - virtual ~VPValue(); - + virtual ~VPValue(); + /// \return an ID for the concrete type of this object. /// This is used to implement the classof checks. This should not be used /// for any other purpose, as the values may change as LLVM evolves. @@ -115,28 +115,28 @@ public: void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const; void print(raw_ostream &OS, VPSlotTracker &Tracker) const; - /// Dump the value to stderr (for debugging). - void dump() const; - + /// Dump the value to stderr (for debugging). + void dump() const; + unsigned getNumUsers() const { return Users.size(); } void addUser(VPUser &User) { Users.push_back(&User); } - /// Remove a single \p User from the list of users. - void removeUser(VPUser &User) { - bool Found = false; - // The same user can be added multiple times, e.g. because the same VPValue - // is used twice by the same VPUser. Remove a single one. - erase_if(Users, [&User, &Found](VPUser *Other) { - if (Found) - return false; - if (Other == &User) { - Found = true; - return true; - } - return false; - }); - } - + /// Remove a single \p User from the list of users. + void removeUser(VPUser &User) { + bool Found = false; + // The same user can be added multiple times, e.g. because the same VPValue + // is used twice by the same VPUser. Remove a single one. + erase_if(Users, [&User, &Found](VPUser *Other) { + if (Found) + return false; + if (Other == &User) { + Found = true; + return true; + } + return false; + }); + } + typedef SmallVectorImpl<VPUser *>::iterator user_iterator; typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator; typedef iterator_range<user_iterator> user_range; @@ -164,17 +164,17 @@ public: } void replaceAllUsesWith(VPValue *New); - - VPDef *getDef() { return Def; } - - /// Returns the underlying IR value, if this VPValue is defined outside the - /// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef - /// inside a VPlan. - Value *getLiveInIRValue() { - assert(!getDef() && - "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); - return getUnderlyingValue(); - } + + VPDef *getDef() { return Def; } + + /// Returns the underlying IR value, if this VPValue is defined outside the + /// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef + /// inside a VPlan. 
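A brief note on the Def pointer introduced here: a VPValue with no defining VPDef is a live-in flowing in from the input IR, which is the only case in which getLiveInIRValue() may be queried. A one-line sketch (isLiveIn is a hypothetical helper):

  static bool isLiveIn(VPValue &V) { return V.getDef() == nullptr; }

Note also that removeUser(), shown above, deliberately erases a single occurrence, since one VPUser can reach the same VPValue through several operands.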
+ Value *getLiveInIRValue() { + assert(!getDef() && + "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); + return getUnderlyingValue(); + } }; typedef DenseMap<Value *, VPValue *> Value2VPValueTy; @@ -184,32 +184,32 @@ raw_ostream &operator<<(raw_ostream &OS, const VPValue &V); /// This class augments VPValue with operands which provide the inverse def-use /// edges from VPValue's users to their defs. -class VPUser { +class VPUser { SmallVector<VPValue *, 2> Operands; protected: - /// Print the operands to \p O. - void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const; - -public: - VPUser() {} - VPUser(ArrayRef<VPValue *> Operands) { + /// Print the operands to \p O. + void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const; + +public: + VPUser() {} + VPUser(ArrayRef<VPValue *> Operands) { for (VPValue *Operand : Operands) addOperand(Operand); } VPUser(std::initializer_list<VPValue *> Operands) : VPUser(ArrayRef<VPValue *>(Operands)) {} - template <typename IterT> VPUser(iterator_range<IterT> Operands) { + template <typename IterT> VPUser(iterator_range<IterT> Operands) { for (VPValue *Operand : Operands) addOperand(Operand); } VPUser(const VPUser &) = delete; VPUser &operator=(const VPUser &) = delete; - virtual ~VPUser() { - for (VPValue *Op : operands()) - Op->removeUser(*this); + virtual ~VPUser() { + for (VPValue *Op : operands()) + Op->removeUser(*this); } void addOperand(VPValue *Operand) { @@ -223,11 +223,11 @@ public: return Operands[N]; } - void setOperand(unsigned I, VPValue *New) { - Operands[I]->removeUser(*this); - Operands[I] = New; - New->addUser(*this); - } + void setOperand(unsigned I, VPValue *New) { + Operands[I]->removeUser(*this); + Operands[I] = New; + New->addUser(*this); + } typedef SmallVectorImpl<VPValue *>::iterator operand_iterator; typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator; @@ -242,110 +242,110 @@ public: const_operand_range operands() const { return const_operand_range(op_begin(), op_end()); } - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *Recipe); + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *Recipe); }; - -/// This class augments a recipe with a set of VPValues defined by the recipe. -/// It allows recipes to define zero, one or multiple VPValues. A VPDef owns -/// the VPValues it defines and is responsible for deleting its defined values. -/// Single-value VPDefs that also inherit from VPValue must make sure to inherit -/// from VPDef before VPValue. -class VPDef { - friend class VPValue; - - /// Subclass identifier (for isa/dyn_cast). - const unsigned char SubclassID; - - /// The VPValues defined by this VPDef. - TinyPtrVector<VPValue *> DefinedValues; - - /// Add \p V as a defined value by this VPDef. - void addDefinedValue(VPValue *V) { - assert(V->getDef() == this && - "can only add VPValue already linked with this VPDef"); - DefinedValues.push_back(V); - } - - /// Remove \p V from the values defined by this VPDef. \p V must be a defined - /// value of this VPDef. 
- void removeDefinedValue(VPValue *V) { - assert(V->getDef() == this && - "can only remove VPValue linked with this VPDef"); - assert(is_contained(DefinedValues, V) && - "VPValue to remove must be in DefinedValues"); - erase_value(DefinedValues, V); - V->Def = nullptr; - } - -public: - /// An enumeration for keeping track of the concrete subclass of VPRecipeBase - /// that is actually instantiated. Values of this enumeration are kept in the - /// SubclassID field of the VPRecipeBase objects. They are used for concrete - /// type identification. - using VPRecipeTy = enum { - VPBlendSC, - VPBranchOnMaskSC, - VPInstructionSC, - VPInterleaveSC, - VPPredInstPHISC, - VPReductionSC, - VPReplicateSC, - VPWidenCallSC, - VPWidenCanonicalIVSC, - VPWidenGEPSC, - VPWidenIntOrFpInductionSC, - VPWidenMemoryInstructionSC, - VPWidenPHISC, - VPWidenSC, - VPWidenSelectSC - }; - - VPDef(const unsigned char SC) : SubclassID(SC) {} - - virtual ~VPDef() { - for (VPValue *D : make_early_inc_range(DefinedValues)) { - assert(D->Def == this && - "all defined VPValues should point to the containing VPDef"); - assert(D->getNumUsers() == 0 && - "all defined VPValues should have no more users"); - D->Def = nullptr; - delete D; - } - } - - /// Returns the VPValue with index \p I defined by the VPDef. - VPValue *getVPValue(unsigned I = 0) { - assert(DefinedValues[I] && "defined value must be non-null"); - return DefinedValues[I]; - } - const VPValue *getVPValue(unsigned I = 0) const { - assert(DefinedValues[I] && "defined value must be non-null"); - return DefinedValues[I]; - } - - /// Returns an ArrayRef of the values defined by the VPDef. - ArrayRef<VPValue *> definedValues() { return DefinedValues; } - /// Returns an ArrayRef of the values defined by the VPDef. - ArrayRef<VPValue *> definedValues() const { return DefinedValues; } - - /// Returns the number of values defined by the VPDef. - unsigned getNumDefinedValues() const { return DefinedValues.size(); } - - /// \return an ID for the concrete type of this object. - /// This is used to implement the classof checks. This should not be used - /// for any other purpose, as the values may change as LLVM evolves. - unsigned getVPDefID() const { return SubclassID; } - - /// Dump the VPDef to stderr (for debugging). - void dump() const; - - /// Each concrete VPDef prints itself. - virtual void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const = 0; -}; - + +/// This class augments a recipe with a set of VPValues defined by the recipe. +/// It allows recipes to define zero, one or multiple VPValues. A VPDef owns +/// the VPValues it defines and is responsible for deleting its defined values. +/// Single-value VPDefs that also inherit from VPValue must make sure to inherit +/// from VPDef before VPValue. +class VPDef { + friend class VPValue; + + /// Subclass identifier (for isa/dyn_cast). + const unsigned char SubclassID; + + /// The VPValues defined by this VPDef. + TinyPtrVector<VPValue *> DefinedValues; + + /// Add \p V as a defined value by this VPDef. + void addDefinedValue(VPValue *V) { + assert(V->getDef() == this && + "can only add VPValue already linked with this VPDef"); + DefinedValues.push_back(V); + } + + /// Remove \p V from the values defined by this VPDef. \p V must be a defined + /// value of this VPDef. 
+ void removeDefinedValue(VPValue *V) { + assert(V->getDef() == this && + "can only remove VPValue linked with this VPDef"); + assert(is_contained(DefinedValues, V) && + "VPValue to remove must be in DefinedValues"); + erase_value(DefinedValues, V); + V->Def = nullptr; + } + +public: + /// An enumeration for keeping track of the concrete subclass of VPRecipeBase + /// that is actually instantiated. Values of this enumeration are kept in the + /// SubclassID field of the VPRecipeBase objects. They are used for concrete + /// type identification. + using VPRecipeTy = enum { + VPBlendSC, + VPBranchOnMaskSC, + VPInstructionSC, + VPInterleaveSC, + VPPredInstPHISC, + VPReductionSC, + VPReplicateSC, + VPWidenCallSC, + VPWidenCanonicalIVSC, + VPWidenGEPSC, + VPWidenIntOrFpInductionSC, + VPWidenMemoryInstructionSC, + VPWidenPHISC, + VPWidenSC, + VPWidenSelectSC + }; + + VPDef(const unsigned char SC) : SubclassID(SC) {} + + virtual ~VPDef() { + for (VPValue *D : make_early_inc_range(DefinedValues)) { + assert(D->Def == this && + "all defined VPValues should point to the containing VPDef"); + assert(D->getNumUsers() == 0 && + "all defined VPValues should have no more users"); + D->Def = nullptr; + delete D; + } + } + + /// Returns the VPValue with index \p I defined by the VPDef. + VPValue *getVPValue(unsigned I = 0) { + assert(DefinedValues[I] && "defined value must be non-null"); + return DefinedValues[I]; + } + const VPValue *getVPValue(unsigned I = 0) const { + assert(DefinedValues[I] && "defined value must be non-null"); + return DefinedValues[I]; + } + + /// Returns an ArrayRef of the values defined by the VPDef. + ArrayRef<VPValue *> definedValues() { return DefinedValues; } + /// Returns an ArrayRef of the values defined by the VPDef. + ArrayRef<VPValue *> definedValues() const { return DefinedValues; } + + /// Returns the number of values defined by the VPDef. + unsigned getNumDefinedValues() const { return DefinedValues.size(); } + + /// \return an ID for the concrete type of this object. + /// This is used to implement the classof checks. This should not be used + /// for any other purpose, as the values may change as LLVM evolves. + unsigned getVPDefID() const { return SubclassID; } + + /// Dump the VPDef to stderr (for debugging). + void dump() const; + + /// Each concrete VPDef prints itself. + virtual void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const = 0; +}; + class VPlan; class VPBasicBlock; class VPRegionBlock; @@ -365,7 +365,7 @@ class VPSlotTracker { void assignSlots(const VPlan &Plan); public: - VPSlotTracker(const VPlan *Plan = nullptr) { + VPSlotTracker(const VPlan *Plan = nullptr) { if (Plan) assignSlots(*Plan); } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp index 6eec8d14de..b8abab63df 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -65,7 +65,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { for (const VPBlockBase *Succ : Successors) { // There must be a bi-directional link between block and successor. 
const auto &SuccPreds = Succ->getPredecessors(); - assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link."); + assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link."); (void)SuccPreds; } @@ -84,7 +84,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { // There must be a bi-directional link between block and predecessor. const auto &PredSuccs = Pred->getSuccessors(); - assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link."); + assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link."); (void)PredSuccs; } } diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp index 787f146bdd..7b0a72de4e 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" @@ -34,7 +34,7 @@ using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "vector-combine" -STATISTIC(NumVecLoad, "Number of vector loads formed"); +STATISTIC(NumVecLoad, "Number of vector loads formed"); STATISTIC(NumVecCmp, "Number of vector compares formed"); STATISTIC(NumVecBO, "Number of vector binops formed"); STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed"); @@ -67,7 +67,7 @@ private: const TargetTransformInfo &TTI; const DominatorTree &DT; - bool vectorizeLoadInsert(Instruction &I); + bool vectorizeLoadInsert(Instruction &I); ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, ExtractElementInst *Ext1, unsigned PreferredExtractIndex) const; @@ -91,138 +91,138 @@ static void replaceValue(Value &Old, Value &New) { New.takeName(&Old); } -bool VectorCombine::vectorizeLoadInsert(Instruction &I) { - // Match insert into fixed vector of scalar value. - // TODO: Handle non-zero insert index. - auto *Ty = dyn_cast<FixedVectorType>(I.getType()); - Value *Scalar; - if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || - !Scalar->hasOneUse()) - return false; - - // Optionally match an extract from another vector. - Value *X; - bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt())); - if (!HasExtract) - X = Scalar; - - // Match source value as load of scalar or vector. - // Do not vectorize scalar load (widening) if atomic/volatile or under - // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions - // or create data races non-existent in the source. - auto *Load = dyn_cast<LoadInst>(X); - if (!Load || !Load->isSimple() || !Load->hasOneUse() || - Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || - mustSuppressSpeculation(*Load)) - return false; - - const DataLayout &DL = I.getModule()->getDataLayout(); - Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); - assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); - - // If original AS != Load's AS, we can't bitcast the original pointer and have - // to use Load's operand instead. Ideally we would want to strip pointer casts - // without changing AS, but there's no API to do that ATM. 
- unsigned AS = Load->getPointerAddressSpace(); - if (AS != SrcPtr->getType()->getPointerAddressSpace()) - SrcPtr = Load->getPointerOperand(); - - // We are potentially transforming byte-sized (8-bit) memory accesses, so make - // sure we have all of our type-based constraints in place for this target. - Type *ScalarTy = Scalar->getType(); - uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); - unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); - if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || - ScalarSize % 8 != 0) - return false; - - // Check safety of replacing the scalar load with a larger vector load. - // We use minimal alignment (maximum flexibility) because we only care about - // the dereferenceable region. When calculating cost and creating a new op, - // we may use a larger value based on alignment attributes. - unsigned MinVecNumElts = MinVectorSize / ScalarSize; - auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false); - unsigned OffsetEltIndex = 0; - Align Alignment = Load->getAlign(); - if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) { - // It is not safe to load directly from the pointer, but we can still peek - // through gep offsets and check if it safe to load from a base address with - // updated alignment. If it is, we can shuffle the element(s) into place - // after loading. - unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType()); - APInt Offset(OffsetBitWidth, 0); - SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); - - // We want to shuffle the result down from a high element of a vector, so - // the offset must be positive. - if (Offset.isNegative()) - return false; - - // The offset must be a multiple of the scalar element to shuffle cleanly - // in the element's size. - uint64_t ScalarSizeInBytes = ScalarSize / 8; - if (Offset.urem(ScalarSizeInBytes) != 0) - return false; - - // If we load MinVecNumElts, will our target element still be loaded? - OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue(); - if (OffsetEltIndex >= MinVecNumElts) - return false; - - if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) - return false; - - // Update alignment with offset value. Note that the offset could be negated - // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but - // negation does not change the result of the alignment calculation. - Alignment = commonAlignment(Alignment, Offset.getZExtValue()); - } - - // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 - // Use the greater of the alignment on the load or its source pointer. - Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); - Type *LoadTy = Load->getType(); - InstructionCost OldCost = - TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); - APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0); - OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, - /* Insert */ true, HasExtract); - - // New pattern: load VecPtr - InstructionCost NewCost = - TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); - // Optionally, we are shuffling the loaded vector element(s) into place. - if (OffsetEltIndex) - NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy); - - // We can aggressively convert to the vector form because the backend can - // invert this transform if it does not result in a performance win. 
- if (OldCost < NewCost || !NewCost.isValid()) - return false; - - // It is safe and potentially profitable to load a vector directly: - // inselt undef, load Scalar, 0 --> load VecPtr - IRBuilder<> Builder(Load); - Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS)); - Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); - - // Set everything but element 0 to undef to prevent poison from propagating - // from the extra loaded memory. This will also optionally shrink/grow the - // vector from the loaded size to the output size. - // We assume this operation has no cost in codegen if there was no offset. - // Note that we could use freeze to avoid poison problems, but then we might - // still need a shuffle to change the vector size. - unsigned OutputNumElts = Ty->getNumElements(); - SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem); - assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); - Mask[0] = OffsetEltIndex; - VecLd = Builder.CreateShuffleVector(VecLd, Mask); - - replaceValue(I, *VecLd); - ++NumVecLoad; - return true; -} - +bool VectorCombine::vectorizeLoadInsert(Instruction &I) { + // Match insert into fixed vector of scalar value. + // TODO: Handle non-zero insert index. + auto *Ty = dyn_cast<FixedVectorType>(I.getType()); + Value *Scalar; + if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || + !Scalar->hasOneUse()) + return false; + + // Optionally match an extract from another vector. + Value *X; + bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt())); + if (!HasExtract) + X = Scalar; + + // Match source value as load of scalar or vector. + // Do not vectorize scalar load (widening) if atomic/volatile or under + // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions + // or create data races non-existent in the source. + auto *Load = dyn_cast<LoadInst>(X); + if (!Load || !Load->isSimple() || !Load->hasOneUse() || + Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || + mustSuppressSpeculation(*Load)) + return false; + + const DataLayout &DL = I.getModule()->getDataLayout(); + Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); + assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); + + // If original AS != Load's AS, we can't bitcast the original pointer and have + // to use Load's operand instead. Ideally we would want to strip pointer casts + // without changing AS, but there's no API to do that ATM. + unsigned AS = Load->getPointerAddressSpace(); + if (AS != SrcPtr->getType()->getPointerAddressSpace()) + SrcPtr = Load->getPointerOperand(); + + // We are potentially transforming byte-sized (8-bit) memory accesses, so make + // sure we have all of our type-based constraints in place for this target. + Type *ScalarTy = Scalar->getType(); + uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); + unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); + if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || + ScalarSize % 8 != 0) + return false; + + // Check safety of replacing the scalar load with a larger vector load. + // We use minimal alignment (maximum flexibility) because we only care about + // the dereferenceable region. When calculating cost and creating a new op, + // we may use a larger value based on alignment attributes. 
+ unsigned MinVecNumElts = MinVectorSize / ScalarSize; + auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false); + unsigned OffsetEltIndex = 0; + Align Alignment = Load->getAlign(); + if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) { + // It is not safe to load directly from the pointer, but we can still peek + // through gep offsets and check if it safe to load from a base address with + // updated alignment. If it is, we can shuffle the element(s) into place + // after loading. + unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType()); + APInt Offset(OffsetBitWidth, 0); + SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + // We want to shuffle the result down from a high element of a vector, so + // the offset must be positive. + if (Offset.isNegative()) + return false; + + // The offset must be a multiple of the scalar element to shuffle cleanly + // in the element's size. + uint64_t ScalarSizeInBytes = ScalarSize / 8; + if (Offset.urem(ScalarSizeInBytes) != 0) + return false; + + // If we load MinVecNumElts, will our target element still be loaded? + OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue(); + if (OffsetEltIndex >= MinVecNumElts) + return false; + + if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) + return false; + + // Update alignment with offset value. Note that the offset could be negated + // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but + // negation does not change the result of the alignment calculation. + Alignment = commonAlignment(Alignment, Offset.getZExtValue()); + } + + // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 + // Use the greater of the alignment on the load or its source pointer. + Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); + Type *LoadTy = Load->getType(); + InstructionCost OldCost = + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); + APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0); + OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, + /* Insert */ true, HasExtract); + + // New pattern: load VecPtr + InstructionCost NewCost = + TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); + // Optionally, we are shuffling the loaded vector element(s) into place. + if (OffsetEltIndex) + NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy); + + // We can aggressively convert to the vector form because the backend can + // invert this transform if it does not result in a performance win. + if (OldCost < NewCost || !NewCost.isValid()) + return false; + + // It is safe and potentially profitable to load a vector directly: + // inselt undef, load Scalar, 0 --> load VecPtr + IRBuilder<> Builder(Load); + Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS)); + Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); + + // Set everything but element 0 to undef to prevent poison from propagating + // from the extra loaded memory. This will also optionally shrink/grow the + // vector from the loaded size to the output size. + // We assume this operation has no cost in codegen if there was no offset. + // Note that we could use freeze to avoid poison problems, but then we might + // still need a shuffle to change the vector size. 
+ unsigned OutputNumElts = Ty->getNumElements(); + SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem); + assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); + Mask[0] = OffsetEltIndex; + VecLd = Builder.CreateShuffleVector(VecLd, Mask); + + replaceValue(I, *VecLd); + ++NumVecLoad; + return true; +} + /// Determine which, if any, of the inputs should be replaced by a shuffle /// followed by extract from a different index. ExtractElementInst *VectorCombine::getShuffleExtract( @@ -241,15 +241,15 @@ ExtractElementInst *VectorCombine::getShuffleExtract( Type *VecTy = Ext0->getVectorOperand()->getType(); assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); - InstructionCost Cost0 = - TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); - InstructionCost Cost1 = - TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); - - // If both costs are invalid no shuffle is needed - if (!Cost0.isValid() && !Cost1.isValid()) - return nullptr; - + InstructionCost Cost0 = + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + InstructionCost Cost1 = + TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); + + // If both costs are invalid no shuffle is needed + if (!Cost0.isValid() && !Cost1.isValid()) + return nullptr; + // We are extracting from 2 different indexes, so one operand must be shuffled // before performing a vector operation and/or extract. The more expensive // extract will be replaced by a shuffle. @@ -284,7 +284,7 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, "Expected constant extract indexes"); Type *ScalarTy = Ext0->getType(); auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); - InstructionCost ScalarOpCost, VectorOpCost; + InstructionCost ScalarOpCost, VectorOpCost; // Get cost estimates for scalar and vector versions of the operation. bool IsBinOp = Instruction::isBinaryOp(Opcode); @@ -305,9 +305,9 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue(); unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue(); - InstructionCost Extract0Cost = + InstructionCost Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); - InstructionCost Extract1Cost = + InstructionCost Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index); // A more expensive extract will always be replaced by a splat shuffle. @@ -317,11 +317,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // TODO: Evaluate whether that always results in lowest cost. Alternatively, // check the cost of creating a broadcast shuffle and shuffling both // operands to element 0. - InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); + InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); // Extra uses of the extracts mean that we include those costs in the // vector total because those instructions will not be eliminated. - InstructionCost OldCost, NewCost; + InstructionCost OldCost, NewCost; if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) { // Handle a special case. If the 2 extracts are identical, adjust the // formulas to account for that. 
The extra use charge allows for either the @@ -372,7 +372,7 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, auto *VecTy = cast<FixedVectorType>(Vec->getType()); SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem); ShufMask[NewIndex] = OldIndex; - return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); + return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); } /// Given an extract element instruction with constant index operand, shuffle @@ -506,23 +506,23 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask)))))) return false; - // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for - // scalable type is unknown; Second, we cannot reason if the narrowed shuffle - // mask for scalable type is a splat or not. - // 2) Disallow non-vector casts and length-changing shuffles. + // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for + // scalable type is unknown; Second, we cannot reason if the narrowed shuffle + // mask for scalable type is a splat or not. + // 2) Disallow non-vector casts and length-changing shuffles. // TODO: We could allow any shuffle. - auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); - auto *SrcTy = dyn_cast<FixedVectorType>(V->getType()); - if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy) + auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); + auto *SrcTy = dyn_cast<FixedVectorType>(V->getType()); + if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy) return false; // The new shuffle must not cost more than the old shuffle. The bitcast is // moved ahead of the shuffle, so assume that it has the same cost as before. - InstructionCost DestCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy); - InstructionCost SrcCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy); - if (DestCost > SrcCost || !DestCost.isValid()) + InstructionCost DestCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy); + InstructionCost SrcCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy); + if (DestCost > SrcCost || !DestCost.isValid()) return false; unsigned DestNumElts = DestTy->getNumElements(); @@ -545,7 +545,7 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' ++NumShufOfBitcast; Value *CastV = Builder.CreateBitCast(V, DestTy); - Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask); + Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask); replaceValue(I, *Shuf); return true; } @@ -612,7 +612,7 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { "Unexpected types for insert element into binop or cmp"); unsigned Opcode = I.getOpcode(); - InstructionCost ScalarOpCost, VectorOpCost; + InstructionCost ScalarOpCost, VectorOpCost; if (IsCmp) { ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy); VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy); @@ -623,16 +623,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { // Get cost estimate for the insert element. This cost will factor into // both sequences. - InstructionCost InsertCost = + InstructionCost InsertCost = TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index); - InstructionCost OldCost = - (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; - InstructionCost NewCost = ScalarOpCost + InsertCost + - (IsConst0 ? 
0 : !Ins0->hasOneUse() * InsertCost) + - (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); + InstructionCost OldCost = + (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; + InstructionCost NewCost = ScalarOpCost + InsertCost + + (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + + (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); // We want to scalarize unless the vector variant actually has lower cost. - if (OldCost < NewCost || !NewCost.isValid()) + if (OldCost < NewCost || !NewCost.isValid()) return false; // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> @@ -712,8 +712,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { if (!VecTy) return false; - InstructionCost OldCost = - TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + InstructionCost OldCost = + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2; OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); @@ -724,7 +724,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); - InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType()); + InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType()); NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy); NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); @@ -733,7 +733,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { // Aggressively form vector ops if the cost is equal because the transform // may enable further optimization. // Codegen can reverse this transform (scalarize) if it was not profitable. - if (OldCost < NewCost || !NewCost.isValid()) + if (OldCost < NewCost || !NewCost.isValid()) return false; // Create a vector constant from the 2 scalar constants. @@ -758,10 +758,10 @@ bool VectorCombine::run() { if (DisableVectorCombine) return false; - // Don't attempt vectorization if the target does not support vectors. - if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true))) - return false; - + // Don't attempt vectorization if the target does not support vectors. + if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true))) + return false; + bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. 
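
The VectorCombine hunks above all gate their rewrites on the same TargetTransformInfo pattern: compute the cost of the existing IR and of the candidate replacement, then bail out when the replacement is costlier or its cost is invalid, letting ties go to the vector form because codegen can reverse the transform if it is not profitable. Below is a minimal sketch of that idiom, separate from the patch and assuming LLVM 12 headers; the helper names profitableToRewrite and shuffleNoWorseAfterBitcast are hypothetical, while the TTI calls are the ones the hunks themselves use.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Hypothetical helper mirroring the check used by each fold above: reject the
// rewrite if the new sequence costs more than the old one, or if the target
// reports the new cost as invalid (e.g. an illegal vector type).
static bool profitableToRewrite(InstructionCost OldCost,
                                InstructionCost NewCost) {
  if (OldCost < NewCost || !NewCost.isValid())
    return false;
  return true;
}

// Example use in the spirit of foldBitcastShuf: hoist a bitcast ahead of a
// shuffle only when a single-source permute on the destination type is no
// more expensive than the same permute on the source type.
static bool shuffleNoWorseAfterBitcast(const TargetTransformInfo &TTI,
                                       FixedVectorType *DestTy,
                                       FixedVectorType *SrcTy) {
  InstructionCost DestCost =
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy);
  InstructionCost SrcCost =
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy);
  return profitableToRewrite(SrcCost, DestCost);
}
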
@@ -775,7 +775,7 @@ bool VectorCombine::run() { if (isa<DbgInfoIntrinsic>(I)) continue; Builder.SetInsertPoint(&I); - MadeChange |= vectorizeLoadInsert(I); + MadeChange |= vectorizeLoadInsert(I); MadeChange |= foldExtractExtract(I); MadeChange |= foldBitcastShuf(I); MadeChange |= scalarizeBinopOrCmp(I); diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make b/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make index a68c667bde..a3879c3129 100644 --- a/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make +++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make @@ -12,12 +12,12 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/Analysis - contrib/libs/llvm12/lib/IR - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Transforms/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/Analysis + contrib/libs/llvm12/lib/IR + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Transforms/Utils ) ADDINCL( |