| author | orivej <orivej@yandex-team.ru> | 2022-02-10 16:44:49 +0300 |
|---|---|---|
| committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:49 +0300 |
| commit | 718c552901d703c502ccbefdfc3c9028d608b947 (patch) | |
| tree | 46534a98bbefcd7b1f3faa5b52c138ab27db75b7 /contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp | |
| parent | e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (diff) | |
| download | ydb-718c552901d703c502ccbefdfc3c9028d608b947.tar.gz | |
Restoring authorship annotation for <orivej@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp')
| -rw-r--r-- | contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp | 2160 |

1 file changed, 1080 insertions, 1080 deletions
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp index 1bd2529891..d4b83c0fc3 100644 --- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp @@ -1,1088 +1,1088 @@ -//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the Loop Distribution Pass. Its main focus is to -// distribute loops that cannot be vectorized due to dependence cycles. It -// tries to isolate the offending dependences into a new loop allowing -// vectorization of the remaining parts. -// -// For dependence analysis, the pass uses the LoopVectorizer's -// LoopAccessAnalysis. Because this analysis presumes no change in the order of -// memory operations, special care is taken to preserve the lexical order of -// these operations. -// -// Similarly to the Vectorizer, the pass also supports loop versioning to -// run-time disambiguate potentially overlapping arrays. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Scalar/LoopDistribute.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/EquivalenceClasses.h" -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/PassManager.h" -#include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/LoopVersioning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include <cassert> -#include <functional> -#include <list> -#include <tuple> -#include <utility> - -using namespace llvm; - -#define LDIST_NAME "loop-distribute" -#define DEBUG_TYPE LDIST_NAME - -/// @{ -/// Metadata attribute names -static const char *const LLVMLoopDistributeFollowupAll = - "llvm.loop.distribute.followup_all"; -static const char *const 
LLVMLoopDistributeFollowupCoincident = - "llvm.loop.distribute.followup_coincident"; -static const char *const LLVMLoopDistributeFollowupSequential = - "llvm.loop.distribute.followup_sequential"; -static const char *const LLVMLoopDistributeFollowupFallback = - "llvm.loop.distribute.followup_fallback"; -/// @} - -static cl::opt<bool> - LDistVerify("loop-distribute-verify", cl::Hidden, - cl::desc("Turn on DominatorTree and LoopInfo verification " - "after Loop Distribution"), - cl::init(false)); - -static cl::opt<bool> DistributeNonIfConvertible( - "loop-distribute-non-if-convertible", cl::Hidden, - cl::desc("Whether to distribute into a loop that may not be " - "if-convertible by the loop vectorizer"), - cl::init(false)); - -static cl::opt<unsigned> DistributeSCEVCheckThreshold( - "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, - cl::desc("The maximum number of SCEV checks allowed for Loop " - "Distribution")); - -static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( - "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), - cl::Hidden, - cl::desc( - "The maximum number of SCEV checks allowed for Loop " - "Distribution for loop marked with #pragma loop distribute(enable)")); - -static cl::opt<bool> EnableLoopDistribute( - "enable-loop-distribute", cl::Hidden, - cl::desc("Enable the new, experimental LoopDistribution Pass"), - cl::init(false)); - -STATISTIC(NumLoopsDistributed, "Number of loops distributed"); - -namespace { - -/// Maintains the set of instructions of the loop for a partition before -/// cloning. After cloning, it hosts the new loop. -class InstPartition { - using InstructionSet = SmallPtrSet<Instruction *, 8>; - -public: - InstPartition(Instruction *I, Loop *L, bool DepCycle = false) - : DepCycle(DepCycle), OrigLoop(L) { - Set.insert(I); - } - - /// Returns whether this partition contains a dependence cycle. - bool hasDepCycle() const { return DepCycle; } - - /// Adds an instruction to this partition. - void add(Instruction *I) { Set.insert(I); } - - /// Collection accessors. - InstructionSet::iterator begin() { return Set.begin(); } - InstructionSet::iterator end() { return Set.end(); } - InstructionSet::const_iterator begin() const { return Set.begin(); } - InstructionSet::const_iterator end() const { return Set.end(); } - bool empty() const { return Set.empty(); } - - /// Moves this partition into \p Other. This partition becomes empty - /// after this. - void moveTo(InstPartition &Other) { - Other.Set.insert(Set.begin(), Set.end()); - Set.clear(); - Other.DepCycle |= DepCycle; - } - - /// Populates the partition with a transitive closure of all the - /// instructions that the seeded instructions dependent on. - void populateUsedSet() { - // FIXME: We currently don't use control-dependence but simply include all - // blocks (possibly empty at the end) and let simplifycfg mostly clean this - // up. - for (auto *B : OrigLoop->getBlocks()) - Set.insert(B->getTerminator()); - - // Follow the use-def chains to form a transitive closure of all the - // instructions that the originally seeded instructions depend on. - SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - // Insert instructions from the loop that we depend on. - for (Value *V : I->operand_values()) { - auto *I = dyn_cast<Instruction>(V); - if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) - Worklist.push_back(I); - } - } - } - - /// Clones the original loop. 
- /// - /// Updates LoopInfo and DominatorTree using the information that block \p - /// LoopDomBB dominates the loop. - Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, - unsigned Index, LoopInfo *LI, - DominatorTree *DT) { - ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, - VMap, Twine(".ldist") + Twine(Index), - LI, DT, ClonedLoopBlocks); - return ClonedLoop; - } - - /// The cloned loop. If this partition is mapped to the original loop, - /// this is null. - const Loop *getClonedLoop() const { return ClonedLoop; } - - /// Returns the loop where this partition ends up after distribution. - /// If this partition is mapped to the original loop then use the block from - /// the loop. - Loop *getDistributedLoop() const { - return ClonedLoop ? ClonedLoop : OrigLoop; - } - - /// The VMap that is populated by cloning and then used in - /// remapinstruction to remap the cloned instructions. - ValueToValueMapTy &getVMap() { return VMap; } - - /// Remaps the cloned instructions using VMap. - void remapInstructions() { - remapInstructionsInBlocks(ClonedLoopBlocks, VMap); - } - - /// Based on the set of instructions selected for this partition, - /// removes the unnecessary ones. - void removeUnusedInsts() { - SmallVector<Instruction *, 8> Unused; - - for (auto *Block : OrigLoop->getBlocks()) - for (auto &Inst : *Block) - if (!Set.count(&Inst)) { - Instruction *NewInst = &Inst; - if (!VMap.empty()) - NewInst = cast<Instruction>(VMap[NewInst]); - - assert(!isa<BranchInst>(NewInst) && - "Branches are marked used early on"); - Unused.push_back(NewInst); - } - - // Delete the instructions backwards, as it has a reduced likelihood of - // having to update as many def-use and use-def chains. - for (auto *Inst : reverse(Unused)) { - if (!Inst->use_empty()) - Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - Inst->eraseFromParent(); - } - } - - void print() const { - if (DepCycle) - dbgs() << " (cycle)\n"; - for (auto *I : Set) - // Prefix with the block name. - dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; - } - - void printBlocks() const { - for (auto *BB : getDistributedLoop()->getBlocks()) - dbgs() << *BB; - } - -private: - /// Instructions from OrigLoop selected for this partition. - InstructionSet Set; - - /// Whether this partition contains a dependence cycle. - bool DepCycle; - - /// The original loop. - Loop *OrigLoop; - - /// The cloned loop. If this partition is mapped to the original loop, - /// this is null. - Loop *ClonedLoop = nullptr; - - /// The blocks of ClonedLoop including the preheader. If this - /// partition is mapped to the original loop, this is empty. - SmallVector<BasicBlock *, 8> ClonedLoopBlocks; - - /// These gets populated once the set of instructions have been - /// finalized. If this partition is mapped to the original loop, these are not - /// set. - ValueToValueMapTy VMap; -}; - -/// Holds the set of Partitions. It populates them, merges them and then -/// clones the loops. -class InstPartitionContainer { - using InstToPartitionIdT = DenseMap<Instruction *, int>; - -public: - InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) - : L(L), LI(LI), DT(DT) {} - - /// Returns the number of partitions. - unsigned getSize() const { return PartitionContainer.size(); } - - /// Adds \p Inst into the current partition if that is marked to - /// contain cycles. Otherwise start a new partition for it. 
- void addToCyclicPartition(Instruction *Inst) { - // If the current partition is non-cyclic. Start a new one. - if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) - PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); - else - PartitionContainer.back().add(Inst); - } - - /// Adds \p Inst into a partition that is not marked to contain - /// dependence cycles. - /// - // Initially we isolate memory instructions into as many partitions as - // possible, then later we may merge them back together. - void addToNewNonCyclicPartition(Instruction *Inst) { - PartitionContainer.emplace_back(Inst, L); - } - - /// Merges adjacent non-cyclic partitions. - /// - /// The idea is that we currently only want to isolate the non-vectorizable - /// partition. We could later allow more distribution among these partition - /// too. - void mergeAdjacentNonCyclic() { - mergeAdjacentPartitionsIf( - [](const InstPartition *P) { return !P->hasDepCycle(); }); - } - - /// If a partition contains only conditional stores, we won't vectorize - /// it. Try to merge it with a previous cyclic partition. - void mergeNonIfConvertible() { - mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { - if (Partition->hasDepCycle()) - return true; - - // Now, check if all stores are conditional in this partition. - bool seenStore = false; - - for (auto *Inst : *Partition) - if (isa<StoreInst>(Inst)) { - seenStore = true; - if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) - return false; - } - return seenStore; - }); - } - - /// Merges the partitions according to various heuristics. - void mergeBeforePopulating() { - mergeAdjacentNonCyclic(); - if (!DistributeNonIfConvertible) - mergeNonIfConvertible(); - } - - /// Merges partitions in order to ensure that no loads are duplicated. - /// - /// We can't duplicate loads because that could potentially reorder them. - /// LoopAccessAnalysis provides dependency information with the context that - /// the order of memory operation is preserved. - /// - /// Return if any partitions were merged. - bool mergeToAvoidDuplicatedLoads() { - using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>; - using ToBeMergedT = EquivalenceClasses<InstPartition *>; - - LoadToPartitionT LoadToPartition; - ToBeMergedT ToBeMerged; - - // Step through the partitions and create equivalence between partitions - // that contain the same load. Also put partitions in between them in the - // same equivalence class to avoid reordering of memory operations. - for (PartitionContainerT::iterator I = PartitionContainer.begin(), - E = PartitionContainer.end(); - I != E; ++I) { - auto *PartI = &*I; - - // If a load occurs in two partitions PartI and PartJ, merge all - // partitions (PartI, PartJ] into PartI. - for (Instruction *Inst : *PartI) - if (isa<LoadInst>(Inst)) { - bool NewElt; - LoadToPartitionT::iterator LoadToPart; - - std::tie(LoadToPart, NewElt) = - LoadToPartition.insert(std::make_pair(Inst, PartI)); - if (!NewElt) { - LLVM_DEBUG(dbgs() - << "Merging partitions due to this load in multiple " - << "partitions: " << PartI << ", " << LoadToPart->second - << "\n" - << *Inst << "\n"); - - auto PartJ = I; - do { - --PartJ; - ToBeMerged.unionSets(PartI, &*PartJ); - } while (&*PartJ != LoadToPart->second); - } - } - } - if (ToBeMerged.empty()) - return false; - - // Merge the member of an equivalence class into its class leader. This - // makes the members empty. 
- for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); - I != E; ++I) { - if (!I->isLeader()) - continue; - - auto PartI = I->getData(); - for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), - ToBeMerged.member_end())) { - PartJ->moveTo(*PartI); - } - } - - // Remove the empty partitions. - PartitionContainer.remove_if( - [](const InstPartition &P) { return P.empty(); }); - - return true; - } - - /// Sets up the mapping between instructions to partitions. If the - /// instruction is duplicated across multiple partitions, set the entry to -1. - void setupPartitionIdOnInstructions() { - int PartitionID = 0; - for (const auto &Partition : PartitionContainer) { - for (Instruction *Inst : Partition) { - bool NewElt; - InstToPartitionIdT::iterator Iter; - - std::tie(Iter, NewElt) = - InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); - if (!NewElt) - Iter->second = -1; - } - ++PartitionID; - } - } - - /// Populates the partition with everything that the seeding - /// instructions require. - void populateUsedSet() { - for (auto &P : PartitionContainer) - P.populateUsedSet(); - } - - /// This performs the main chunk of the work of cloning the loops for - /// the partitions. - void cloneLoops() { - BasicBlock *OrigPH = L->getLoopPreheader(); - // At this point the predecessor of the preheader is either the memcheck - // block or the top part of the original preheader. - BasicBlock *Pred = OrigPH->getSinglePredecessor(); - assert(Pred && "Preheader does not have a single predecessor"); - BasicBlock *ExitBlock = L->getExitBlock(); - assert(ExitBlock && "No single exit block"); - Loop *NewLoop; - - assert(!PartitionContainer.empty() && "at least two partitions expected"); - // We're cloning the preheader along with the loop so we already made sure - // it was empty. - assert(&*OrigPH->begin() == OrigPH->getTerminator() && - "preheader not empty"); - - // Preserve the original loop ID for use after the transformation. - MDNode *OrigLoopID = L->getLoopID(); - - // Create a loop for each partition except the last. Clone the original - // loop before PH along with adding a preheader for the cloned loop. Then - // update PH to point to the newly added preheader. - BasicBlock *TopPH = OrigPH; - unsigned Index = getSize() - 1; - for (auto I = std::next(PartitionContainer.rbegin()), - E = PartitionContainer.rend(); - I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { - auto *Part = &*I; - - NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); - - Part->getVMap()[ExitBlock] = TopPH; - Part->remapInstructions(); - setNewLoopID(OrigLoopID, Part); - } - Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); - - // Also set a new loop ID for the last loop. - setNewLoopID(OrigLoopID, &PartitionContainer.back()); - - // Now go in forward order and update the immediate dominator for the - // preheaders with the exiting block of the previous loop. Dominance - // within the loop is updated in cloneLoopWithPreheader. - for (auto Curr = PartitionContainer.cbegin(), - Next = std::next(PartitionContainer.cbegin()), - E = PartitionContainer.cend(); - Next != E; ++Curr, ++Next) - DT->changeImmediateDominator( - Next->getDistributedLoop()->getLoopPreheader(), - Curr->getDistributedLoop()->getExitingBlock()); - } - - /// Removes the dead instructions from the cloned loops. 
- void removeUnusedInsts() { - for (auto &Partition : PartitionContainer) - Partition.removeUnusedInsts(); - } - - /// For each memory pointer, it computes the partitionId the pointer is - /// used in. - /// - /// This returns an array of int where the I-th entry corresponds to I-th - /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple - /// partitions its entry is set to -1. - SmallVector<int, 8> - computePartitionSetForPointers(const LoopAccessInfo &LAI) { - const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); - - unsigned N = RtPtrCheck->Pointers.size(); - SmallVector<int, 8> PtrToPartitions(N); - for (unsigned I = 0; I < N; ++I) { - Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; - auto Instructions = - LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); - - int &Partition = PtrToPartitions[I]; - // First set it to uninitialized. - Partition = -2; - for (Instruction *Inst : Instructions) { - // Note that this could be -1 if Inst is duplicated across multiple - // partitions. - int ThisPartition = this->InstToPartitionId[Inst]; - if (Partition == -2) - Partition = ThisPartition; - // -1 means belonging to multiple partitions. - else if (Partition == -1) - break; - else if (Partition != (int)ThisPartition) - Partition = -1; - } - assert(Partition != -2 && "Pointer not belonging to any partition"); - } - - return PtrToPartitions; - } - - void print(raw_ostream &OS) const { - unsigned Index = 0; - for (const auto &P : PartitionContainer) { - OS << "Partition " << Index++ << " (" << &P << "):\n"; - P.print(); - } - } - - void dump() const { print(dbgs()); } - -#ifndef NDEBUG - friend raw_ostream &operator<<(raw_ostream &OS, - const InstPartitionContainer &Partitions) { - Partitions.print(OS); - return OS; - } -#endif - - void printBlocks() const { - unsigned Index = 0; - for (const auto &P : PartitionContainer) { - dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; - P.printBlocks(); - } - } - -private: - using PartitionContainerT = std::list<InstPartition>; - - /// List of partitions. - PartitionContainerT PartitionContainer; - - /// Mapping from Instruction to partition Id. If the instruction - /// belongs to multiple partitions the entry contains -1. - InstToPartitionIdT InstToPartitionId; - - Loop *L; - LoopInfo *LI; - DominatorTree *DT; - - /// The control structure to merge adjacent partitions if both satisfy - /// the \p Predicate. - template <class UnaryPredicate> - void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { - InstPartition *PrevMatch = nullptr; - for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { - auto DoesMatch = Predicate(&*I); - if (PrevMatch == nullptr && DoesMatch) { - PrevMatch = &*I; - ++I; - } else if (PrevMatch != nullptr && DoesMatch) { - I->moveTo(*PrevMatch); - I = PartitionContainer.erase(I); - } else { - PrevMatch = nullptr; - ++I; - } - } - } - - /// Assign new LoopIDs for the partition's cloned loop. - void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) { - Optional<MDNode *> PartitionID = makeFollowupLoopID( - OrigLoopID, - {LLVMLoopDistributeFollowupAll, - Part->hasDepCycle() ? 
LLVMLoopDistributeFollowupSequential - : LLVMLoopDistributeFollowupCoincident}); - if (PartitionID.hasValue()) { - Loop *NewLoop = Part->getDistributedLoop(); - NewLoop->setLoopID(PartitionID.getValue()); - } - } -}; - -/// For each memory instruction, this class maintains difference of the -/// number of unsafe dependences that start out from this instruction minus -/// those that end here. -/// -/// By traversing the memory instructions in program order and accumulating this -/// number, we know whether any unsafe dependence crosses over a program point. -class MemoryInstructionDependences { - using Dependence = MemoryDepChecker::Dependence; - -public: - struct Entry { - Instruction *Inst; - unsigned NumUnsafeDependencesStartOrEnd = 0; - - Entry(Instruction *Inst) : Inst(Inst) {} - }; - - using AccessesType = SmallVector<Entry, 8>; - - AccessesType::const_iterator begin() const { return Accesses.begin(); } - AccessesType::const_iterator end() const { return Accesses.end(); } - - MemoryInstructionDependences( - const SmallVectorImpl<Instruction *> &Instructions, - const SmallVectorImpl<Dependence> &Dependences) { - Accesses.append(Instructions.begin(), Instructions.end()); - - LLVM_DEBUG(dbgs() << "Backward dependences:\n"); - for (auto &Dep : Dependences) - if (Dep.isPossiblyBackward()) { - // Note that the designations source and destination follow the program - // order, i.e. source is always first. (The direction is given by the - // DepType.) - ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; - --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; - - LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions)); - } - } - -private: - AccessesType Accesses; -}; - -/// The actual class performing the per-loop work. -class LoopDistributeForLoop { -public: - LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, - ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) - : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) { - setForced(); - } - - /// Try to distribute an inner-most loop. - bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) { +//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the Loop Distribution Pass. Its main focus is to +// distribute loops that cannot be vectorized due to dependence cycles. It +// tries to isolate the offending dependences into a new loop allowing +// vectorization of the remaining parts. +// +// For dependence analysis, the pass uses the LoopVectorizer's +// LoopAccessAnalysis. Because this analysis presumes no change in the order of +// memory operations, special care is taken to preserve the lexical order of +// these operations. +// +// Similarly to the Vectorizer, the pass also supports loop versioning to +// run-time disambiguate potentially overlapping arrays. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopDistribute.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <cassert> +#include <functional> +#include <list> +#include <tuple> +#include <utility> + +using namespace llvm; + +#define LDIST_NAME "loop-distribute" +#define DEBUG_TYPE LDIST_NAME + +/// @{ +/// Metadata attribute names +static const char *const LLVMLoopDistributeFollowupAll = + "llvm.loop.distribute.followup_all"; +static const char *const LLVMLoopDistributeFollowupCoincident = + "llvm.loop.distribute.followup_coincident"; +static const char *const LLVMLoopDistributeFollowupSequential = + "llvm.loop.distribute.followup_sequential"; +static const char *const LLVMLoopDistributeFollowupFallback = + "llvm.loop.distribute.followup_fallback"; +/// @} + +static cl::opt<bool> + LDistVerify("loop-distribute-verify", cl::Hidden, + cl::desc("Turn on DominatorTree and LoopInfo verification " + "after Loop Distribution"), + cl::init(false)); + +static cl::opt<bool> DistributeNonIfConvertible( + "loop-distribute-non-if-convertible", cl::Hidden, + cl::desc("Whether to distribute into a loop that may not be " + "if-convertible by the loop vectorizer"), + cl::init(false)); + +static cl::opt<unsigned> DistributeSCEVCheckThreshold( + "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed for Loop " + "Distribution")); + +static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( + "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), + cl::Hidden, + cl::desc( + "The maximum number of SCEV checks allowed for Loop " + "Distribution for loop marked with #pragma loop distribute(enable)")); + +static cl::opt<bool> EnableLoopDistribute( + "enable-loop-distribute", cl::Hidden, + cl::desc("Enable the new, experimental LoopDistribution Pass"), + 
cl::init(false)); + +STATISTIC(NumLoopsDistributed, "Number of loops distributed"); + +namespace { + +/// Maintains the set of instructions of the loop for a partition before +/// cloning. After cloning, it hosts the new loop. +class InstPartition { + using InstructionSet = SmallPtrSet<Instruction *, 8>; + +public: + InstPartition(Instruction *I, Loop *L, bool DepCycle = false) + : DepCycle(DepCycle), OrigLoop(L) { + Set.insert(I); + } + + /// Returns whether this partition contains a dependence cycle. + bool hasDepCycle() const { return DepCycle; } + + /// Adds an instruction to this partition. + void add(Instruction *I) { Set.insert(I); } + + /// Collection accessors. + InstructionSet::iterator begin() { return Set.begin(); } + InstructionSet::iterator end() { return Set.end(); } + InstructionSet::const_iterator begin() const { return Set.begin(); } + InstructionSet::const_iterator end() const { return Set.end(); } + bool empty() const { return Set.empty(); } + + /// Moves this partition into \p Other. This partition becomes empty + /// after this. + void moveTo(InstPartition &Other) { + Other.Set.insert(Set.begin(), Set.end()); + Set.clear(); + Other.DepCycle |= DepCycle; + } + + /// Populates the partition with a transitive closure of all the + /// instructions that the seeded instructions dependent on. + void populateUsedSet() { + // FIXME: We currently don't use control-dependence but simply include all + // blocks (possibly empty at the end) and let simplifycfg mostly clean this + // up. + for (auto *B : OrigLoop->getBlocks()) + Set.insert(B->getTerminator()); + + // Follow the use-def chains to form a transitive closure of all the + // instructions that the originally seeded instructions depend on. + SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + // Insert instructions from the loop that we depend on. + for (Value *V : I->operand_values()) { + auto *I = dyn_cast<Instruction>(V); + if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) + Worklist.push_back(I); + } + } + } + + /// Clones the original loop. + /// + /// Updates LoopInfo and DominatorTree using the information that block \p + /// LoopDomBB dominates the loop. + Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, + unsigned Index, LoopInfo *LI, + DominatorTree *DT) { + ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, + VMap, Twine(".ldist") + Twine(Index), + LI, DT, ClonedLoopBlocks); + return ClonedLoop; + } + + /// The cloned loop. If this partition is mapped to the original loop, + /// this is null. + const Loop *getClonedLoop() const { return ClonedLoop; } + + /// Returns the loop where this partition ends up after distribution. + /// If this partition is mapped to the original loop then use the block from + /// the loop. + Loop *getDistributedLoop() const { + return ClonedLoop ? ClonedLoop : OrigLoop; + } + + /// The VMap that is populated by cloning and then used in + /// remapinstruction to remap the cloned instructions. + ValueToValueMapTy &getVMap() { return VMap; } + + /// Remaps the cloned instructions using VMap. + void remapInstructions() { + remapInstructionsInBlocks(ClonedLoopBlocks, VMap); + } + + /// Based on the set of instructions selected for this partition, + /// removes the unnecessary ones. 
+ void removeUnusedInsts() { + SmallVector<Instruction *, 8> Unused; + + for (auto *Block : OrigLoop->getBlocks()) + for (auto &Inst : *Block) + if (!Set.count(&Inst)) { + Instruction *NewInst = &Inst; + if (!VMap.empty()) + NewInst = cast<Instruction>(VMap[NewInst]); + + assert(!isa<BranchInst>(NewInst) && + "Branches are marked used early on"); + Unused.push_back(NewInst); + } + + // Delete the instructions backwards, as it has a reduced likelihood of + // having to update as many def-use and use-def chains. + for (auto *Inst : reverse(Unused)) { + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + Inst->eraseFromParent(); + } + } + + void print() const { + if (DepCycle) + dbgs() << " (cycle)\n"; + for (auto *I : Set) + // Prefix with the block name. + dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; + } + + void printBlocks() const { + for (auto *BB : getDistributedLoop()->getBlocks()) + dbgs() << *BB; + } + +private: + /// Instructions from OrigLoop selected for this partition. + InstructionSet Set; + + /// Whether this partition contains a dependence cycle. + bool DepCycle; + + /// The original loop. + Loop *OrigLoop; + + /// The cloned loop. If this partition is mapped to the original loop, + /// this is null. + Loop *ClonedLoop = nullptr; + + /// The blocks of ClonedLoop including the preheader. If this + /// partition is mapped to the original loop, this is empty. + SmallVector<BasicBlock *, 8> ClonedLoopBlocks; + + /// These gets populated once the set of instructions have been + /// finalized. If this partition is mapped to the original loop, these are not + /// set. + ValueToValueMapTy VMap; +}; + +/// Holds the set of Partitions. It populates them, merges them and then +/// clones the loops. +class InstPartitionContainer { + using InstToPartitionIdT = DenseMap<Instruction *, int>; + +public: + InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) + : L(L), LI(LI), DT(DT) {} + + /// Returns the number of partitions. + unsigned getSize() const { return PartitionContainer.size(); } + + /// Adds \p Inst into the current partition if that is marked to + /// contain cycles. Otherwise start a new partition for it. + void addToCyclicPartition(Instruction *Inst) { + // If the current partition is non-cyclic. Start a new one. + if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) + PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); + else + PartitionContainer.back().add(Inst); + } + + /// Adds \p Inst into a partition that is not marked to contain + /// dependence cycles. + /// + // Initially we isolate memory instructions into as many partitions as + // possible, then later we may merge them back together. + void addToNewNonCyclicPartition(Instruction *Inst) { + PartitionContainer.emplace_back(Inst, L); + } + + /// Merges adjacent non-cyclic partitions. + /// + /// The idea is that we currently only want to isolate the non-vectorizable + /// partition. We could later allow more distribution among these partition + /// too. + void mergeAdjacentNonCyclic() { + mergeAdjacentPartitionsIf( + [](const InstPartition *P) { return !P->hasDepCycle(); }); + } + + /// If a partition contains only conditional stores, we won't vectorize + /// it. Try to merge it with a previous cyclic partition. + void mergeNonIfConvertible() { + mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { + if (Partition->hasDepCycle()) + return true; + + // Now, check if all stores are conditional in this partition. 
+ bool seenStore = false; + + for (auto *Inst : *Partition) + if (isa<StoreInst>(Inst)) { + seenStore = true; + if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) + return false; + } + return seenStore; + }); + } + + /// Merges the partitions according to various heuristics. + void mergeBeforePopulating() { + mergeAdjacentNonCyclic(); + if (!DistributeNonIfConvertible) + mergeNonIfConvertible(); + } + + /// Merges partitions in order to ensure that no loads are duplicated. + /// + /// We can't duplicate loads because that could potentially reorder them. + /// LoopAccessAnalysis provides dependency information with the context that + /// the order of memory operation is preserved. + /// + /// Return if any partitions were merged. + bool mergeToAvoidDuplicatedLoads() { + using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>; + using ToBeMergedT = EquivalenceClasses<InstPartition *>; + + LoadToPartitionT LoadToPartition; + ToBeMergedT ToBeMerged; + + // Step through the partitions and create equivalence between partitions + // that contain the same load. Also put partitions in between them in the + // same equivalence class to avoid reordering of memory operations. + for (PartitionContainerT::iterator I = PartitionContainer.begin(), + E = PartitionContainer.end(); + I != E; ++I) { + auto *PartI = &*I; + + // If a load occurs in two partitions PartI and PartJ, merge all + // partitions (PartI, PartJ] into PartI. + for (Instruction *Inst : *PartI) + if (isa<LoadInst>(Inst)) { + bool NewElt; + LoadToPartitionT::iterator LoadToPart; + + std::tie(LoadToPart, NewElt) = + LoadToPartition.insert(std::make_pair(Inst, PartI)); + if (!NewElt) { + LLVM_DEBUG(dbgs() + << "Merging partitions due to this load in multiple " + << "partitions: " << PartI << ", " << LoadToPart->second + << "\n" + << *Inst << "\n"); + + auto PartJ = I; + do { + --PartJ; + ToBeMerged.unionSets(PartI, &*PartJ); + } while (&*PartJ != LoadToPart->second); + } + } + } + if (ToBeMerged.empty()) + return false; + + // Merge the member of an equivalence class into its class leader. This + // makes the members empty. + for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); + I != E; ++I) { + if (!I->isLeader()) + continue; + + auto PartI = I->getData(); + for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), + ToBeMerged.member_end())) { + PartJ->moveTo(*PartI); + } + } + + // Remove the empty partitions. + PartitionContainer.remove_if( + [](const InstPartition &P) { return P.empty(); }); + + return true; + } + + /// Sets up the mapping between instructions to partitions. If the + /// instruction is duplicated across multiple partitions, set the entry to -1. + void setupPartitionIdOnInstructions() { + int PartitionID = 0; + for (const auto &Partition : PartitionContainer) { + for (Instruction *Inst : Partition) { + bool NewElt; + InstToPartitionIdT::iterator Iter; + + std::tie(Iter, NewElt) = + InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); + if (!NewElt) + Iter->second = -1; + } + ++PartitionID; + } + } + + /// Populates the partition with everything that the seeding + /// instructions require. + void populateUsedSet() { + for (auto &P : PartitionContainer) + P.populateUsedSet(); + } + + /// This performs the main chunk of the work of cloning the loops for + /// the partitions. 
+ void cloneLoops() { + BasicBlock *OrigPH = L->getLoopPreheader(); + // At this point the predecessor of the preheader is either the memcheck + // block or the top part of the original preheader. + BasicBlock *Pred = OrigPH->getSinglePredecessor(); + assert(Pred && "Preheader does not have a single predecessor"); + BasicBlock *ExitBlock = L->getExitBlock(); + assert(ExitBlock && "No single exit block"); + Loop *NewLoop; + + assert(!PartitionContainer.empty() && "at least two partitions expected"); + // We're cloning the preheader along with the loop so we already made sure + // it was empty. + assert(&*OrigPH->begin() == OrigPH->getTerminator() && + "preheader not empty"); + + // Preserve the original loop ID for use after the transformation. + MDNode *OrigLoopID = L->getLoopID(); + + // Create a loop for each partition except the last. Clone the original + // loop before PH along with adding a preheader for the cloned loop. Then + // update PH to point to the newly added preheader. + BasicBlock *TopPH = OrigPH; + unsigned Index = getSize() - 1; + for (auto I = std::next(PartitionContainer.rbegin()), + E = PartitionContainer.rend(); + I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { + auto *Part = &*I; + + NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); + + Part->getVMap()[ExitBlock] = TopPH; + Part->remapInstructions(); + setNewLoopID(OrigLoopID, Part); + } + Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); + + // Also set a new loop ID for the last loop. + setNewLoopID(OrigLoopID, &PartitionContainer.back()); + + // Now go in forward order and update the immediate dominator for the + // preheaders with the exiting block of the previous loop. Dominance + // within the loop is updated in cloneLoopWithPreheader. + for (auto Curr = PartitionContainer.cbegin(), + Next = std::next(PartitionContainer.cbegin()), + E = PartitionContainer.cend(); + Next != E; ++Curr, ++Next) + DT->changeImmediateDominator( + Next->getDistributedLoop()->getLoopPreheader(), + Curr->getDistributedLoop()->getExitingBlock()); + } + + /// Removes the dead instructions from the cloned loops. + void removeUnusedInsts() { + for (auto &Partition : PartitionContainer) + Partition.removeUnusedInsts(); + } + + /// For each memory pointer, it computes the partitionId the pointer is + /// used in. + /// + /// This returns an array of int where the I-th entry corresponds to I-th + /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple + /// partitions its entry is set to -1. + SmallVector<int, 8> + computePartitionSetForPointers(const LoopAccessInfo &LAI) { + const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); + + unsigned N = RtPtrCheck->Pointers.size(); + SmallVector<int, 8> PtrToPartitions(N); + for (unsigned I = 0; I < N; ++I) { + Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; + auto Instructions = + LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); + + int &Partition = PtrToPartitions[I]; + // First set it to uninitialized. + Partition = -2; + for (Instruction *Inst : Instructions) { + // Note that this could be -1 if Inst is duplicated across multiple + // partitions. + int ThisPartition = this->InstToPartitionId[Inst]; + if (Partition == -2) + Partition = ThisPartition; + // -1 means belonging to multiple partitions. 
+ else if (Partition == -1) + break; + else if (Partition != (int)ThisPartition) + Partition = -1; + } + assert(Partition != -2 && "Pointer not belonging to any partition"); + } + + return PtrToPartitions; + } + + void print(raw_ostream &OS) const { + unsigned Index = 0; + for (const auto &P : PartitionContainer) { + OS << "Partition " << Index++ << " (" << &P << "):\n"; + P.print(); + } + } + + void dump() const { print(dbgs()); } + +#ifndef NDEBUG + friend raw_ostream &operator<<(raw_ostream &OS, + const InstPartitionContainer &Partitions) { + Partitions.print(OS); + return OS; + } +#endif + + void printBlocks() const { + unsigned Index = 0; + for (const auto &P : PartitionContainer) { + dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; + P.printBlocks(); + } + } + +private: + using PartitionContainerT = std::list<InstPartition>; + + /// List of partitions. + PartitionContainerT PartitionContainer; + + /// Mapping from Instruction to partition Id. If the instruction + /// belongs to multiple partitions the entry contains -1. + InstToPartitionIdT InstToPartitionId; + + Loop *L; + LoopInfo *LI; + DominatorTree *DT; + + /// The control structure to merge adjacent partitions if both satisfy + /// the \p Predicate. + template <class UnaryPredicate> + void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { + InstPartition *PrevMatch = nullptr; + for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { + auto DoesMatch = Predicate(&*I); + if (PrevMatch == nullptr && DoesMatch) { + PrevMatch = &*I; + ++I; + } else if (PrevMatch != nullptr && DoesMatch) { + I->moveTo(*PrevMatch); + I = PartitionContainer.erase(I); + } else { + PrevMatch = nullptr; + ++I; + } + } + } + + /// Assign new LoopIDs for the partition's cloned loop. + void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) { + Optional<MDNode *> PartitionID = makeFollowupLoopID( + OrigLoopID, + {LLVMLoopDistributeFollowupAll, + Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential + : LLVMLoopDistributeFollowupCoincident}); + if (PartitionID.hasValue()) { + Loop *NewLoop = Part->getDistributedLoop(); + NewLoop->setLoopID(PartitionID.getValue()); + } + } +}; + +/// For each memory instruction, this class maintains difference of the +/// number of unsafe dependences that start out from this instruction minus +/// those that end here. +/// +/// By traversing the memory instructions in program order and accumulating this +/// number, we know whether any unsafe dependence crosses over a program point. +class MemoryInstructionDependences { + using Dependence = MemoryDepChecker::Dependence; + +public: + struct Entry { + Instruction *Inst; + unsigned NumUnsafeDependencesStartOrEnd = 0; + + Entry(Instruction *Inst) : Inst(Inst) {} + }; + + using AccessesType = SmallVector<Entry, 8>; + + AccessesType::const_iterator begin() const { return Accesses.begin(); } + AccessesType::const_iterator end() const { return Accesses.end(); } + + MemoryInstructionDependences( + const SmallVectorImpl<Instruction *> &Instructions, + const SmallVectorImpl<Dependence> &Dependences) { + Accesses.append(Instructions.begin(), Instructions.end()); + + LLVM_DEBUG(dbgs() << "Backward dependences:\n"); + for (auto &Dep : Dependences) + if (Dep.isPossiblyBackward()) { + // Note that the designations source and destination follow the program + // order, i.e. source is always first. (The direction is given by the + // DepType.) 
+ ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; + --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; + + LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions)); + } + } + +private: + AccessesType Accesses; +}; + +/// The actual class performing the per-loop work. +class LoopDistributeForLoop { +public: + LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, + ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) + : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) { + setForced(); + } + + /// Try to distribute an inner-most loop. + bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) { assert(L->isInnermost() && "Only process inner loops."); - - LLVM_DEBUG(dbgs() << "\nLDist: In \"" - << L->getHeader()->getParent()->getName() - << "\" checking " << *L << "\n"); - + + LLVM_DEBUG(dbgs() << "\nLDist: In \"" + << L->getHeader()->getParent()->getName() + << "\" checking " << *L << "\n"); + // Having a single exit block implies there's also one exiting block. - if (!L->getExitBlock()) - return fail("MultipleExitBlocks", "multiple exit blocks"); - if (!L->isLoopSimplifyForm()) - return fail("NotLoopSimplifyForm", - "loop is not in loop-simplify form"); + if (!L->getExitBlock()) + return fail("MultipleExitBlocks", "multiple exit blocks"); + if (!L->isLoopSimplifyForm()) + return fail("NotLoopSimplifyForm", + "loop is not in loop-simplify form"); if (!L->isRotatedForm()) return fail("NotBottomTested", "loop is not bottom tested"); - - BasicBlock *PH = L->getLoopPreheader(); - - LAI = &GetLAA(*L); - - // Currently, we only distribute to isolate the part of the loop with - // dependence cycles to enable partial vectorization. - if (LAI->canVectorizeMemory()) - return fail("MemOpsCanBeVectorized", - "memory operations are safe for vectorization"); - - auto *Dependences = LAI->getDepChecker().getDependences(); - if (!Dependences || Dependences->empty()) - return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); - - InstPartitionContainer Partitions(L, LI, DT); - - // First, go through each memory operation and assign them to consecutive - // partitions (the order of partitions follows program order). Put those - // with unsafe dependences into "cyclic" partition otherwise put each store - // in its own "non-cyclic" partition (we'll merge these later). - // - // Note that a memory operation (e.g. Load2 below) at a program point that - // has an unsafe dependence (Store3->Load1) spanning over it must be - // included in the same cyclic partition as the dependent operations. This - // is to preserve the original program order after distribution. E.g.: - // - // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive - // Load1 -. 1 0->1 - // Load2 | /Unsafe/ 0 1 - // Store3 -' -1 1->0 - // Load4 0 0 - // - // NumUnsafeDependencesActive > 0 indicates this situation and in this case - // we just keep assigning to the same cyclic partition until - // NumUnsafeDependencesActive reaches 0. - const MemoryDepChecker &DepChecker = LAI->getDepChecker(); - MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), - *Dependences); - - int NumUnsafeDependencesActive = 0; - for (auto &InstDep : MID) { - Instruction *I = InstDep.Inst; - // We update NumUnsafeDependencesActive post-instruction, catch the - // start of a dependence directly via NumUnsafeDependencesStartOrEnd. 
- if (NumUnsafeDependencesActive || - InstDep.NumUnsafeDependencesStartOrEnd > 0) - Partitions.addToCyclicPartition(I); - else - Partitions.addToNewNonCyclicPartition(I); - NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; - assert(NumUnsafeDependencesActive >= 0 && - "Negative number of dependences active"); - } - - // Add partitions for values used outside. These partitions can be out of - // order from the original program order. This is OK because if the - // partition uses a load we will merge this partition with the original - // partition of the load that we set up in the previous loop (see - // mergeToAvoidDuplicatedLoads). - auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); - for (auto *Inst : DefsUsedOutside) - Partitions.addToNewNonCyclicPartition(Inst); - - LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); - if (Partitions.getSize() < 2) - return fail("CantIsolateUnsafeDeps", - "cannot isolate unsafe dependencies"); - - // Run the merge heuristics: Merge non-cyclic adjacent partitions since we - // should be able to vectorize these together. - Partitions.mergeBeforePopulating(); - LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); - if (Partitions.getSize() < 2) - return fail("CantIsolateUnsafeDeps", - "cannot isolate unsafe dependencies"); - - // Now, populate the partitions with non-memory operations. - Partitions.populateUsedSet(); - LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); - - // In order to preserve original lexical order for loads, keep them in the - // partition that we set up in the MemoryInstructionDependences loop. - if (Partitions.mergeToAvoidDuplicatedLoads()) { - LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" - << Partitions); - if (Partitions.getSize() < 2) - return fail("CantIsolateUnsafeDeps", - "cannot isolate unsafe dependencies"); - } - - // Don't distribute the loop if we need too many SCEV run-time checks, or - // any if it's illegal. - const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate(); - if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) { - return fail("RuntimeCheckWithConvergent", - "may not insert runtime check with convergent operation"); - } - - if (Pred.getComplexity() > (IsForced.getValueOr(false) - ? PragmaDistributeSCEVCheckThreshold - : DistributeSCEVCheckThreshold)) - return fail("TooManySCEVRuntimeChecks", - "too many SCEV run-time checks needed.\n"); - - if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L)) - return fail("HeuristicDisabled", "distribution heuristic disabled"); - - LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); - // We're done forming the partitions set up the reverse mapping from - // instructions to partitions. - Partitions.setupPartitionIdOnInstructions(); - - // If we need run-time checks, version the loop now. - auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); - const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); - const auto &AllChecks = RtPtrChecking->getChecks(); - auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, - RtPtrChecking); - - if (LAI->hasConvergentOp() && !Checks.empty()) { - return fail("RuntimeCheckWithConvergent", - "may not insert runtime check with convergent operation"); - } - - // To keep things simple have an empty preheader before we version or clone - // the loop. (Also split if this has no predecessor, i.e. entry, because we - // rely on PH having a predecessor.) 
- if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) - SplitBlock(PH, PH->getTerminator(), DT, LI); - - if (!Pred.isAlwaysTrue() || !Checks.empty()) { - assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning"); - - MDNode *OrigLoopID = L->getLoopID(); - - LLVM_DEBUG(dbgs() << "\nPointers:\n"); - LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + + BasicBlock *PH = L->getLoopPreheader(); + + LAI = &GetLAA(*L); + + // Currently, we only distribute to isolate the part of the loop with + // dependence cycles to enable partial vectorization. + if (LAI->canVectorizeMemory()) + return fail("MemOpsCanBeVectorized", + "memory operations are safe for vectorization"); + + auto *Dependences = LAI->getDepChecker().getDependences(); + if (!Dependences || Dependences->empty()) + return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); + + InstPartitionContainer Partitions(L, LI, DT); + + // First, go through each memory operation and assign them to consecutive + // partitions (the order of partitions follows program order). Put those + // with unsafe dependences into "cyclic" partition otherwise put each store + // in its own "non-cyclic" partition (we'll merge these later). + // + // Note that a memory operation (e.g. Load2 below) at a program point that + // has an unsafe dependence (Store3->Load1) spanning over it must be + // included in the same cyclic partition as the dependent operations. This + // is to preserve the original program order after distribution. E.g.: + // + // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive + // Load1 -. 1 0->1 + // Load2 | /Unsafe/ 0 1 + // Store3 -' -1 1->0 + // Load4 0 0 + // + // NumUnsafeDependencesActive > 0 indicates this situation and in this case + // we just keep assigning to the same cyclic partition until + // NumUnsafeDependencesActive reaches 0. + const MemoryDepChecker &DepChecker = LAI->getDepChecker(); + MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), + *Dependences); + + int NumUnsafeDependencesActive = 0; + for (auto &InstDep : MID) { + Instruction *I = InstDep.Inst; + // We update NumUnsafeDependencesActive post-instruction, catch the + // start of a dependence directly via NumUnsafeDependencesStartOrEnd. + if (NumUnsafeDependencesActive || + InstDep.NumUnsafeDependencesStartOrEnd > 0) + Partitions.addToCyclicPartition(I); + else + Partitions.addToNewNonCyclicPartition(I); + NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; + assert(NumUnsafeDependencesActive >= 0 && + "Negative number of dependences active"); + } + + // Add partitions for values used outside. These partitions can be out of + // order from the original program order. This is OK because if the + // partition uses a load we will merge this partition with the original + // partition of the load that we set up in the previous loop (see + // mergeToAvoidDuplicatedLoads). + auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); + for (auto *Inst : DefsUsedOutside) + Partitions.addToNewNonCyclicPartition(Inst); + + LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); + if (Partitions.getSize() < 2) + return fail("CantIsolateUnsafeDeps", + "cannot isolate unsafe dependencies"); + + // Run the merge heuristics: Merge non-cyclic adjacent partitions since we + // should be able to vectorize these together. 
+ Partitions.mergeBeforePopulating(); + LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); + if (Partitions.getSize() < 2) + return fail("CantIsolateUnsafeDeps", + "cannot isolate unsafe dependencies"); + + // Now, populate the partitions with non-memory operations. + Partitions.populateUsedSet(); + LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); + + // In order to preserve original lexical order for loads, keep them in the + // partition that we set up in the MemoryInstructionDependences loop. + if (Partitions.mergeToAvoidDuplicatedLoads()) { + LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" + << Partitions); + if (Partitions.getSize() < 2) + return fail("CantIsolateUnsafeDeps", + "cannot isolate unsafe dependencies"); + } + + // Don't distribute the loop if we need too many SCEV run-time checks, or + // any if it's illegal. + const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate(); + if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) { + return fail("RuntimeCheckWithConvergent", + "may not insert runtime check with convergent operation"); + } + + if (Pred.getComplexity() > (IsForced.getValueOr(false) + ? PragmaDistributeSCEVCheckThreshold + : DistributeSCEVCheckThreshold)) + return fail("TooManySCEVRuntimeChecks", + "too many SCEV run-time checks needed.\n"); + + if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L)) + return fail("HeuristicDisabled", "distribution heuristic disabled"); + + LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); + // We're done forming the partitions set up the reverse mapping from + // instructions to partitions. + Partitions.setupPartitionIdOnInstructions(); + + // If we need run-time checks, version the loop now. + auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); + const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); + const auto &AllChecks = RtPtrChecking->getChecks(); + auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, + RtPtrChecking); + + if (LAI->hasConvergentOp() && !Checks.empty()) { + return fail("RuntimeCheckWithConvergent", + "may not insert runtime check with convergent operation"); + } + + // To keep things simple have an empty preheader before we version or clone + // the loop. (Also split if this has no predecessor, i.e. entry, because we + // rely on PH having a predecessor.) + if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) + SplitBlock(PH, PH->getTerminator(), DT, LI); + + if (!Pred.isAlwaysTrue() || !Checks.empty()) { + assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning"); + + MDNode *OrigLoopID = L->getLoopID(); + + LLVM_DEBUG(dbgs() << "\nPointers:\n"); + LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE); - LVer.versionLoop(DefsUsedOutside); - LVer.annotateLoopWithNoAlias(); - - // The unversioned loop will not be changed, so we inherit all attributes - // from the original loop, but remove the loop distribution metadata to - // avoid to distribute it again. - MDNode *UnversionedLoopID = - makeFollowupLoopID(OrigLoopID, - {LLVMLoopDistributeFollowupAll, - LLVMLoopDistributeFollowupFallback}, - "llvm.loop.distribute.", true) - .getValue(); - LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID); - } - - // Create identical copies of the original loop for each partition and hook - // them up sequentially. 
-
- // Create identical copies of the original loop for each partition and hook
- // them up sequentially.
- Partitions.cloneLoops();
-
- // Now, we remove the instructions from each loop that don't belong to that
- // partition.
- Partitions.removeUnusedInsts();
- LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
- LLVM_DEBUG(Partitions.printBlocks());
-
- if (LDistVerify) {
- LI->verify(*DT);
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
- }
-
- ++NumLoopsDistributed;
- // Report success.
- ORE->emit([&]() {
- return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
- L->getHeader())
- << "distributed loop";
- });
- return true;
- }
-
- /// Provide diagnostics, then \return false.
- bool fail(StringRef RemarkName, StringRef Message) {
- LLVMContext &Ctx = F->getContext();
- bool Forced = isForced().getValueOr(false);
-
- LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
-
- // With -Rpass-missed, report that distribution failed.
- ORE->emit([&]() {
- return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
- L->getStartLoc(), L->getHeader())
- << "loop not distributed: use -Rpass-analysis=loop-distribute for "
- "more "
- "info";
- });
-
- // With -Rpass-analysis, report why. This is on by default if distribution
- // was requested explicitly.
- ORE->emit(OptimizationRemarkAnalysis(
- Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
- RemarkName, L->getStartLoc(), L->getHeader())
- << "loop not distributed: " << Message);
-
- // Also issue a warning if distribution was requested explicitly but it
- // failed.
- if (Forced)
- Ctx.diagnose(DiagnosticInfoOptimizationFailure(
- *F, L->getStartLoc(), "loop not distributed: failed "
- "explicitly specified loop distribution"));
-
- return false;
- }
-
- /// Return whether distribution is forced to be enabled/disabled for the loop.
- ///
- /// If the optional has a value, it indicates whether distribution was forced
- /// to be enabled (true) or disabled (false). If the optional has no value,
- /// distribution was not forced either way.
- const Optional<bool> &isForced() const { return IsForced; }
-
-private:
- /// Filter out checks between pointers from the same partition.
- ///
- /// \p PtrToPartition contains the partition number for pointers. Partition
- /// number -1 means that the pointer is used in multiple partitions. In this
- /// case we can't safely omit the check.
- SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
- const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
- const SmallVectorImpl<int> &PtrToPartition,
- const RuntimePointerChecking *RtPtrChecking) {
- SmallVector<RuntimePointerCheck, 4> Checks;
-
- copy_if(AllChecks, std::back_inserter(Checks),
- [&](const RuntimePointerCheck &Check) {
- for (unsigned PtrIdx1 : Check.first->Members)
- for (unsigned PtrIdx2 : Check.second->Members)
- // Only include this check if there is a pair of pointers
- // that require checking and the pointers fall into
- // separate partitions.
- //
- // (Note that we already know at this point that the two
- // pointer groups need checking, but it doesn't follow
- // that each pair of pointers within the two groups needs
- // checking as well.
- //
- // In other words, we don't want to include a check just
- // because there is a pair of pointers between the two
- // pointer groups that require checks and a different
- // pair whose pointers fall into different partitions.)
- if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
- !RuntimePointerChecking::arePointersInSamePartition(
- PtrToPartition, PtrIdx1, PtrIdx2))
- return true;
- return false;
- });
-
- return Checks;
- }
-
- /// Check whether the loop metadata is forcing distribution to be
- /// enabled/disabled.
- void setForced() {
- Optional<const MDOperand *> Value =
- findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
- if (!Value)
- return;
-
- const MDOperand *Op = *Value;
- assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
- IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
- }
-
- Loop *L;
- Function *F;
-
- // Analyses used.
- LoopInfo *LI;
- const LoopAccessInfo *LAI = nullptr;
- DominatorTree *DT;
- ScalarEvolution *SE;
- OptimizationRemarkEmitter *ORE;
-
- /// Indicates whether distribution is forced to be enabled/disabled for
- /// the loop.
- ///
- /// If the optional has a value, it indicates whether distribution was forced
- /// to be enabled (true) or disabled (false). If the optional has no value,
- /// distribution was not forced either way.
- Optional<bool> IsForced;
-};
-
-} // end anonymous namespace
-
-/// Shared implementation between new and old PMs.
-static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
- ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
- std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
- // Build up a worklist of inner loops to distribute. This is necessary as
- // the act of distributing a loop creates new loops and can invalidate
- // iterators across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *TopLevelLoop : *LI)
- for (Loop *L : depth_first(TopLevelLoop))
- // We only handle inner-most loops.
+ LVer.versionLoop(DefsUsedOutside);
+ LVer.annotateLoopWithNoAlias();
+
+ // The unversioned loop will not be changed, so we inherit all attributes
+ // from the original loop, but remove the loop distribution metadata to
+ // avoid distributing it again.
+ MDNode *UnversionedLoopID =
+ makeFollowupLoopID(OrigLoopID,
+ {LLVMLoopDistributeFollowupAll,
+ LLVMLoopDistributeFollowupFallback},
+ "llvm.loop.distribute.", true)
+ .getValue();
+ LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
+ }
+
+ // Create identical copies of the original loop for each partition and hook
+ // them up sequentially.
+ Partitions.cloneLoops();
+
+ // Now, we remove the instructions from each loop that don't belong to that
+ // partition.
+ Partitions.removeUnusedInsts();
+ LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+ LLVM_DEBUG(Partitions.printBlocks());
+
+ if (LDistVerify) {
+ LI->verify(*DT);
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+ }
+
+ ++NumLoopsDistributed;
+ // Report success.
+ ORE->emit([&]() {
+ return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
+ L->getHeader())
+ << "distributed loop";
+ });
+ return true;
+ }
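For orientation, a hedged source-level sketch of what the clone-and-prune steps above accomplish (illustrative C-like code, not taken from this file and not the IR the pass actually operates on): the classic candidate has one statement in a dependence cycle and one independent statement, and distribution splits them so the second loop becomes vectorizable.

    void before(int *A, int *B, int *C, int *D, int *E, int N) {
      for (int I = 0; I < N; I++) {
        A[I + 1] = A[I] + B[I]; // dependence cycle: blocks vectorization
        C[I] = D[I] * E[I];     // independent of the cycle
      }
    }

    void after(int *A, int *B, int *C, int *D, int *E, int N) {
      for (int I = 0; I < N; I++) // "cyclic" partition, stays scalar
        A[I + 1] = A[I] + B[I];
      for (int I = 0; I < N; I++) // "non-cyclic" partition, now vectorizable
        C[I] = D[I] * E[I];
    }

Reordering the stores like this is legal only when the arrays don't overlap, which is exactly what the memory checks and versioning established above; absent a compile-time proof, the versioned fallback loop keeps the original code.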
+
+ /// Provide diagnostics, then \return false.
+ bool fail(StringRef RemarkName, StringRef Message) {
+ LLVMContext &Ctx = F->getContext();
+ bool Forced = isForced().getValueOr(false);
+
+ LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
+
+ // With -Rpass-missed, report that distribution failed.
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
+ L->getStartLoc(), L->getHeader())
+ << "loop not distributed: use -Rpass-analysis=loop-distribute for "
+ "more "
+ "info";
+ });
+
+ // With -Rpass-analysis, report why. This is on by default if distribution
+ // was requested explicitly.
+ ORE->emit(OptimizationRemarkAnalysis(
+ Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
+ RemarkName, L->getStartLoc(), L->getHeader())
+ << "loop not distributed: " << Message);
+
+ // Also issue a warning if distribution was requested explicitly but it
+ // failed.
+ if (Forced)
+ Ctx.diagnose(DiagnosticInfoOptimizationFailure(
+ *F, L->getStartLoc(), "loop not distributed: failed "
+ "explicitly specified loop distribution"));
+
+ return false;
+ }
+
+ /// Return whether distribution is forced to be enabled/disabled for the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value,
+ /// distribution was not forced either way.
+ const Optional<bool> &isForced() const { return IsForced; }
+
+private:
+ /// Filter out checks between pointers from the same partition.
+ ///
+ /// \p PtrToPartition contains the partition number for pointers. Partition
+ /// number -1 means that the pointer is used in multiple partitions. In this
+ /// case we can't safely omit the check.
+ SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
+ const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
+ const SmallVectorImpl<int> &PtrToPartition,
+ const RuntimePointerChecking *RtPtrChecking) {
+ SmallVector<RuntimePointerCheck, 4> Checks;
+
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking, but it doesn't follow
+ // that each pair of pointers within the two groups needs
+ // checking as well.
+ //
+ // In other words, we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
+
+ return Checks;
+ }
+
+ /// Check whether the loop metadata is forcing distribution to be
+ /// enabled/disabled.
+ void setForced() {
+ Optional<const MDOperand *> Value =
+ findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
+ if (!Value)
+ return;
+
+ const MDOperand *Op = *Value;
+ assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+ IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+ }
+
+ Loop *L;
+ Function *F;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo *LAI = nullptr;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ OptimizationRemarkEmitter *ORE;
+
+ /// Indicates whether distribution is forced to be enabled/disabled for
+ /// the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value,
+ /// distribution was not forced either way.
+ Optional<bool> IsForced;
+};
+
+} // end anonymous namespace
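A toy model (plain C++, not the LLVM API) of the partition test that includeOnlyCrossPartitionChecks relies on; -1 is the multiple-partition sentinel described in its doc comment, and pointers carrying it always keep their checks:

    #include <cassert>
    #include <vector>

    // Mirrors RuntimePointerChecking::arePointersInSamePartition: a pointer
    // used in multiple partitions (-1) never safely shares a partition, so a
    // check between it and anything else is kept.
    static bool inSamePartition(const std::vector<int> &PtrToPartition,
                                unsigned I, unsigned J) {
      return PtrToPartition[I] != -1 && PtrToPartition[I] == PtrToPartition[J];
    }

    int main() {
      // Pointers 0 and 1 share partition 0; pointer 2 is in partition 1;
      // pointer 3 is used in multiple partitions.
      std::vector<int> PtrToPartition = {0, 0, 1, -1};
      assert(inSamePartition(PtrToPartition, 0, 1));  // same partition: omit
      assert(!inSamePartition(PtrToPartition, 0, 2)); // cross partition: keep
      assert(!inSamePartition(PtrToPartition, 0, 3)); // multi-partition: keep
    }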
+
+/// Shared implementation between new and old PMs.
+static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
+ // Build up a worklist of inner loops to distribute. This is necessary as
+ // the act of distributing a loop creates new loops and can invalidate
+ // iterators across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
if (L->isInnermost())
- Worklist.push_back(L);
-
- // Now walk the identified inner loops.
- bool Changed = false;
- for (Loop *L : Worklist) {
- LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
-
- // If distribution was forced for the specific loop to be
- // enabled/disabled, follow that. Otherwise use the global flag.
- if (LDL.isForced().getValueOr(EnableLoopDistribute))
- Changed |= LDL.processLoop(GetLAA);
- }
-
- // Process each loop nest in the function.
- return Changed;
-}
-
-namespace {
-
-/// The pass class.
-class LoopDistributeLegacy : public FunctionPass {
-public:
- static char ID;
-
- LoopDistributeLegacy() : FunctionPass(ID) {
- // The default is set by the caller.
- initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
-
- return runImpl(F, LI, DT, SE, ORE, GetLAA);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses LoopDistributePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- // We don't directly need these analyses but they're required for loop
- // analyses, so provide them below.
- auto &AA = AM.getResult<AAManager>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
-
- auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & {
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
+
+ // If distribution was forced for the specific loop to be
+ // enabled/disabled, follow that. Otherwise use the global flag.
+ if (LDL.isForced().getValueOr(EnableLoopDistribute))
+ Changed |= LDL.processLoop(GetLAA);
+ }
+
+ // Process each loop nest in the function.
+ return Changed;
+}
+
+namespace {
+
+/// The pass class.
+class LoopDistributeLegacy : public FunctionPass {
+public:
+ static char ID;
+
+ LoopDistributeLegacy() : FunctionPass(ID) {
+ // The default is set by the caller.
+ initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return runImpl(F, LI, DT, SE, ORE, GetLAA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses LoopDistributePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ // We don't directly need these analyses but they're required for loop
+ // analyses, so provide them below.
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI,
nullptr, nullptr};
- return LAM.getResult<LoopAccessAnalysis>(L, AR);
- };
-
- bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-char LoopDistributeLegacy::ID;
-
-static const char ldist_name[] = "Loop Distribution";
-
-INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
- false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
-
-FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); }
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ };
+
+ bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char LoopDistributeLegacy::ID;
+
+static const char ldist_name[] = "Loop Distribution";
+
+INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
+
+FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); }
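Finally, a usage sketch: per-loop forcing goes through the llvm.loop.distribute.enable metadata that setForced() reads, and with clang that metadata comes from the loop pragma shown below. The global default corresponds to the EnableLoopDistribute flag referenced in runImpl (command-line spelling assumed to be -enable-loop-distribute). Compiling with -Rpass=loop-distribute should surface the "distributed loop" remark emitted in processLoop.

    // Requests distribution for this loop only; clang lowers the pragma to
    // llvm.loop.distribute.enable metadata on the loop.
    void Compute(int *A, int *B, int *C, int *D, int *E, int N) {
    #pragma clang loop distribute(enable)
      for (int I = 0; I < N; I++) {
        A[I + 1] = A[I] + B[I];
        C[I] = D[I] * E[I];
      }
    }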