diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp')
-rw-r--r-- | contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp | 1882 |
1 files changed, 1882 insertions, 0 deletions
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp new file mode 100644 index 0000000000..b5f8dfa9aa --- /dev/null +++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp @@ -0,0 +1,1882 @@ +//===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the loop fusion pass. +/// The implementation is largely based on the following document: +/// +/// Code Transformations to Augment the Scope of Loop Fusion in a +/// Production Compiler +/// Christopher Mark Barton +/// MSc Thesis +/// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf +/// +/// The general approach taken is to collect sets of control flow equivalent +/// loops and test whether they can be fused. The necessary conditions for +/// fusion are: +/// 1. The loops must be adjacent (there cannot be any statements between +/// the two loops). +/// 2. The loops must be conforming (they must execute the same number of +/// iterations). +/// 3. The loops must be control flow equivalent (if one loop executes, the +/// other is guaranteed to execute). +/// 4. There cannot be any negative distance dependencies between the loops. +/// If all of these conditions are satisfied, it is safe to fuse the loops. +/// +/// This implementation creates FusionCandidates that represent the loop and the +/// necessary information needed by fusion. It then operates on the fusion +/// candidates, first confirming that the candidate is eligible for fusion. The +/// candidates are then collected into control flow equivalent sets, sorted in +/// dominance order. Each set of control flow equivalent candidates is then +/// traversed, attempting to fuse pairs of candidates in the set. If all +/// requirements for fusion are met, the two candidates are fused, creating a +/// new (fused) candidate which is then added back into the set to consider for +/// additional fusion. +/// +/// This implementation currently does not make any modifications to remove +/// conditions for fusion. Code transformations to make loops conform to each of +/// the conditions for fusion are discussed in more detail in the document +/// above. These can be added to the current implementation in the future. +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopFuse.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/CodeMoverUtils.h" +#include "llvm/Transforms/Utils/LoopPeel.h" + +using namespace llvm; + +#define DEBUG_TYPE "loop-fusion" + +STATISTIC(FuseCounter, "Loops fused"); +STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); +STATISTIC(InvalidPreheader, "Loop has invalid preheader"); +STATISTIC(InvalidHeader, "Loop has invalid header"); +STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks"); +STATISTIC(InvalidExitBlock, "Loop has invalid exit block"); +STATISTIC(InvalidLatch, "Loop has invalid latch"); +STATISTIC(InvalidLoop, "Loop is invalid"); +STATISTIC(AddressTakenBB, "Basic block has address taken"); +STATISTIC(MayThrowException, "Loop may throw an exception"); +STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); +STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); +STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); +STATISTIC(UnknownTripCount, "Loop has unknown trip count"); +STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); +STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); +STATISTIC(NonAdjacent, "Loops are not adjacent"); +STATISTIC( + NonEmptyPreheader, + "Loop has a non-empty preheader with instructions that cannot be moved"); +STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); +STATISTIC(NonIdenticalGuards, "Candidates have different guards"); +STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " + "instructions that cannot be moved"); +STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " + "instructions that cannot be moved"); +STATISTIC(NotRotated, "Candidate is not rotated"); + +enum FusionDependenceAnalysisChoice { + FUSION_DEPENDENCE_ANALYSIS_SCEV, + FUSION_DEPENDENCE_ANALYSIS_DA, + FUSION_DEPENDENCE_ANALYSIS_ALL, +}; + +static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( + "loop-fusion-dependence-analysis", + cl::desc("Which dependence analysis should loop fusion use?"), + cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", + "Use the scalar evolution interface"), + clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", + "Use the dependence analysis interface"), + clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", + "Use all available analyses")), + cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore); + +static cl::opt<unsigned> FusionPeelMaxCount( + "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, + cl::desc("Max number of iterations to be peeled from a loop, such that " + "fusion can take place")); + +#ifndef NDEBUG +static cl::opt<bool> + VerboseFusionDebugging("loop-fusion-verbose-debug", + cl::desc("Enable verbose debugging for Loop Fusion"), + cl::Hidden, cl::init(false), cl::ZeroOrMore); +#endif + +namespace { +/// This class is used to represent a candidate for loop fusion. When it is +/// constructed, it checks the conditions for loop fusion to ensure that it +/// represents a valid candidate. It caches several parts of a loop that are +/// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead +/// of continually querying the underlying Loop to retrieve these values. It is +/// assumed these will not change throughout loop fusion. +/// +/// The invalidate method should be used to indicate that the FusionCandidate is +/// no longer a valid candidate for fusion. Similarly, the isValid() method can +/// be used to ensure that the FusionCandidate is still valid for fusion. +struct FusionCandidate { + /// Cache of parts of the loop used throughout loop fusion. These should not + /// need to change throughout the analysis and transformation. + /// These parts are cached to avoid repeatedly looking up in the Loop class. + + /// Preheader of the loop this candidate represents + BasicBlock *Preheader; + /// Header of the loop this candidate represents + BasicBlock *Header; + /// Blocks in the loop that exit the loop + BasicBlock *ExitingBlock; + /// The successor block of this loop (where the exiting blocks go to) + BasicBlock *ExitBlock; + /// Latch of the loop + BasicBlock *Latch; + /// The loop that this fusion candidate represents + Loop *L; + /// Vector of instructions in this loop that read from memory + SmallVector<Instruction *, 16> MemReads; + /// Vector of instructions in this loop that write to memory + SmallVector<Instruction *, 16> MemWrites; + /// Are all of the members of this fusion candidate still valid + bool Valid; + /// Guard branch of the loop, if it exists + BranchInst *GuardBranch; + /// Peeling Paramaters of the Loop. + TTI::PeelingPreferences PP; + /// Can you Peel this Loop? + bool AbleToPeel; + /// Has this loop been Peeled + bool Peeled; + + /// Dominator and PostDominator trees are needed for the + /// FusionCandidateCompare function, required by FusionCandidateSet to + /// determine where the FusionCandidate should be inserted into the set. These + /// are used to establish ordering of the FusionCandidates based on dominance. + const DominatorTree *DT; + const PostDominatorTree *PDT; + + OptimizationRemarkEmitter &ORE; + + FusionCandidate(Loop *L, const DominatorTree *DT, + const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE, + TTI::PeelingPreferences PP) + : Preheader(L->getLoopPreheader()), Header(L->getHeader()), + ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), + Latch(L->getLoopLatch()), L(L), Valid(true), + GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), + Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { + + // Walk over all blocks in the loop and check for conditions that may + // prevent fusion. For each block, walk over all instructions and collect + // the memory reads and writes If any instructions that prevent fusion are + // found, invalidate this object and return. + for (BasicBlock *BB : L->blocks()) { + if (BB->hasAddressTaken()) { + invalidate(); + reportInvalidCandidate(AddressTakenBB); + return; + } + + for (Instruction &I : *BB) { + if (I.mayThrow()) { + invalidate(); + reportInvalidCandidate(MayThrowException); + return; + } + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { + if (SI->isVolatile()) { + invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); + return; + } + } + if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { + if (LI->isVolatile()) { + invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); + return; + } + } + if (I.mayWriteToMemory()) + MemWrites.push_back(&I); + if (I.mayReadFromMemory()) + MemReads.push_back(&I); + } + } + } + + /// Check if all members of the class are valid. + bool isValid() const { + return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && + !L->isInvalid() && Valid; + } + + /// Verify that all members are in sync with the Loop object. + void verify() const { + assert(isValid() && "Candidate is not valid!!"); + assert(!L->isInvalid() && "Loop is invalid!"); + assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync"); + assert(Header == L->getHeader() && "Header is out of sync"); + assert(ExitingBlock == L->getExitingBlock() && + "Exiting Blocks is out of sync"); + assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync"); + assert(Latch == L->getLoopLatch() && "Latch is out of sync"); + } + + /// Get the entry block for this fusion candidate. + /// + /// If this fusion candidate represents a guarded loop, the entry block is the + /// loop guard block. If it represents an unguarded loop, the entry block is + /// the preheader of the loop. + BasicBlock *getEntryBlock() const { + if (GuardBranch) + return GuardBranch->getParent(); + else + return Preheader; + } + + /// After Peeling the loop is modified quite a bit, hence all of the Blocks + /// need to be updated accordingly. + void updateAfterPeeling() { + Preheader = L->getLoopPreheader(); + Header = L->getHeader(); + ExitingBlock = L->getExitingBlock(); + ExitBlock = L->getExitBlock(); + Latch = L->getLoopLatch(); + verify(); + } + + /// Given a guarded loop, get the successor of the guard that is not in the + /// loop. + /// + /// This method returns the successor of the loop guard that is not located + /// within the loop (i.e., the successor of the guard that is not the + /// preheader). + /// This method is only valid for guarded loops. + BasicBlock *getNonLoopBlock() const { + assert(GuardBranch && "Only valid on guarded loops."); + assert(GuardBranch->isConditional() && + "Expecting guard to be a conditional branch."); + if (Peeled) + return GuardBranch->getSuccessor(1); + return (GuardBranch->getSuccessor(0) == Preheader) + ? GuardBranch->getSuccessor(1) + : GuardBranch->getSuccessor(0); + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { + dbgs() << "\tGuardBranch: "; + if (GuardBranch) + dbgs() << *GuardBranch; + else + dbgs() << "nullptr"; + dbgs() << "\n" + << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" + << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") + << "\n" + << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" + << "\tExitingBB: " + << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" + << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") + << "\n" + << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" + << "\tEntryBlock: " + << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") + << "\n"; + } +#endif + + /// Determine if a fusion candidate (representing a loop) is eligible for + /// fusion. Note that this only checks whether a single loop can be fused - it + /// does not check whether it is *legal* to fuse two loops together. + bool isEligibleForFusion(ScalarEvolution &SE) const { + if (!isValid()) { + LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); + if (!Preheader) + ++InvalidPreheader; + if (!Header) + ++InvalidHeader; + if (!ExitingBlock) + ++InvalidExitingBlock; + if (!ExitBlock) + ++InvalidExitBlock; + if (!Latch) + ++InvalidLatch; + if (L->isInvalid()) + ++InvalidLoop; + + return false; + } + + // Require ScalarEvolution to be able to determine a trip count. + if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " trip count not computable!\n"); + return reportInvalidCandidate(UnknownTripCount); + } + + if (!L->isLoopSimplifyForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " is not in simplified form!\n"); + return reportInvalidCandidate(NotSimplifiedForm); + } + + if (!L->isRotatedForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); + return reportInvalidCandidate(NotRotated); + } + + return true; + } + +private: + // This is only used internally for now, to clear the MemWrites and MemReads + // list and setting Valid to false. I can't envision other uses of this right + // now, since once FusionCandidates are put into the FusionCandidateSet they + // are immutable. Thus, any time we need to change/update a FusionCandidate, + // we must create a new one and insert it into the FusionCandidateSet to + // ensure the FusionCandidateSet remains ordered correctly. + void invalidate() { + MemWrites.clear(); + MemReads.clear(); + Valid = false; + } + + bool reportInvalidCandidate(llvm::Statistic &Stat) const { + using namespace ore; + assert(L && Preheader && "Fusion candidate not initialized properly!"); + ++Stat; + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), + L->getStartLoc(), Preheader) + << "[" << Preheader->getParent()->getName() << "]: " + << "Loop is not a candidate for fusion: " << Stat.getDesc()); + return false; + } +}; + +struct FusionCandidateCompare { + /// Comparison functor to sort two Control Flow Equivalent fusion candidates + /// into dominance order. + /// If LHS dominates RHS and RHS post-dominates LHS, return true; + /// IF RHS dominates LHS and LHS post-dominates RHS, return false; + bool operator()(const FusionCandidate &LHS, + const FusionCandidate &RHS) const { + const DominatorTree *DT = LHS.DT; + + BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); + BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); + + // Do not save PDT to local variable as it is only used in asserts and thus + // will trigger an unused variable warning if building without asserts. + assert(DT && LHS.PDT && "Expecting valid dominator tree"); + + // Do this compare first so if LHS == RHS, function returns false. + if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { + // RHS dominates LHS + // Verify LHS post-dominates RHS + assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); + return false; + } + + if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { + // Verify RHS Postdominates LHS + assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); + return true; + } + + // If LHS does not dominate RHS and RHS does not dominate LHS then there is + // no dominance relationship between the two FusionCandidates. Thus, they + // should not be in the same set together. + llvm_unreachable( + "No dominance relationship between these fusion candidates!"); + } +}; + +using LoopVector = SmallVector<Loop *, 4>; + +// Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance +// order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 +// dominates FC1 and FC1 post-dominates FC0. +// std::set was chosen because we want a sorted data structure with stable +// iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent +// loops by moving intervening code around. When this intervening code contains +// loops, those loops will be moved also. The corresponding FusionCandidates +// will also need to be moved accordingly. As this is done, having stable +// iterators will simplify the logic. Similarly, having an efficient insert that +// keeps the FusionCandidateSet sorted will also simplify the implementation. +using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; +using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; + +#if !defined(NDEBUG) +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const FusionCandidate &FC) { + if (FC.isValid()) + OS << FC.Preheader->getName(); + else + OS << "<Invalid>"; + + return OS; +} + +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const FusionCandidateSet &CandSet) { + for (const FusionCandidate &FC : CandSet) + OS << FC << '\n'; + + return OS; +} + +static void +printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { + dbgs() << "Fusion Candidates: \n"; + for (const auto &CandidateSet : FusionCandidates) { + dbgs() << "*** Fusion Candidate Set ***\n"; + dbgs() << CandidateSet; + dbgs() << "****************************\n"; + } +} +#endif + +/// Collect all loops in function at the same nest level, starting at the +/// outermost level. +/// +/// This data structure collects all loops at the same nest level for a +/// given function (specified by the LoopInfo object). It starts at the +/// outermost level. +struct LoopDepthTree { + using LoopsOnLevelTy = SmallVector<LoopVector, 4>; + using iterator = LoopsOnLevelTy::iterator; + using const_iterator = LoopsOnLevelTy::const_iterator; + + LoopDepthTree(LoopInfo &LI) : Depth(1) { + if (!LI.empty()) + LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend())); + } + + /// Test whether a given loop has been removed from the function, and thus is + /// no longer valid. + bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); } + + /// Record that a given loop has been removed from the function and is no + /// longer valid. + void removeLoop(const Loop *L) { RemovedLoops.insert(L); } + + /// Descend the tree to the next (inner) nesting level + void descend() { + LoopsOnLevelTy LoopsOnNextLevel; + + for (const LoopVector &LV : *this) + for (Loop *L : LV) + if (!isRemovedLoop(L) && L->begin() != L->end()) + LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end())); + + LoopsOnLevel = LoopsOnNextLevel; + RemovedLoops.clear(); + Depth++; + } + + bool empty() const { return size() == 0; } + size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } + unsigned getDepth() const { return Depth; } + + iterator begin() { return LoopsOnLevel.begin(); } + iterator end() { return LoopsOnLevel.end(); } + const_iterator begin() const { return LoopsOnLevel.begin(); } + const_iterator end() const { return LoopsOnLevel.end(); } + +private: + /// Set of loops that have been removed from the function and are no longer + /// valid. + SmallPtrSet<const Loop *, 8> RemovedLoops; + + /// Depth of the current level, starting at 1 (outermost loops). + unsigned Depth; + + /// Vector of loops at the current depth level that have the same parent loop + LoopsOnLevelTy LoopsOnLevel; +}; + +#ifndef NDEBUG +static void printLoopVector(const LoopVector &LV) { + dbgs() << "****************************\n"; + for (auto L : LV) + printLoop(*L, dbgs()); + dbgs() << "****************************\n"; +} +#endif + +struct LoopFuser { +private: + // Sets of control flow equivalent fusion candidates for a given nest level. + FusionCandidateCollection FusionCandidates; + + LoopDepthTree LDT; + DomTreeUpdater DTU; + + LoopInfo &LI; + DominatorTree &DT; + DependenceInfo &DI; + ScalarEvolution &SE; + PostDominatorTree &PDT; + OptimizationRemarkEmitter &ORE; + AssumptionCache &AC; + + const TargetTransformInfo &TTI; + +public: + LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, + ScalarEvolution &SE, PostDominatorTree &PDT, + OptimizationRemarkEmitter &ORE, const DataLayout &DL, + AssumptionCache &AC, const TargetTransformInfo &TTI) + : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), + DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} + + /// This is the main entry point for loop fusion. It will traverse the + /// specified function and collect candidate loops to fuse, starting at the + /// outermost nesting level and working inwards. + bool fuseLoops(Function &F) { +#ifndef NDEBUG + if (VerboseFusionDebugging) { + LI.print(dbgs()); + } +#endif + + LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() + << "\n"); + bool Changed = false; + + while (!LDT.empty()) { + LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " + << LDT.getDepth() << "\n";); + + for (const LoopVector &LV : LDT) { + assert(LV.size() > 0 && "Empty loop set was build!"); + + // Skip singleton loop sets as they do not offer fusion opportunities on + // this level. + if (LV.size() == 1) + continue; +#ifndef NDEBUG + if (VerboseFusionDebugging) { + LLVM_DEBUG({ + dbgs() << " Visit loop set (#" << LV.size() << "):\n"; + printLoopVector(LV); + }); + } +#endif + + collectFusionCandidates(LV); + Changed |= fuseCandidates(); + } + + // Finished analyzing candidates at this level. + // Descend to the next level and clear all of the candidates currently + // collected. Note that it will not be possible to fuse any of the + // existing candidates with new candidates because the new candidates will + // be at a different nest level and thus not be control flow equivalent + // with all of the candidates collected so far. + LLVM_DEBUG(dbgs() << "Descend one level!\n"); + LDT.descend(); + FusionCandidates.clear(); + } + + if (Changed) + LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump();); + +#ifndef NDEBUG + assert(DT.verify()); + assert(PDT.verify()); + LI.verify(DT); + SE.verify(); +#endif + + LLVM_DEBUG(dbgs() << "Loop Fusion complete\n"); + return Changed; + } + +private: + /// Determine if two fusion candidates are control flow equivalent. + /// + /// Two fusion candidates are control flow equivalent if when one executes, + /// the other is guaranteed to execute. This is determined using dominators + /// and post-dominators: if A dominates B and B post-dominates A then A and B + /// are control-flow equivalent. + bool isControlFlowEquivalent(const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); + + return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), + DT, PDT); + } + + /// Iterate over all loops in the given loop set and identify the loops that + /// are eligible for fusion. Place all eligible fusion candidates into Control + /// Flow Equivalent sets, sorted by dominance. + void collectFusionCandidates(const LoopVector &LV) { + for (Loop *L : LV) { + TTI::PeelingPreferences PP = + gatherPeelingPreferences(L, SE, TTI, None, None); + FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP); + if (!CurrCand.isEligibleForFusion(SE)) + continue; + + // Go through each list in FusionCandidates and determine if L is control + // flow equivalent with the first loop in that list. If it is, append LV. + // If not, go to the next list. + // If no suitable list is found, start another list and add it to + // FusionCandidates. + bool FoundSet = false; + + for (auto &CurrCandSet : FusionCandidates) { + if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) { + CurrCandSet.insert(CurrCand); + FoundSet = true; +#ifndef NDEBUG + if (VerboseFusionDebugging) + LLVM_DEBUG(dbgs() << "Adding " << CurrCand + << " to existing candidate set\n"); +#endif + break; + } + } + if (!FoundSet) { + // No set was found. Create a new set and add to FusionCandidates +#ifndef NDEBUG + if (VerboseFusionDebugging) + LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n"); +#endif + FusionCandidateSet NewCandSet; + NewCandSet.insert(CurrCand); + FusionCandidates.push_back(NewCandSet); + } + NumFusionCandidates++; + } + } + + /// Determine if it is beneficial to fuse two loops. + /// + /// For now, this method simply returns true because we want to fuse as much + /// as possible (primarily to test the pass). This method will evolve, over + /// time, to add heuristics for profitability of fusion. + bool isBeneficialFusion(const FusionCandidate &FC0, + const FusionCandidate &FC1) { + return true; + } + + /// Determine if two fusion candidates have the same trip count (i.e., they + /// execute the same number of iterations). + /// + /// This function will return a pair of values. The first is a boolean, + /// stating whether or not the two candidates are known at compile time to + /// have the same TripCount. The second is the difference in the two + /// TripCounts. This information can be used later to determine whether or not + /// peeling can be performed on either one of the candiates. + std::pair<bool, Optional<unsigned>> + haveIdenticalTripCounts(const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + + const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); + if (isa<SCEVCouldNotCompute>(TripCount0)) { + UncomputableTripCount++; + LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); + return {false, None}; + } + + const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); + if (isa<SCEVCouldNotCompute>(TripCount1)) { + UncomputableTripCount++; + LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); + return {false, None}; + } + + LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " + << *TripCount1 << " are " + << (TripCount0 == TripCount1 ? "identical" : "different") + << "\n"); + + if (TripCount0 == TripCount1) + return {true, 0}; + + LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " + "determining the difference between trip counts\n"); + + // Currently only considering loops with a single exit point + // and a non-constant trip count. + const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L); + const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L); + + // If any of the tripcounts are zero that means that loop(s) do not have + // a single exit or a constant tripcount. + if (TC0 == 0 || TC1 == 0) { + LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " + "have a constant number of iterations. Peeling " + "is not benefical\n"); + return {false, None}; + } + + Optional<unsigned> Difference = None; + int Diff = TC0 - TC1; + + if (Diff > 0) + Difference = Diff; + else { + LLVM_DEBUG( + dbgs() << "Difference is less than 0. FC1 (second loop) has more " + "iterations than the first one. Currently not supported\n"); + } + + LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference + << "\n"); + + return {false, Difference}; + } + + void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, + unsigned PeelCount) { + assert(FC0.AbleToPeel && "Should be able to peel loop"); + + LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount + << " iterations of the first loop. \n"); + + FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, &DT, &AC, true); + if (FC0.Peeled) { + LLVM_DEBUG(dbgs() << "Done Peeling\n"); + +#ifndef NDEBUG + auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); + + assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && + "Loops should have identical trip counts after peeling"); +#endif + + FC0.PP.PeelCount += PeelCount; + + // Peeling does not update the PDT + PDT.recalculate(*FC0.Preheader->getParent()); + + FC0.updateAfterPeeling(); + + // In this case the iterations of the loop are constant, so the first + // loop will execute completely (will not jump from one of + // the peeled blocks to the second loop). Here we are updating the + // branch conditions of each of the peeled blocks, such that it will + // branch to its successor which is not the preheader of the second loop + // in the case of unguarded loops, or the succesors of the exit block of + // the first loop otherwise. Doing this update will ensure that the entry + // block of the first loop dominates the entry block of the second loop. + BasicBlock *BB = + FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; + if (BB) { + SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; + SmallVector<Instruction *, 8> WorkList; + for (BasicBlock *Pred : predecessors(BB)) { + if (Pred != FC0.ExitBlock) { + WorkList.emplace_back(Pred->getTerminator()); + TreeUpdates.emplace_back( + DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); + } + } + // Cannot modify the predecessors inside the above loop as it will cause + // the iterators to be nullptrs, causing memory errors. + for (Instruction *CurrentBranch: WorkList) { + BasicBlock *Succ = CurrentBranch->getSuccessor(0); + if (Succ == BB) + Succ = CurrentBranch->getSuccessor(1); + ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ)); + } + + DTU.applyUpdates(TreeUpdates); + DTU.flush(); + } + LLVM_DEBUG( + dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount + << " iterations from the first loop.\n" + "Both Loops have the same number of iterations now.\n"); + } + } + + /// Walk each set of control flow equivalent fusion candidates and attempt to + /// fuse them. This does a single linear traversal of all candidates in the + /// set. The conditions for legal fusion are checked at this point. If a pair + /// of fusion candidates passes all legality checks, they are fused together + /// and a new fusion candidate is created and added to the FusionCandidateSet. + /// The original fusion candidates are then removed, as they are no longer + /// valid. + bool fuseCandidates() { + bool Fused = false; + LLVM_DEBUG(printFusionCandidates(FusionCandidates)); + for (auto &CandidateSet : FusionCandidates) { + if (CandidateSet.size() < 2) + continue; + + LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" + << CandidateSet << "\n"); + + for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { + assert(!LDT.isRemovedLoop(FC0->L) && + "Should not have removed loops in CandidateSet!"); + auto FC1 = FC0; + for (++FC1; FC1 != CandidateSet.end(); ++FC1) { + assert(!LDT.isRemovedLoop(FC1->L) && + "Should not have removed loops in CandidateSet!"); + + LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump(); + dbgs() << " with\n"; FC1->dump(); dbgs() << "\n"); + + FC0->verify(); + FC1->verify(); + + // Check if the candidates have identical tripcounts (first value of + // pair), and if not check the difference in the tripcounts between + // the loops (second value of pair). The difference is not equal to + // None iff the loops iterate a constant number of times, and have a + // single exit. + std::pair<bool, Optional<unsigned>> IdenticalTripCountRes = + haveIdenticalTripCounts(*FC0, *FC1); + bool SameTripCount = IdenticalTripCountRes.first; + Optional<unsigned> TCDifference = IdenticalTripCountRes.second; + + // Here we are checking that FC0 (the first loop) can be peeled, and + // both loops have different tripcounts. + if (FC0->AbleToPeel && !SameTripCount && TCDifference) { + if (*TCDifference > FusionPeelMaxCount) { + LLVM_DEBUG(dbgs() + << "Difference in loop trip counts: " << *TCDifference + << " is greater than maximum peel count specificed: " + << FusionPeelMaxCount << "\n"); + } else { + // Dependent on peeling being performed on the first loop, and + // assuming all other conditions for fusion return true. + SameTripCount = true; + } + } + + if (!SameTripCount) { + LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " + "counts. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEqualTripCount); + continue; + } + + if (!isAdjacent(*FC0, *FC1)) { + LLVM_DEBUG(dbgs() + << "Fusion candidates are not adjacent. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); + continue; + } + + // Ensure that FC0 and FC1 have identical guards. + // If one (or both) are not guarded, this check is not necessary. + if (FC0->GuardBranch && FC1->GuardBranch && + !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) { + LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " + "guards. Not Fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonIdenticalGuards); + continue; + } + + if (!isSafeToMoveBefore(*FC1->Preheader, + *FC0->Preheader->getTerminator(), DT, &PDT, + &DI)) { + LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " + "instructions in preheader. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyPreheader); + continue; + } + + if (FC0->GuardBranch) { + assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); + + if (!isSafeToMoveBefore(*FC0->ExitBlock, + *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, + &PDT, &DI)) { + LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " + "instructions in exit block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyExitBlock); + continue; + } + + if (!isSafeToMoveBefore( + *FC1->GuardBranch->getParent(), + *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT, + &DI)) { + LLVM_DEBUG(dbgs() + << "Fusion candidate contains unsafe " + "instructions in guard block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyGuardBlock); + continue; + } + } + + // Check the dependencies across the loops and do not fuse if it would + // violate them. + if (!dependencesAllowFusion(*FC0, *FC1)) { + LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + InvalidDependencies); + continue; + } + + bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); + LLVM_DEBUG(dbgs() + << "\tFusion appears to be " + << (BeneficialToFuse ? "" : "un") << "profitable!\n"); + if (!BeneficialToFuse) { + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + FusionNotBeneficial); + continue; + } + // All analysis has completed and has determined that fusion is legal + // and profitable. At this point, start transforming the code and + // perform fusion. + + LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " + << *FC1 << "\n"); + + FusionCandidate FC0Copy = *FC0; + // Peel the loop after determining that fusion is legal. The Loops + // will still be safe to fuse after the peeling is performed. + bool Peel = TCDifference && *TCDifference > 0; + if (Peel) + peelFusionCandidate(FC0Copy, *FC1, *TCDifference); + + // Report fusion to the Optimization Remarks. + // Note this needs to be done *before* performFusion because + // performFusion will change the original loops, making it not + // possible to identify them after fusion is complete. + reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1, + FuseCounter); + + FusionCandidate FusedCand( + performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE, + FC0Copy.PP); + FusedCand.verify(); + assert(FusedCand.isEligibleForFusion(SE) && + "Fused candidate should be eligible for fusion!"); + + // Notify the loop-depth-tree that these loops are not valid objects + LDT.removeLoop(FC1->L); + + CandidateSet.erase(FC0); + CandidateSet.erase(FC1); + + auto InsertPos = CandidateSet.insert(FusedCand); + + assert(InsertPos.second && + "Unable to insert TargetCandidate in CandidateSet!"); + + // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations + // of the FC1 loop will attempt to fuse the new (fused) loop with the + // remaining candidates in the current candidate set. + FC0 = FC1 = InsertPos.first; + + LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet + << "\n"); + + Fused = true; + } + } + } + return Fused; + } + + /// Rewrite all additive recurrences in a SCEV to use a new loop. + class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { + public: + AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, + bool UseMax = true) + : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), + NewL(NewL) {} + + const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { + const Loop *ExprL = Expr->getLoop(); + SmallVector<const SCEV *, 2> Operands; + if (ExprL == &OldL) { + Operands.append(Expr->op_begin(), Expr->op_end()); + return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); + } + + if (OldL.contains(ExprL)) { + bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); + if (!UseMax || !Pos || !Expr->isAffine()) { + Valid = false; + return Expr; + } + return visit(Expr->getStart()); + } + + for (const SCEV *Op : Expr->operands()) + Operands.push_back(visit(Op)); + return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); + } + + bool wasValidSCEV() const { return Valid; } + + private: + bool Valid, UseMax; + const Loop &OldL, &NewL; + }; + + /// Return false if the access functions of \p I0 and \p I1 could cause + /// a negative dependence. + bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, + Instruction &I1, bool EqualIsInvalid) { + Value *Ptr0 = getLoadStorePointerOperand(&I0); + Value *Ptr1 = getLoadStorePointerOperand(&I1); + if (!Ptr0 || !Ptr1) + return false; + + const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); + const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); +#ifndef NDEBUG + if (VerboseFusionDebugging) + LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " + << *SCEVPtr1 << "\n"); +#endif + AddRecLoopReplacer Rewriter(SE, L0, L1); + SCEVPtr0 = Rewriter.visit(SCEVPtr0); +#ifndef NDEBUG + if (VerboseFusionDebugging) + LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 + << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); +#endif + if (!Rewriter.wasValidSCEV()) + return false; + + // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by + // L0) and the other is not. We could check if it is monotone and test + // the beginning and end value instead. + + BasicBlock *L0Header = L0.getHeader(); + auto HasNonLinearDominanceRelation = [&](const SCEV *S) { + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S); + if (!AddRec) + return false; + return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && + !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); + }; + if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) + return false; + + ICmpInst::Predicate Pred = + EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; + bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); +#ifndef NDEBUG + if (VerboseFusionDebugging) + LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 + << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 + << "\n"); +#endif + return IsAlwaysGE; + } + + /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in + /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses + /// specified by @p DepChoice are used to determine this. + bool dependencesAllowFusion(const FusionCandidate &FC0, + const FusionCandidate &FC1, Instruction &I0, + Instruction &I1, bool AnyDep, + FusionDependenceAnalysisChoice DepChoice) { +#ifndef NDEBUG + if (VerboseFusionDebugging) { + LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " + << DepChoice << "\n"); + } +#endif + switch (DepChoice) { + case FUSION_DEPENDENCE_ANALYSIS_SCEV: + return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); + case FUSION_DEPENDENCE_ANALYSIS_DA: { + auto DepResult = DI.depends(&I0, &I1, true); + if (!DepResult) + return true; +#ifndef NDEBUG + if (VerboseFusionDebugging) { + LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); + dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " + << (DepResult->isOrdered() ? "true" : "false") + << "]\n"); + LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() + << "\n"); + } +#endif + + if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) + LLVM_DEBUG( + dbgs() << "TODO: Implement pred/succ dependence handling!\n"); + + // TODO: Can we actually use the dependence info analysis here? + return false; + } + + case FUSION_DEPENDENCE_ANALYSIS_ALL: + return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, + FUSION_DEPENDENCE_ANALYSIS_SCEV) || + dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, + FUSION_DEPENDENCE_ANALYSIS_DA); + } + + llvm_unreachable("Unknown fusion dependence analysis choice!"); + } + + /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. + bool dependencesAllowFusion(const FusionCandidate &FC0, + const FusionCandidate &FC1) { + LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 + << "\n"); + assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); + assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); + + for (Instruction *WriteL0 : FC0.MemWrites) { + for (Instruction *WriteL1 : FC1.MemWrites) + if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, + /* AnyDep */ false, + FusionDependenceAnalysis)) { + InvalidDependencies++; + return false; + } + for (Instruction *ReadL1 : FC1.MemReads) + if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, + /* AnyDep */ false, + FusionDependenceAnalysis)) { + InvalidDependencies++; + return false; + } + } + + for (Instruction *WriteL1 : FC1.MemWrites) { + for (Instruction *WriteL0 : FC0.MemWrites) + if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, + /* AnyDep */ false, + FusionDependenceAnalysis)) { + InvalidDependencies++; + return false; + } + for (Instruction *ReadL0 : FC0.MemReads) + if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, + /* AnyDep */ false, + FusionDependenceAnalysis)) { + InvalidDependencies++; + return false; + } + } + + // Walk through all uses in FC1. For each use, find the reaching def. If the + // def is located in FC0 then it is is not safe to fuse. + for (BasicBlock *BB : FC1.L->blocks()) + for (Instruction &I : *BB) + for (auto &Op : I.operands()) + if (Instruction *Def = dyn_cast<Instruction>(Op)) + if (FC0.L->contains(Def->getParent())) { + InvalidDependencies++; + return false; + } + + return true; + } + + /// Determine if two fusion candidates are adjacent in the CFG. + /// + /// This method will determine if there are additional basic blocks in the CFG + /// between the exit of \p FC0 and the entry of \p FC1. + /// If the two candidates are guarded loops, then it checks whether the + /// non-loop successor of the \p FC0 guard branch is the entry block of \p + /// FC1. If not, then the loops are not adjacent. If the two candidates are + /// not guarded loops, then it checks whether the exit block of \p FC0 is the + /// preheader of \p FC1. + bool isAdjacent(const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + // If the successor of the guard branch is FC1, then the loops are adjacent + if (FC0.GuardBranch) + return FC0.getNonLoopBlock() == FC1.getEntryBlock(); + else + return FC0.ExitBlock == FC1.getEntryBlock(); + } + + /// Determine if two fusion candidates have identical guards + /// + /// This method will determine if two fusion candidates have the same guards. + /// The guards are considered the same if: + /// 1. The instructions to compute the condition used in the compare are + /// identical. + /// 2. The successors of the guard have the same flow into/around the loop. + /// If the compare instructions are identical, then the first successor of the + /// guard must go to the same place (either the preheader of the loop or the + /// NonLoopBlock). In other words, the the first successor of both loops must + /// both go into the loop (i.e., the preheader) or go around the loop (i.e., + /// the NonLoopBlock). The same must be true for the second successor. + bool haveIdenticalGuards(const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + assert(FC0.GuardBranch && FC1.GuardBranch && + "Expecting FC0 and FC1 to be guarded loops."); + + if (auto FC0CmpInst = + dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) + if (auto FC1CmpInst = + dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) + if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) + return false; + + // The compare instructions are identical. + // Now make sure the successor of the guards have the same flow into/around + // the loop + if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) + return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); + else + return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); + } + + /// Modify the latch branch of FC to be unconditional since successors of the + /// branch are the same. + void simplifyLatchBranch(const FusionCandidate &FC) const { + BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); + if (FCLatchBranch) { + assert(FCLatchBranch->isConditional() && + FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && + "Expecting the two successors of FCLatchBranch to be the same"); + BranchInst *NewBranch = + BranchInst::Create(FCLatchBranch->getSuccessor(0)); + ReplaceInstWithInst(FCLatchBranch, NewBranch); + } + } + + /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique + /// successor, then merge FC0.Latch with its unique successor. + void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { + moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI); + if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { + MergeBlockIntoPredecessor(Succ, &DTU, &LI); + DTU.flush(); + } + } + + /// Fuse two fusion candidates, creating a new fused loop. + /// + /// This method contains the mechanics of fusing two loops, represented by \p + /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 + /// postdominates \p FC0 (making them control flow equivalent). It also + /// assumes that the other conditions for fusion have been met: adjacent, + /// identical trip counts, and no negative distance dependencies exist that + /// would prevent fusion. Thus, there is no checking for these conditions in + /// this method. + /// + /// Fusion is performed by rewiring the CFG to update successor blocks of the + /// components of tho loop. Specifically, the following changes are done: + /// + /// 1. The preheader of \p FC1 is removed as it is no longer necessary + /// (because it is currently only a single statement block). + /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. + /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. + /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. + /// + /// All of these modifications are done with dominator tree updates, thus + /// keeping the dominator (and post dominator) information up-to-date. + /// + /// This can be improved in the future by actually merging blocks during + /// fusion. For example, the preheader of \p FC1 can be merged with the + /// preheader of \p FC0. This would allow loops with more than a single + /// statement in the preheader to be fused. Similarly, the latch blocks of the + /// two loops could also be fused into a single block. This will require + /// analysis to prove it is safe to move the contents of the block past + /// existing code, which currently has not been implemented. + Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { + assert(FC0.isValid() && FC1.isValid() && + "Expecting valid fusion candidates"); + + LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); + dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); + + // Move instructions from the preheader of FC1 to the end of the preheader + // of FC0. + moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI); + + // Fusing guarded loops is handled slightly differently than non-guarded + // loops and has been broken out into a separate method instead of trying to + // intersperse the logic within a single method. + if (FC0.GuardBranch) + return fuseGuardedLoops(FC0, FC1); + + assert(FC1.Preheader == + (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); + assert(FC1.Preheader->size() == 1 && + FC1.Preheader->getSingleSuccessor() == FC1.Header); + + // Remember the phi nodes originally in the header of FC0 in order to rewire + // them later. However, this is only necessary if the new loop carried + // values might not dominate the exiting branch. While we do not generally + // test if this is the case but simply insert intermediate phi nodes, we + // need to make sure these intermediate phi nodes have different + // predecessors. To this end, we filter the special case where the exiting + // block is the latch block of the first loop. Nothing needs to be done + // anyway as all loop carried values dominate the latch and thereby also the + // exiting branch. + SmallVector<PHINode *, 8> OriginalFC0PHIs; + if (FC0.ExitingBlock != FC0.Latch) + for (PHINode &PHI : FC0.Header->phis()) + OriginalFC0PHIs.push_back(&PHI); + + // Replace incoming blocks for header PHIs first. + FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); + FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); + + // Then modify the control flow and update DT and PDT. + SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; + + // The old exiting block of the first loop (FC0) has to jump to the header + // of the second as we need to execute the code in the second header block + // regardless of the trip count. That is, if the trip count is 0, so the + // back edge is never taken, we still have to execute both loop headers, + // especially (but not only!) if the second is a do-while style loop. + // However, doing so might invalidate the phi nodes of the first loop as + // the new values do only need to dominate their latch and not the exiting + // predicate. To remedy this potential problem we always introduce phi + // nodes in the header of the second loop later that select the loop carried + // value, if the second header was reached through an old latch of the + // first, or undef otherwise. This is sound as exiting the first implies the + // second will exit too, __without__ taking the back-edge. [Their + // trip-counts are equal after all. + // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go + // to FC1.Header? I think this is basically what the three sequences are + // trying to accomplish; however, doing this directly in the CFG may mean + // the DT/PDT becomes invalid + if (!FC0.Peeled) { + FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, + FC1.Header); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); + } else { + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); + + // Remove the ExitBlock of the first Loop (also not needed) + FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, + FC1.Header); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); + FC0.ExitBlock->getTerminator()->eraseFromParent(); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); + new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); + } + + // The pre-header of L1 is not necessary anymore. + assert(pred_empty(FC1.Preheader)); + FC1.Preheader->getTerminator()->eraseFromParent(); + new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1.Preheader, FC1.Header)); + + // Moves the phi nodes from the second to the first loops header block. + while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { + if (SE.isSCEVable(PHI->getType())) + SE.forgetValue(PHI); + if (PHI->hasNUsesOrMore(1)) + PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); + else + PHI->eraseFromParent(); + } + + // Introduce new phi nodes in the second loop header to ensure + // exiting the first and jumping to the header of the second does not break + // the SSA property of the phis originally in the first loop. See also the + // comment above. + Instruction *L1HeaderIP = &FC1.Header->front(); + for (PHINode *LCPHI : OriginalFC0PHIs) { + int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); + assert(L1LatchBBIdx >= 0 && + "Expected loop carried value to be rewired at this point!"); + + Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); + + PHINode *L1HeaderPHI = PHINode::Create( + LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); + L1HeaderPHI->addIncoming(LCV, FC0.Latch); + L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), + FC0.ExitingBlock); + + LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); + } + + // Replace latch terminator destinations. + FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); + FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + + // Modify the latch branch of FC0 to be unconditional as both successors of + // the branch are the same. + simplifyLatchBranch(FC0); + + // If FC0.Latch and FC0.ExitingBlock are the same then we have already + // performed the updates above. + if (FC0.Latch != FC0.ExitingBlock) + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.Latch, FC1.Header)); + + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC0.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, + FC1.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC1.Latch, FC1.Header)); + + // Update DT/PDT + DTU.applyUpdates(TreeUpdates); + + LI.removeBlock(FC1.Preheader); + DTU.deleteBB(FC1.Preheader); + if (FC0.Peeled) { + LI.removeBlock(FC0.ExitBlock); + DTU.deleteBB(FC0.ExitBlock); + } + + DTU.flush(); + + // Is there a way to keep SE up-to-date so we don't need to forget the loops + // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. + SE.forgetLoop(FC1.L); + SE.forgetLoop(FC0.L); + + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + + // Merge the loops. + SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); + for (BasicBlock *BB : Blocks) { + FC0.L->addBlockEntry(BB); + FC1.L->removeBlockFromLoop(BB); + if (LI.getLoopFor(BB) != FC1.L) + continue; + LI.changeLoopFor(BB, FC0.L); + } + while (!FC1.L->isInnermost()) { + const auto &ChildLoopIt = FC1.L->begin(); + Loop *ChildLoop = *ChildLoopIt; + FC1.L->removeChildLoop(ChildLoopIt); + FC0.L->addChildLoop(ChildLoop); + } + + // Delete the now empty loop L1. + LI.erase(FC1.L); + +#ifndef NDEBUG + assert(!verifyFunction(*FC0.Header->getParent(), &errs())); + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + assert(PDT.verify()); + LI.verify(DT); + SE.verify(); +#endif + + LLVM_DEBUG(dbgs() << "Fusion done:\n"); + + return FC0.L; + } + + /// Report details on loop fusion opportunities. + /// + /// This template function can be used to report both successful and missed + /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should + /// be one of: + /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful + /// given two valid fusion candidates. + /// - OptimizationRemark to report successful fusion of two fusion + /// candidates. + /// The remarks will be printed using the form: + /// <path/filename>:<line number>:<column number>: [<function name>]: + /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> + template <typename RemarkKind> + void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, + llvm::Statistic &Stat) { + assert(FC0.Preheader && FC1.Preheader && + "Expecting valid fusion candidates"); + using namespace ore; + ++Stat; + ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), + FC0.Preheader) + << "[" << FC0.Preheader->getParent()->getName() + << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) + << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) + << ": " << Stat.getDesc()); + } + + /// Fuse two guarded fusion candidates, creating a new fused loop. + /// + /// Fusing guarded loops is handled much the same way as fusing non-guarded + /// loops. The rewiring of the CFG is slightly different though, because of + /// the presence of the guards around the loops and the exit blocks after the + /// loop body. As such, the new loop is rewired as follows: + /// 1. Keep the guard branch from FC0 and use the non-loop block target + /// from the FC1 guard branch. + /// 2. Remove the exit block from FC0 (this exit block should be empty + /// right now). + /// 3. Remove the guard branch for FC1 + /// 4. Remove the preheader for FC1. + /// The exit block successor for the latch of FC0 is updated to be the header + /// of FC1 and the non-exit block successor of the latch of FC1 is updated to + /// be the header of FC0, thus creating the fused loop. + Loop *fuseGuardedLoops(const FusionCandidate &FC0, + const FusionCandidate &FC1) { + assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); + + BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); + BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); + BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); + BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); + BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); + + // Move instructions from the exit block of FC0 to the beginning of the exit + // block of FC1, in the case that the FC0 loop has not been peeled. In the + // case that FC0 loop is peeled, then move the instructions of the successor + // of the FC0 Exit block to the beginning of the exit block of FC1. + moveInstructionsToTheBeginning( + (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock, + DT, PDT, DI); + + // Move instructions from the guard block of FC1 to the end of the guard + // block of FC0. + moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI); + + assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); + + SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; + + //////////////////////////////////////////////////////////////////////////// + // Update the Loop Guard + //////////////////////////////////////////////////////////////////////////// + // The guard for FC0 is updated to guard both FC0 and FC1. This is done by + // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. + // Thus, one path from the guard goes to the preheader for FC0 (and thus + // executes the new fused loop) and the other path goes to the NonLoopBlock + // for FC1 (where FC1 guard would have gone if FC1 was not executed). + FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock); + FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); + + BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; + BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header); + + // The guard of FC1 is not necessary anymore. + FC1.GuardBranch->eraseFromParent(); + new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); + + if (FC0.Peeled) { + // Remove the Block after the ExitBlock of FC0 + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); + FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); + new UnreachableInst(FC0ExitBlockSuccessor->getContext(), + FC0ExitBlockSuccessor); + } + + assert(pred_empty(FC1GuardBlock) && + "Expecting guard block to have no predecessors"); + assert(succ_empty(FC1GuardBlock) && + "Expecting guard block to have no successors"); + + // Remember the phi nodes originally in the header of FC0 in order to rewire + // them later. However, this is only necessary if the new loop carried + // values might not dominate the exiting branch. While we do not generally + // test if this is the case but simply insert intermediate phi nodes, we + // need to make sure these intermediate phi nodes have different + // predecessors. To this end, we filter the special case where the exiting + // block is the latch block of the first loop. Nothing needs to be done + // anyway as all loop carried values dominate the latch and thereby also the + // exiting branch. + // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch + // (because the loops are rotated. Thus, nothing will ever be added to + // OriginalFC0PHIs. + SmallVector<PHINode *, 8> OriginalFC0PHIs; + if (FC0.ExitingBlock != FC0.Latch) + for (PHINode &PHI : FC0.Header->phis()) + OriginalFC0PHIs.push_back(&PHI); + + assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); + + // Replace incoming blocks for header PHIs first. + FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); + FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); + + // The old exiting block of the first loop (FC0) has to jump to the header + // of the second as we need to execute the code in the second header block + // regardless of the trip count. That is, if the trip count is 0, so the + // back edge is never taken, we still have to execute both loop headers, + // especially (but not only!) if the second is a do-while style loop. + // However, doing so might invalidate the phi nodes of the first loop as + // the new values do only need to dominate their latch and not the exiting + // predicate. To remedy this potential problem we always introduce phi + // nodes in the header of the second loop later that select the loop carried + // value, if the second header was reached through an old latch of the + // first, or undef otherwise. This is sound as exiting the first implies the + // second will exit too, __without__ taking the back-edge (their + // trip-counts are equal after all). + FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, + FC1.Header); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); + + // Remove FC0 Exit Block + // The exit block for FC0 is no longer needed since control will flow + // directly to the header of FC1. Since it is an empty block, it can be + // removed at this point. + // TODO: In the future, we can handle non-empty exit blocks my merging any + // instructions from FC0 exit block into FC1 exit block prior to removing + // the block. + assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty"); + FC0.ExitBlock->getTerminator()->eraseFromParent(); + new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); + + // Remove FC1 Preheader + // The pre-header of L1 is not necessary anymore. + assert(pred_empty(FC1.Preheader)); + FC1.Preheader->getTerminator()->eraseFromParent(); + new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1.Preheader, FC1.Header)); + + // Moves the phi nodes from the second to the first loops header block. + while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { + if (SE.isSCEVable(PHI->getType())) + SE.forgetValue(PHI); + if (PHI->hasNUsesOrMore(1)) + PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); + else + PHI->eraseFromParent(); + } + + // Introduce new phi nodes in the second loop header to ensure + // exiting the first and jumping to the header of the second does not break + // the SSA property of the phis originally in the first loop. See also the + // comment above. + Instruction *L1HeaderIP = &FC1.Header->front(); + for (PHINode *LCPHI : OriginalFC0PHIs) { + int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); + assert(L1LatchBBIdx >= 0 && + "Expected loop carried value to be rewired at this point!"); + + Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); + + PHINode *L1HeaderPHI = PHINode::Create( + LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); + L1HeaderPHI->addIncoming(LCV, FC0.Latch); + L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), + FC0.ExitingBlock); + + LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); + } + + // Update the latches + + // Replace latch terminator destinations. + FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); + FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + + // Modify the latch branch of FC0 to be unconditional as both successors of + // the branch are the same. + simplifyLatchBranch(FC0); + + // If FC0.Latch and FC0.ExitingBlock are the same then we have already + // performed the updates above. + if (FC0.Latch != FC0.ExitingBlock) + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.Latch, FC1.Header)); + + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC0.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, + FC1.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC1.Latch, FC1.Header)); + + // All done + // Apply the updates to the Dominator Tree and cleanup. + + assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!"); + assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!"); + + // Update DT/PDT + DTU.applyUpdates(TreeUpdates); + + LI.removeBlock(FC1GuardBlock); + LI.removeBlock(FC1.Preheader); + LI.removeBlock(FC0.ExitBlock); + if (FC0.Peeled) { + LI.removeBlock(FC0ExitBlockSuccessor); + DTU.deleteBB(FC0ExitBlockSuccessor); + } + DTU.deleteBB(FC1GuardBlock); + DTU.deleteBB(FC1.Preheader); + DTU.deleteBB(FC0.ExitBlock); + DTU.flush(); + + // Is there a way to keep SE up-to-date so we don't need to forget the loops + // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. + SE.forgetLoop(FC1.L); + SE.forgetLoop(FC0.L); + + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + + // Merge the loops. + SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); + for (BasicBlock *BB : Blocks) { + FC0.L->addBlockEntry(BB); + FC1.L->removeBlockFromLoop(BB); + if (LI.getLoopFor(BB) != FC1.L) + continue; + LI.changeLoopFor(BB, FC0.L); + } + while (!FC1.L->isInnermost()) { + const auto &ChildLoopIt = FC1.L->begin(); + Loop *ChildLoop = *ChildLoopIt; + FC1.L->removeChildLoop(ChildLoopIt); + FC0.L->addChildLoop(ChildLoop); + } + + // Delete the now empty loop L1. + LI.erase(FC1.L); + +#ifndef NDEBUG + assert(!verifyFunction(*FC0.Header->getParent(), &errs())); + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + assert(PDT.verify()); + LI.verify(DT); + SE.verify(); +#endif + + LLVM_DEBUG(dbgs() << "Fusion done:\n"); + + return FC0.L; + } +}; + +struct LoopFuseLegacy : public FunctionPass { + + static char ID; + + LoopFuseLegacy() : FunctionPass(ID) { + initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredID(LoopSimplifyID); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.addRequired<DependenceAnalysisWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<PostDominatorTreeWrapperPass>(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + const DataLayout &DL = F.getParent()->getDataLayout(); + + LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); + return LF.fuseLoops(F); + } +}; +} // namespace + +PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &DI = AM.getResult<DependenceAnalysis>(F); + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); + auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); + const DataLayout &DL = F.getParent()->getDataLayout(); + + LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); + bool Changed = LF.fuseLoops(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<PostDominatorTreeAnalysis>(); + PA.preserve<ScalarEvolutionAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + +char LoopFuseLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, + false) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) + +FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); } |