author | orivej <orivej@yandex-team.ru> | 2022-02-10 16:45:01 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:01 +0300 |
commit | 2d37894b1b037cf24231090eda8589bbb44fb6fc (patch) | |
tree | be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/libs/llvm12/lib/Analysis/BlockFrequencyInfoImpl.cpp | |
parent | 718c552901d703c502ccbefdfc3c9028d608b947 (diff) | |
download | ydb-2d37894b1b037cf24231090eda8589bbb44fb6fc.tar.gz | |
Restoring authorship annotation for <orivej@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Analysis/BlockFrequencyInfoImpl.cpp')
-rw-r--r-- | contrib/libs/llvm12/lib/Analysis/BlockFrequencyInfoImpl.cpp | 1732 |
1 file changed, 866 insertions, 866 deletions
diff --git a/contrib/libs/llvm12/lib/Analysis/BlockFrequencyInfoImpl.cpp b/contrib/libs/llvm12/lib/Analysis/BlockFrequencyInfoImpl.cpp index db5cc43e97..e4fda2472b 100644 --- a/contrib/libs/llvm12/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/contrib/libs/llvm12/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -1,866 +1,866 @@ -//===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Loops should be simplified before this analysis. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/BlockFrequencyInfoImpl.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/GraphTraits.h" -#include "llvm/ADT/None.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/Config/llvm-config.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/BlockFrequency.h" -#include "llvm/Support/BranchProbability.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ScaledNumber.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include <algorithm> -#include <cassert> -#include <cstddef> -#include <cstdint> -#include <iterator> -#include <list> -#include <numeric> -#include <utility> -#include <vector> - -using namespace llvm; -using namespace llvm::bfi_detail; - -#define DEBUG_TYPE "block-freq" - -cl::opt<bool> CheckBFIUnknownBlockQueries( - "check-bfi-unknown-block-queries", - cl::init(false), cl::Hidden, - cl::desc("Check if block frequency is queried for an unknown block " - "for debugging missed BFI updates")); - -ScaledNumber<uint64_t> BlockMass::toScaled() const { - if (isFull()) - return ScaledNumber<uint64_t>(1, 0); - return ScaledNumber<uint64_t>(getMass() + 1, -64); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); } -#endif - -static char getHexDigit(int N) { - assert(N < 16); - if (N < 10) - return '0' + N; - return 'a' + N - 10; -} - -raw_ostream &BlockMass::print(raw_ostream &OS) const { - for (int Digits = 0; Digits < 16; ++Digits) - OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf); - return OS; -} - -namespace { - -using BlockNode = BlockFrequencyInfoImplBase::BlockNode; -using Distribution = BlockFrequencyInfoImplBase::Distribution; -using WeightList = BlockFrequencyInfoImplBase::Distribution::WeightList; -using Scaled64 = BlockFrequencyInfoImplBase::Scaled64; -using LoopData = BlockFrequencyInfoImplBase::LoopData; -using Weight = BlockFrequencyInfoImplBase::Weight; -using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData; - -/// Dithering mass distributer. -/// -/// This class splits up a single mass into portions by weight, dithering to -/// spread out error. No mass is lost. The dithering precision depends on the -/// precision of the product of \a BlockMass and \a BranchProbability. -/// -/// The distribution algorithm follows. -/// -/// 1. Initialize by saving the sum of the weights in \a RemWeight and the -/// mass to distribute in \a RemMass. -/// -/// 2. For each portion: -/// -/// 1. Construct a branch probability, P, as the portion's weight divided -/// by the current value of \a RemWeight. -/// 2. 
Calculate the portion's mass as \a RemMass times P. -/// 3. Update \a RemWeight and \a RemMass at each portion by subtracting -/// the current portion's weight and mass. -struct DitheringDistributer { - uint32_t RemWeight; - BlockMass RemMass; - - DitheringDistributer(Distribution &Dist, const BlockMass &Mass); - - BlockMass takeMass(uint32_t Weight); -}; - -} // end anonymous namespace - -DitheringDistributer::DitheringDistributer(Distribution &Dist, - const BlockMass &Mass) { - Dist.normalize(); - RemWeight = Dist.Total; - RemMass = Mass; -} - -BlockMass DitheringDistributer::takeMass(uint32_t Weight) { - assert(Weight && "invalid weight"); - assert(Weight <= RemWeight); - BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); - - // Decrement totals (dither). - RemWeight -= Weight; - RemMass -= Mass; - return Mass; -} - -void Distribution::add(const BlockNode &Node, uint64_t Amount, - Weight::DistType Type) { - assert(Amount && "invalid weight of 0"); - uint64_t NewTotal = Total + Amount; - - // Check for overflow. It should be impossible to overflow twice. - bool IsOverflow = NewTotal < Total; - assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow"); - DidOverflow |= IsOverflow; - - // Update the total. - Total = NewTotal; - - // Save the weight. - Weights.push_back(Weight(Type, Node, Amount)); -} - -static void combineWeight(Weight &W, const Weight &OtherW) { - assert(OtherW.TargetNode.isValid()); - if (!W.Amount) { - W = OtherW; - return; - } - assert(W.Type == OtherW.Type); - assert(W.TargetNode == OtherW.TargetNode); - assert(OtherW.Amount && "Expected non-zero weight"); - if (W.Amount > W.Amount + OtherW.Amount) - // Saturate on overflow. - W.Amount = UINT64_MAX; - else - W.Amount += OtherW.Amount; -} - -static void combineWeightsBySorting(WeightList &Weights) { - // Sort so edges to the same node are adjacent. - llvm::sort(Weights, [](const Weight &L, const Weight &R) { - return L.TargetNode < R.TargetNode; - }); - - // Combine adjacent edges. - WeightList::iterator O = Weights.begin(); - for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; - ++O, (I = L)) { - *O = *I; - - // Find the adjacent weights to the same node. - for (++L; L != E && I->TargetNode == L->TargetNode; ++L) - combineWeight(*O, *L); - } - - // Erase extra entries. - Weights.erase(O, Weights.end()); -} - -static void combineWeightsByHashing(WeightList &Weights) { - // Collect weights into a DenseMap. - using HashTable = DenseMap<BlockNode::IndexType, Weight>; - - HashTable Combined(NextPowerOf2(2 * Weights.size())); - for (const Weight &W : Weights) - combineWeight(Combined[W.TargetNode.Index], W); - - // Check whether anything changed. - if (Weights.size() == Combined.size()) - return; - - // Fill in the new weights. - Weights.clear(); - Weights.reserve(Combined.size()); - for (const auto &I : Combined) - Weights.push_back(I.second); -} - -static void combineWeights(WeightList &Weights) { - // Use a hash table for many successors to keep this linear. - if (Weights.size() > 128) { - combineWeightsByHashing(Weights); - return; - } - - combineWeightsBySorting(Weights); -} - -static uint64_t shiftRightAndRound(uint64_t N, int Shift) { - assert(Shift >= 0); - assert(Shift < 64); - if (!Shift) - return N; - return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); -} - -void Distribution::normalize() { - // Early exit for termination nodes. - if (Weights.empty()) - return; - - // Only bother if there are multiple successors. 
- if (Weights.size() > 1) - combineWeights(Weights); - - // Early exit when combined into a single successor. - if (Weights.size() == 1) { - Total = 1; - Weights.front().Amount = 1; - return; - } - - // Determine how much to shift right so that the total fits into 32-bits. - // - // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 - // for each weight can cause a 32-bit overflow. - int Shift = 0; - if (DidOverflow) - Shift = 33; - else if (Total > UINT32_MAX) - Shift = 33 - countLeadingZeros(Total); - - // Early exit if nothing needs to be scaled. - if (!Shift) { - // If we didn't overflow then combineWeights() shouldn't have changed the - // sum of the weights, but let's double-check. - assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0), - [](uint64_t Sum, const Weight &W) { - return Sum + W.Amount; - }) && - "Expected total to be correct"); - return; - } - - // Recompute the total through accumulation (rather than shifting it) so that - // it's accurate after shifting and any changes combineWeights() made above. - Total = 0; - - // Sum the weights to each node and shift right if necessary. - for (Weight &W : Weights) { - // Scale down below UINT32_MAX. Since Shift is larger than necessary, we - // can round here without concern about overflow. - assert(W.TargetNode.isValid()); - W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift)); - assert(W.Amount <= UINT32_MAX); - - // Update the total. - Total += W.Amount; - } - assert(Total <= UINT32_MAX); -} - -void BlockFrequencyInfoImplBase::clear() { - // Swap with a default-constructed std::vector, since std::vector<>::clear() - // does not actually clear heap storage. - std::vector<FrequencyData>().swap(Freqs); - IsIrrLoopHeader.clear(); - std::vector<WorkingData>().swap(Working); - Loops.clear(); -} - -/// Clear all memory not needed downstream. -/// -/// Releases all memory not used downstream. In particular, saves Freqs. -static void cleanup(BlockFrequencyInfoImplBase &BFI) { - std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs)); - SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader)); - BFI.clear(); - BFI.Freqs = std::move(SavedFreqs); - BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader); -} - -bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, - const LoopData *OuterLoop, - const BlockNode &Pred, - const BlockNode &Succ, - uint64_t Weight) { - if (!Weight) - Weight = 1; - - auto isLoopHeader = [&OuterLoop](const BlockNode &Node) { - return OuterLoop && OuterLoop->isHeader(Node); - }; - - BlockNode Resolved = Working[Succ.Index].getResolvedNode(); - -#ifndef NDEBUG - auto debugSuccessor = [&](const char *Type) { - dbgs() << " =>" - << " [" << Type << "] weight = " << Weight; - if (!isLoopHeader(Resolved)) - dbgs() << ", succ = " << getBlockName(Succ); - if (Resolved != Succ) - dbgs() << ", resolved = " << getBlockName(Resolved); - dbgs() << "\n"; - }; - (void)debugSuccessor; -#endif - - if (isLoopHeader(Resolved)) { - LLVM_DEBUG(debugSuccessor("backedge")); - Dist.addBackedge(Resolved, Weight); - return true; - } - - if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { - LLVM_DEBUG(debugSuccessor(" exit ")); - Dist.addExit(Resolved, Weight); - return true; - } - - if (Resolved < Pred) { - if (!isLoopHeader(Pred)) { - // If OuterLoop is an irreducible loop, we can't actually handle this. - assert((!OuterLoop || !OuterLoop->isIrreducible()) && - "unhandled irreducible control flow"); - - // Irreducible backedge. Abort. 
- LLVM_DEBUG(debugSuccessor("abort!!!")); - return false; - } - - // If "Pred" is a loop header, then this isn't really a backedge; rather, - // OuterLoop must be irreducible. These false backedges can come only from - // secondary loop headers. - assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && - "unhandled irreducible control flow"); - } - - LLVM_DEBUG(debugSuccessor(" local ")); - Dist.addLocal(Resolved, Weight); - return true; -} - -bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( - const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { - // Copy the exit map into Dist. - for (const auto &I : Loop.Exits) - if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, - I.second.getMass())) - // Irreducible backedge. - return false; - - return true; -} - -/// Compute the loop scale for a loop. -void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { - // Compute loop scale. - LLVM_DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); - - // Infinite loops need special handling. If we give the back edge an infinite - // mass, they may saturate all the other scales in the function down to 1, - // making all the other region temperatures look exactly the same. Choose an - // arbitrary scale to avoid these issues. - // - // FIXME: An alternate way would be to select a symbolic scale which is later - // replaced to be the maximum of all computed scales plus 1. This would - // appropriately describe the loop as having a large scale, without skewing - // the final frequency computation. - const Scaled64 InfiniteLoopScale(1, 12); - - // LoopScale == 1 / ExitMass - // ExitMass == HeadMass - BackedgeMass - BlockMass TotalBackedgeMass; - for (auto &Mass : Loop.BackedgeMass) - TotalBackedgeMass += Mass; - BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; - - // Block scale stores the inverse of the scale. If this is an infinite loop, - // its exit mass will be zero. In this case, use an arbitrary scale for the - // loop scale. - Loop.Scale = - ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse(); - - LLVM_DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" - << BlockMass::getFull() << " - " << TotalBackedgeMass - << ")\n" - << " - scale = " << Loop.Scale << "\n"); -} - -/// Package up a loop. -void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { - LLVM_DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n"); - - // Clear the subloop exits to prevent quadratic memory usage. - for (const BlockNode &M : Loop.Nodes) { - if (auto *Loop = Working[M.Index].getPackagedLoop()) - Loop->Exits.clear(); - LLVM_DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); - } - Loop.IsPackaged = true; -} - -#ifndef NDEBUG -static void debugAssign(const BlockFrequencyInfoImplBase &BFI, - const DitheringDistributer &D, const BlockNode &T, - const BlockMass &M, const char *Desc) { - dbgs() << " => assign " << M << " (" << D.RemMass << ")"; - if (Desc) - dbgs() << " [" << Desc << "]"; - if (T.isValid()) - dbgs() << " to " << BFI.getBlockName(T); - dbgs() << "\n"; -} -#endif - -void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, - LoopData *OuterLoop, - Distribution &Dist) { - BlockMass Mass = Working[Source.Index].getMass(); - LLVM_DEBUG(dbgs() << " => mass: " << Mass << "\n"); - - // Distribute mass to successors as laid out in Dist. - DitheringDistributer D(Dist, Mass); - - for (const Weight &W : Dist.Weights) { - // Check for a local edge (non-backedge and non-exit). 
- BlockMass Taken = D.takeMass(W.Amount); - if (W.Type == Weight::Local) { - Working[W.TargetNode.Index].getMass() += Taken; - LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); - continue; - } - - // Backedges and exits only make sense if we're processing a loop. - assert(OuterLoop && "backedge or exit outside of loop"); - - // Check for a backedge. - if (W.Type == Weight::Backedge) { - OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken; - LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back")); - continue; - } - - // This must be an exit. - assert(W.Type == Weight::Exit); - OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); - LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit")); - } -} - -static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, - const Scaled64 &Min, const Scaled64 &Max) { - // Scale the Factor to a size that creates integers. Ideally, integers would - // be scaled so that Max == UINT64_MAX so that they can be best - // differentiated. However, in the presence of large frequency values, small - // frequencies are scaled down to 1, making it impossible to differentiate - // small, unequal numbers. When the spread between Min and Max frequencies - // fits well within MaxBits, we make the scale be at least 8. - const unsigned MaxBits = 64; - const unsigned SpreadBits = (Max / Min).lg(); - Scaled64 ScalingFactor; - if (SpreadBits <= MaxBits - 3) { - // If the values are small enough, make the scaling factor at least 8 to - // allow distinguishing small values. - ScalingFactor = Min.inverse(); - ScalingFactor <<= 3; - } else { - // If the values need more than MaxBits to be represented, saturate small - // frequency values down to 1 by using a scaling factor that benefits large - // frequency values. - ScalingFactor = Scaled64(1, MaxBits) / Max; - } - - // Translate the floats to integers. - LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max - << ", factor = " << ScalingFactor << "\n"); - for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { - Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor; - BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>()); - LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " - << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled - << ", int = " << BFI.Freqs[Index].Integer << "\n"); - } -} - -/// Unwrap a loop package. -/// -/// Visits all the members of a loop, adjusting their BlockData according to -/// the loop's pseudo-node. -static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { - LLVM_DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) - << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale - << "\n"); - Loop.Scale *= Loop.Mass.toScaled(); - Loop.IsPackaged = false; - LLVM_DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n"); - - // Propagate the head scale through the loop. Since members are visited in - // RPO, the head scale will be updated by the loop scale first, and then the - // final head scale will be used for updated the rest of the members. - for (const BlockNode &N : Loop.Nodes) { - const auto &Working = BFI.Working[N.Index]; - Scaled64 &F = Working.isAPackage() ? 
Working.getPackagedLoop()->Scale - : BFI.Freqs[N.Index].Scaled; - Scaled64 New = Loop.Scale * F; - LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " - << New << "\n"); - F = New; - } -} - -void BlockFrequencyInfoImplBase::unwrapLoops() { - // Set initial frequencies from loop-local masses. - for (size_t Index = 0; Index < Working.size(); ++Index) - Freqs[Index].Scaled = Working[Index].Mass.toScaled(); - - for (LoopData &Loop : Loops) - unwrapLoop(*this, Loop); -} - -void BlockFrequencyInfoImplBase::finalizeMetrics() { - // Unwrap loop packages in reverse post-order, tracking min and max - // frequencies. - auto Min = Scaled64::getLargest(); - auto Max = Scaled64::getZero(); - for (size_t Index = 0; Index < Working.size(); ++Index) { - // Update min/max scale. - Min = std::min(Min, Freqs[Index].Scaled); - Max = std::max(Max, Freqs[Index].Scaled); - } - - // Convert to integers. - convertFloatingToInteger(*this, Min, Max); - - // Clean up data structures. - cleanup(*this); - - // Print out the final stats. - LLVM_DEBUG(dump()); -} - -BlockFrequency -BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { - if (!Node.isValid()) { -#ifndef NDEBUG - if (CheckBFIUnknownBlockQueries) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "*** Detected BFI query for unknown block " << getBlockName(Node); - report_fatal_error(OS.str()); - } -#endif - return 0; - } - return Freqs[Node.Index].Integer; -} - -Optional<uint64_t> -BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F, - const BlockNode &Node, - bool AllowSynthetic) const { - return getProfileCountFromFreq(F, getBlockFreq(Node).getFrequency(), - AllowSynthetic); -} - -Optional<uint64_t> -BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F, - uint64_t Freq, - bool AllowSynthetic) const { - auto EntryCount = F.getEntryCount(AllowSynthetic); - if (!EntryCount) - return None; - // Use 128 bit APInt to do the arithmetic to avoid overflow. - APInt BlockCount(128, EntryCount.getCount()); - APInt BlockFreq(128, Freq); - APInt EntryFreq(128, getEntryFreq()); - BlockCount *= BlockFreq; - // Rounded division of BlockCount by EntryFreq. Since EntryFreq is unsigned - // lshr by 1 gives EntryFreq/2. - BlockCount = (BlockCount + EntryFreq.lshr(1)).udiv(EntryFreq); - return BlockCount.getLimitedValue(); -} - -bool -BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) { - if (!Node.isValid()) - return false; - return IsIrrLoopHeader.test(Node.Index); -} - -Scaled64 -BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { - if (!Node.isValid()) - return Scaled64::getZero(); - return Freqs[Node.Index].Scaled; -} - -void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node, - uint64_t Freq) { - assert(Node.isValid() && "Expected valid node"); - assert(Node.Index < Freqs.size() && "Expected legal index"); - Freqs[Node.Index].Integer = Freq; -} - -std::string -BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { - return {}; -} - -std::string -BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { - return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? 
"**" : "*"); -} - -raw_ostream & -BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, - const BlockNode &Node) const { - return OS << getFloatingBlockFreq(Node); -} - -raw_ostream & -BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, - const BlockFrequency &Freq) const { - Scaled64 Block(Freq.getFrequency(), 0); - Scaled64 Entry(getEntryFreq(), 0); - - return OS << Block / Entry; -} - -void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { - Start = OuterLoop.getHeader(); - Nodes.reserve(OuterLoop.Nodes.size()); - for (auto N : OuterLoop.Nodes) - addNode(N); - indexNodes(); -} - -void IrreducibleGraph::addNodesInFunction() { - Start = 0; - for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) - if (!BFI.Working[Index].isPackaged()) - addNode(Index); - indexNodes(); -} - -void IrreducibleGraph::indexNodes() { - for (auto &I : Nodes) - Lookup[I.Node.Index] = &I; -} - -void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, - const BFIBase::LoopData *OuterLoop) { - if (OuterLoop && OuterLoop->isHeader(Succ)) - return; - auto L = Lookup.find(Succ.Index); - if (L == Lookup.end()) - return; - IrrNode &SuccIrr = *L->second; - Irr.Edges.push_back(&SuccIrr); - SuccIrr.Edges.push_front(&Irr); - ++SuccIrr.NumIn; -} - -namespace llvm { - -template <> struct GraphTraits<IrreducibleGraph> { - using GraphT = bfi_detail::IrreducibleGraph; - using NodeRef = const GraphT::IrrNode *; - using ChildIteratorType = GraphT::IrrNode::iterator; - - static NodeRef getEntryNode(const GraphT &G) { return G.StartIrr; } - static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } - static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); } -}; - -} // end namespace llvm - -/// Find extra irreducible headers. -/// -/// Find entry blocks and other blocks with backedges, which exist when \c G -/// contains irreducible sub-SCCs. -static void findIrreducibleHeaders( - const BlockFrequencyInfoImplBase &BFI, - const IrreducibleGraph &G, - const std::vector<const IrreducibleGraph::IrrNode *> &SCC, - LoopData::NodeList &Headers, LoopData::NodeList &Others) { - // Map from nodes in the SCC to whether it's an entry block. - SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC; - - // InSCC also acts the set of nodes in the graph. Seed it. - for (const auto *I : SCC) - InSCC[I] = false; - - for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { - auto &Irr = *I->first; - for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { - if (InSCC.count(P)) - continue; - - // This is an entry block. - I->second = true; - Headers.push_back(Irr.Node); - LLVM_DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) - << "\n"); - break; - } - } - assert(Headers.size() >= 2 && - "Expected irreducible CFG; -loop-info is likely invalid"); - if (Headers.size() == InSCC.size()) { - // Every block is a header. - llvm::sort(Headers); - return; - } - - // Look for extra headers from irreducible sub-SCCs. - for (const auto &I : InSCC) { - // Entry blocks are already headers. - if (I.second) - continue; - - auto &Irr = *I.first; - for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { - // Skip forward edges. - if (P->Node < Irr.Node) - continue; - - // Skip predecessors from entry blocks. These can have inverted - // ordering. - if (InSCC.lookup(P)) - continue; - - // Store the extra header. 
- Headers.push_back(Irr.Node); - LLVM_DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) - << "\n"); - break; - } - if (Headers.back() == Irr.Node) - // Added this as a header. - continue; - - // This is not a header. - Others.push_back(Irr.Node); - LLVM_DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); - } - llvm::sort(Headers); - llvm::sort(Others); -} - -static void createIrreducibleLoop( - BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, - LoopData *OuterLoop, std::list<LoopData>::iterator Insert, - const std::vector<const IrreducibleGraph::IrrNode *> &SCC) { - // Translate the SCC into RPO. - LLVM_DEBUG(dbgs() << " - found-scc\n"); - - LoopData::NodeList Headers; - LoopData::NodeList Others; - findIrreducibleHeaders(BFI, G, SCC, Headers, Others); - - auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), - Headers.end(), Others.begin(), Others.end()); - - // Update loop hierarchy. - for (const auto &N : Loop->Nodes) - if (BFI.Working[N.Index].isLoopHeader()) - BFI.Working[N.Index].Loop->Parent = &*Loop; - else - BFI.Working[N.Index].Loop = &*Loop; -} - -iterator_range<std::list<LoopData>::iterator> -BlockFrequencyInfoImplBase::analyzeIrreducible( - const IrreducibleGraph &G, LoopData *OuterLoop, - std::list<LoopData>::iterator Insert) { - assert((OuterLoop == nullptr) == (Insert == Loops.begin())); - auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); - - for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { - if (I->size() < 2) - continue; - - // Translate the SCC into RPO. - createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); - } - - if (OuterLoop) - return make_range(std::next(Prev), Insert); - return make_range(Loops.begin(), Insert); -} - -void -BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { - OuterLoop.Exits.clear(); - for (auto &Mass : OuterLoop.BackedgeMass) - Mass = BlockMass::getEmpty(); - auto O = OuterLoop.Nodes.begin() + 1; - for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) - if (!Working[I->Index].isPackaged()) - *O++ = *I; - OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); -} - -void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { - assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); - - // Since the loop has more than one header block, the mass flowing back into - // each header will be different. Adjust the mass in each header loop to - // reflect the masses flowing through back edges. - // - // To do this, we distribute the initial mass using the backedge masses - // as weights for the distribution. - BlockMass LoopMass = BlockMass::getFull(); - Distribution Dist; - - LLVM_DEBUG(dbgs() << "adjust-loop-header-mass:\n"); - for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { - auto &HeaderNode = Loop.Nodes[H]; - auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)]; - LLVM_DEBUG(dbgs() << " - Add back edge mass for node " - << getBlockName(HeaderNode) << ": " << BackedgeMass - << "\n"); - if (BackedgeMass.getMass() > 0) - Dist.addLocal(HeaderNode, BackedgeMass.getMass()); - else - LLVM_DEBUG(dbgs() << " Nothing added. 
Back edge mass is zero\n"); - } - - DitheringDistributer D(Dist, LoopMass); - - LLVM_DEBUG(dbgs() << " Distribute loop mass " << LoopMass - << " to headers using above weights\n"); - for (const Weight &W : Dist.Weights) { - BlockMass Taken = D.takeMass(W.Amount); - assert(W.Type == Weight::Local && "all weights should be local"); - Working[W.TargetNode.Index].getMass() = Taken; - LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); - } -} - -void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) { - BlockMass LoopMass = BlockMass::getFull(); - DitheringDistributer D(Dist, LoopMass); - for (const Weight &W : Dist.Weights) { - BlockMass Taken = D.takeMass(W.Amount); - assert(W.Type == Weight::Local && "all weights should be local"); - Working[W.TargetNode.Index].getMass() = Taken; - LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); - } -} +//===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Loops should be simplified before this analysis. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ScaledNumber.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> +#include <list> +#include <numeric> +#include <utility> +#include <vector> + +using namespace llvm; +using namespace llvm::bfi_detail; + +#define DEBUG_TYPE "block-freq" + +cl::opt<bool> CheckBFIUnknownBlockQueries( + "check-bfi-unknown-block-queries", + cl::init(false), cl::Hidden, + cl::desc("Check if block frequency is queried for an unknown block " + "for debugging missed BFI updates")); + +ScaledNumber<uint64_t> BlockMass::toScaled() const { + if (isFull()) + return ScaledNumber<uint64_t>(1, 0); + return ScaledNumber<uint64_t>(getMass() + 1, -64); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); } +#endif + +static char getHexDigit(int N) { + assert(N < 16); + if (N < 10) + return '0' + N; + return 'a' + N - 10; +} + +raw_ostream &BlockMass::print(raw_ostream &OS) const { + for (int Digits = 0; Digits < 16; ++Digits) + OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf); + return OS; +} + +namespace { + +using BlockNode = BlockFrequencyInfoImplBase::BlockNode; +using Distribution = BlockFrequencyInfoImplBase::Distribution; +using WeightList = BlockFrequencyInfoImplBase::Distribution::WeightList; +using Scaled64 = BlockFrequencyInfoImplBase::Scaled64; +using LoopData = BlockFrequencyInfoImplBase::LoopData; +using Weight = BlockFrequencyInfoImplBase::Weight; +using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData; + +/// Dithering mass distributer. 
+/// +/// This class splits up a single mass into portions by weight, dithering to +/// spread out error. No mass is lost. The dithering precision depends on the +/// precision of the product of \a BlockMass and \a BranchProbability. +/// +/// The distribution algorithm follows. +/// +/// 1. Initialize by saving the sum of the weights in \a RemWeight and the +/// mass to distribute in \a RemMass. +/// +/// 2. For each portion: +/// +/// 1. Construct a branch probability, P, as the portion's weight divided +/// by the current value of \a RemWeight. +/// 2. Calculate the portion's mass as \a RemMass times P. +/// 3. Update \a RemWeight and \a RemMass at each portion by subtracting +/// the current portion's weight and mass. +struct DitheringDistributer { + uint32_t RemWeight; + BlockMass RemMass; + + DitheringDistributer(Distribution &Dist, const BlockMass &Mass); + + BlockMass takeMass(uint32_t Weight); +}; + +} // end anonymous namespace + +DitheringDistributer::DitheringDistributer(Distribution &Dist, + const BlockMass &Mass) { + Dist.normalize(); + RemWeight = Dist.Total; + RemMass = Mass; +} + +BlockMass DitheringDistributer::takeMass(uint32_t Weight) { + assert(Weight && "invalid weight"); + assert(Weight <= RemWeight); + BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); + + // Decrement totals (dither). + RemWeight -= Weight; + RemMass -= Mass; + return Mass; +} + +void Distribution::add(const BlockNode &Node, uint64_t Amount, + Weight::DistType Type) { + assert(Amount && "invalid weight of 0"); + uint64_t NewTotal = Total + Amount; + + // Check for overflow. It should be impossible to overflow twice. + bool IsOverflow = NewTotal < Total; + assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow"); + DidOverflow |= IsOverflow; + + // Update the total. + Total = NewTotal; + + // Save the weight. + Weights.push_back(Weight(Type, Node, Amount)); +} + +static void combineWeight(Weight &W, const Weight &OtherW) { + assert(OtherW.TargetNode.isValid()); + if (!W.Amount) { + W = OtherW; + return; + } + assert(W.Type == OtherW.Type); + assert(W.TargetNode == OtherW.TargetNode); + assert(OtherW.Amount && "Expected non-zero weight"); + if (W.Amount > W.Amount + OtherW.Amount) + // Saturate on overflow. + W.Amount = UINT64_MAX; + else + W.Amount += OtherW.Amount; +} + +static void combineWeightsBySorting(WeightList &Weights) { + // Sort so edges to the same node are adjacent. + llvm::sort(Weights, [](const Weight &L, const Weight &R) { + return L.TargetNode < R.TargetNode; + }); + + // Combine adjacent edges. + WeightList::iterator O = Weights.begin(); + for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; + ++O, (I = L)) { + *O = *I; + + // Find the adjacent weights to the same node. + for (++L; L != E && I->TargetNode == L->TargetNode; ++L) + combineWeight(*O, *L); + } + + // Erase extra entries. + Weights.erase(O, Weights.end()); +} + +static void combineWeightsByHashing(WeightList &Weights) { + // Collect weights into a DenseMap. + using HashTable = DenseMap<BlockNode::IndexType, Weight>; + + HashTable Combined(NextPowerOf2(2 * Weights.size())); + for (const Weight &W : Weights) + combineWeight(Combined[W.TargetNode.Index], W); + + // Check whether anything changed. + if (Weights.size() == Combined.size()) + return; + + // Fill in the new weights. 
+ Weights.clear(); + Weights.reserve(Combined.size()); + for (const auto &I : Combined) + Weights.push_back(I.second); +} + +static void combineWeights(WeightList &Weights) { + // Use a hash table for many successors to keep this linear. + if (Weights.size() > 128) { + combineWeightsByHashing(Weights); + return; + } + + combineWeightsBySorting(Weights); +} + +static uint64_t shiftRightAndRound(uint64_t N, int Shift) { + assert(Shift >= 0); + assert(Shift < 64); + if (!Shift) + return N; + return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); +} + +void Distribution::normalize() { + // Early exit for termination nodes. + if (Weights.empty()) + return; + + // Only bother if there are multiple successors. + if (Weights.size() > 1) + combineWeights(Weights); + + // Early exit when combined into a single successor. + if (Weights.size() == 1) { + Total = 1; + Weights.front().Amount = 1; + return; + } + + // Determine how much to shift right so that the total fits into 32-bits. + // + // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 + // for each weight can cause a 32-bit overflow. + int Shift = 0; + if (DidOverflow) + Shift = 33; + else if (Total > UINT32_MAX) + Shift = 33 - countLeadingZeros(Total); + + // Early exit if nothing needs to be scaled. + if (!Shift) { + // If we didn't overflow then combineWeights() shouldn't have changed the + // sum of the weights, but let's double-check. + assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0), + [](uint64_t Sum, const Weight &W) { + return Sum + W.Amount; + }) && + "Expected total to be correct"); + return; + } + + // Recompute the total through accumulation (rather than shifting it) so that + // it's accurate after shifting and any changes combineWeights() made above. + Total = 0; + + // Sum the weights to each node and shift right if necessary. + for (Weight &W : Weights) { + // Scale down below UINT32_MAX. Since Shift is larger than necessary, we + // can round here without concern about overflow. + assert(W.TargetNode.isValid()); + W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift)); + assert(W.Amount <= UINT32_MAX); + + // Update the total. + Total += W.Amount; + } + assert(Total <= UINT32_MAX); +} + +void BlockFrequencyInfoImplBase::clear() { + // Swap with a default-constructed std::vector, since std::vector<>::clear() + // does not actually clear heap storage. + std::vector<FrequencyData>().swap(Freqs); + IsIrrLoopHeader.clear(); + std::vector<WorkingData>().swap(Working); + Loops.clear(); +} + +/// Clear all memory not needed downstream. +/// +/// Releases all memory not used downstream. In particular, saves Freqs. 
+static void cleanup(BlockFrequencyInfoImplBase &BFI) { + std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs)); + SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader)); + BFI.clear(); + BFI.Freqs = std::move(SavedFreqs); + BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader); +} + +bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, + const LoopData *OuterLoop, + const BlockNode &Pred, + const BlockNode &Succ, + uint64_t Weight) { + if (!Weight) + Weight = 1; + + auto isLoopHeader = [&OuterLoop](const BlockNode &Node) { + return OuterLoop && OuterLoop->isHeader(Node); + }; + + BlockNode Resolved = Working[Succ.Index].getResolvedNode(); + +#ifndef NDEBUG + auto debugSuccessor = [&](const char *Type) { + dbgs() << " =>" + << " [" << Type << "] weight = " << Weight; + if (!isLoopHeader(Resolved)) + dbgs() << ", succ = " << getBlockName(Succ); + if (Resolved != Succ) + dbgs() << ", resolved = " << getBlockName(Resolved); + dbgs() << "\n"; + }; + (void)debugSuccessor; +#endif + + if (isLoopHeader(Resolved)) { + LLVM_DEBUG(debugSuccessor("backedge")); + Dist.addBackedge(Resolved, Weight); + return true; + } + + if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { + LLVM_DEBUG(debugSuccessor(" exit ")); + Dist.addExit(Resolved, Weight); + return true; + } + + if (Resolved < Pred) { + if (!isLoopHeader(Pred)) { + // If OuterLoop is an irreducible loop, we can't actually handle this. + assert((!OuterLoop || !OuterLoop->isIrreducible()) && + "unhandled irreducible control flow"); + + // Irreducible backedge. Abort. + LLVM_DEBUG(debugSuccessor("abort!!!")); + return false; + } + + // If "Pred" is a loop header, then this isn't really a backedge; rather, + // OuterLoop must be irreducible. These false backedges can come only from + // secondary loop headers. + assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && + "unhandled irreducible control flow"); + } + + LLVM_DEBUG(debugSuccessor(" local ")); + Dist.addLocal(Resolved, Weight); + return true; +} + +bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( + const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { + // Copy the exit map into Dist. + for (const auto &I : Loop.Exits) + if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, + I.second.getMass())) + // Irreducible backedge. + return false; + + return true; +} + +/// Compute the loop scale for a loop. +void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { + // Compute loop scale. + LLVM_DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); + + // Infinite loops need special handling. If we give the back edge an infinite + // mass, they may saturate all the other scales in the function down to 1, + // making all the other region temperatures look exactly the same. Choose an + // arbitrary scale to avoid these issues. + // + // FIXME: An alternate way would be to select a symbolic scale which is later + // replaced to be the maximum of all computed scales plus 1. This would + // appropriately describe the loop as having a large scale, without skewing + // the final frequency computation. + const Scaled64 InfiniteLoopScale(1, 12); + + // LoopScale == 1 / ExitMass + // ExitMass == HeadMass - BackedgeMass + BlockMass TotalBackedgeMass; + for (auto &Mass : Loop.BackedgeMass) + TotalBackedgeMass += Mass; + BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; + + // Block scale stores the inverse of the scale. If this is an infinite loop, + // its exit mass will be zero. 
In this case, use an arbitrary scale for the + // loop scale. + Loop.Scale = + ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse(); + + LLVM_DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" + << BlockMass::getFull() << " - " << TotalBackedgeMass + << ")\n" + << " - scale = " << Loop.Scale << "\n"); +} + +/// Package up a loop. +void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { + LLVM_DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n"); + + // Clear the subloop exits to prevent quadratic memory usage. + for (const BlockNode &M : Loop.Nodes) { + if (auto *Loop = Working[M.Index].getPackagedLoop()) + Loop->Exits.clear(); + LLVM_DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); + } + Loop.IsPackaged = true; +} + +#ifndef NDEBUG +static void debugAssign(const BlockFrequencyInfoImplBase &BFI, + const DitheringDistributer &D, const BlockNode &T, + const BlockMass &M, const char *Desc) { + dbgs() << " => assign " << M << " (" << D.RemMass << ")"; + if (Desc) + dbgs() << " [" << Desc << "]"; + if (T.isValid()) + dbgs() << " to " << BFI.getBlockName(T); + dbgs() << "\n"; +} +#endif + +void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, + LoopData *OuterLoop, + Distribution &Dist) { + BlockMass Mass = Working[Source.Index].getMass(); + LLVM_DEBUG(dbgs() << " => mass: " << Mass << "\n"); + + // Distribute mass to successors as laid out in Dist. + DitheringDistributer D(Dist, Mass); + + for (const Weight &W : Dist.Weights) { + // Check for a local edge (non-backedge and non-exit). + BlockMass Taken = D.takeMass(W.Amount); + if (W.Type == Weight::Local) { + Working[W.TargetNode.Index].getMass() += Taken; + LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); + continue; + } + + // Backedges and exits only make sense if we're processing a loop. + assert(OuterLoop && "backedge or exit outside of loop"); + + // Check for a backedge. + if (W.Type == Weight::Backedge) { + OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken; + LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back")); + continue; + } + + // This must be an exit. + assert(W.Type == Weight::Exit); + OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); + LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit")); + } +} + +static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, + const Scaled64 &Min, const Scaled64 &Max) { + // Scale the Factor to a size that creates integers. Ideally, integers would + // be scaled so that Max == UINT64_MAX so that they can be best + // differentiated. However, in the presence of large frequency values, small + // frequencies are scaled down to 1, making it impossible to differentiate + // small, unequal numbers. When the spread between Min and Max frequencies + // fits well within MaxBits, we make the scale be at least 8. + const unsigned MaxBits = 64; + const unsigned SpreadBits = (Max / Min).lg(); + Scaled64 ScalingFactor; + if (SpreadBits <= MaxBits - 3) { + // If the values are small enough, make the scaling factor at least 8 to + // allow distinguishing small values. + ScalingFactor = Min.inverse(); + ScalingFactor <<= 3; + } else { + // If the values need more than MaxBits to be represented, saturate small + // frequency values down to 1 by using a scaling factor that benefits large + // frequency values. + ScalingFactor = Scaled64(1, MaxBits) / Max; + } + + // Translate the floats to integers. 
+ LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max + << ", factor = " << ScalingFactor << "\n"); + for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { + Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor; + BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>()); + LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " + << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled + << ", int = " << BFI.Freqs[Index].Integer << "\n"); + } +} + +/// Unwrap a loop package. +/// +/// Visits all the members of a loop, adjusting their BlockData according to +/// the loop's pseudo-node. +static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { + LLVM_DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) + << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale + << "\n"); + Loop.Scale *= Loop.Mass.toScaled(); + Loop.IsPackaged = false; + LLVM_DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n"); + + // Propagate the head scale through the loop. Since members are visited in + // RPO, the head scale will be updated by the loop scale first, and then the + // final head scale will be used for updated the rest of the members. + for (const BlockNode &N : Loop.Nodes) { + const auto &Working = BFI.Working[N.Index]; + Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale + : BFI.Freqs[N.Index].Scaled; + Scaled64 New = Loop.Scale * F; + LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " + << New << "\n"); + F = New; + } +} + +void BlockFrequencyInfoImplBase::unwrapLoops() { + // Set initial frequencies from loop-local masses. + for (size_t Index = 0; Index < Working.size(); ++Index) + Freqs[Index].Scaled = Working[Index].Mass.toScaled(); + + for (LoopData &Loop : Loops) + unwrapLoop(*this, Loop); +} + +void BlockFrequencyInfoImplBase::finalizeMetrics() { + // Unwrap loop packages in reverse post-order, tracking min and max + // frequencies. + auto Min = Scaled64::getLargest(); + auto Max = Scaled64::getZero(); + for (size_t Index = 0; Index < Working.size(); ++Index) { + // Update min/max scale. + Min = std::min(Min, Freqs[Index].Scaled); + Max = std::max(Max, Freqs[Index].Scaled); + } + + // Convert to integers. + convertFloatingToInteger(*this, Min, Max); + + // Clean up data structures. + cleanup(*this); + + // Print out the final stats. + LLVM_DEBUG(dump()); +} + +BlockFrequency +BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { + if (!Node.isValid()) { +#ifndef NDEBUG + if (CheckBFIUnknownBlockQueries) { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "*** Detected BFI query for unknown block " << getBlockName(Node); + report_fatal_error(OS.str()); + } +#endif + return 0; + } + return Freqs[Node.Index].Integer; +} + +Optional<uint64_t> +BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F, + const BlockNode &Node, + bool AllowSynthetic) const { + return getProfileCountFromFreq(F, getBlockFreq(Node).getFrequency(), + AllowSynthetic); +} + +Optional<uint64_t> +BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F, + uint64_t Freq, + bool AllowSynthetic) const { + auto EntryCount = F.getEntryCount(AllowSynthetic); + if (!EntryCount) + return None; + // Use 128 bit APInt to do the arithmetic to avoid overflow. 
+ APInt BlockCount(128, EntryCount.getCount()); + APInt BlockFreq(128, Freq); + APInt EntryFreq(128, getEntryFreq()); + BlockCount *= BlockFreq; + // Rounded division of BlockCount by EntryFreq. Since EntryFreq is unsigned + // lshr by 1 gives EntryFreq/2. + BlockCount = (BlockCount + EntryFreq.lshr(1)).udiv(EntryFreq); + return BlockCount.getLimitedValue(); +} + +bool +BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) { + if (!Node.isValid()) + return false; + return IsIrrLoopHeader.test(Node.Index); +} + +Scaled64 +BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { + if (!Node.isValid()) + return Scaled64::getZero(); + return Freqs[Node.Index].Scaled; +} + +void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node, + uint64_t Freq) { + assert(Node.isValid() && "Expected valid node"); + assert(Node.Index < Freqs.size() && "Expected legal index"); + Freqs[Node.Index].Integer = Freq; +} + +std::string +BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { + return {}; +} + +std::string +BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { + return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*"); +} + +raw_ostream & +BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, + const BlockNode &Node) const { + return OS << getFloatingBlockFreq(Node); +} + +raw_ostream & +BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, + const BlockFrequency &Freq) const { + Scaled64 Block(Freq.getFrequency(), 0); + Scaled64 Entry(getEntryFreq(), 0); + + return OS << Block / Entry; +} + +void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { + Start = OuterLoop.getHeader(); + Nodes.reserve(OuterLoop.Nodes.size()); + for (auto N : OuterLoop.Nodes) + addNode(N); + indexNodes(); +} + +void IrreducibleGraph::addNodesInFunction() { + Start = 0; + for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) + if (!BFI.Working[Index].isPackaged()) + addNode(Index); + indexNodes(); +} + +void IrreducibleGraph::indexNodes() { + for (auto &I : Nodes) + Lookup[I.Node.Index] = &I; +} + +void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, + const BFIBase::LoopData *OuterLoop) { + if (OuterLoop && OuterLoop->isHeader(Succ)) + return; + auto L = Lookup.find(Succ.Index); + if (L == Lookup.end()) + return; + IrrNode &SuccIrr = *L->second; + Irr.Edges.push_back(&SuccIrr); + SuccIrr.Edges.push_front(&Irr); + ++SuccIrr.NumIn; +} + +namespace llvm { + +template <> struct GraphTraits<IrreducibleGraph> { + using GraphT = bfi_detail::IrreducibleGraph; + using NodeRef = const GraphT::IrrNode *; + using ChildIteratorType = GraphT::IrrNode::iterator; + + static NodeRef getEntryNode(const GraphT &G) { return G.StartIrr; } + static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); } +}; + +} // end namespace llvm + +/// Find extra irreducible headers. +/// +/// Find entry blocks and other blocks with backedges, which exist when \c G +/// contains irreducible sub-SCCs. +static void findIrreducibleHeaders( + const BlockFrequencyInfoImplBase &BFI, + const IrreducibleGraph &G, + const std::vector<const IrreducibleGraph::IrrNode *> &SCC, + LoopData::NodeList &Headers, LoopData::NodeList &Others) { + // Map from nodes in the SCC to whether it's an entry block. + SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC; + + // InSCC also acts the set of nodes in the graph. Seed it. 
+ for (const auto *I : SCC) + InSCC[I] = false; + + for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { + auto &Irr = *I->first; + for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { + if (InSCC.count(P)) + continue; + + // This is an entry block. + I->second = true; + Headers.push_back(Irr.Node); + LLVM_DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) + << "\n"); + break; + } + } + assert(Headers.size() >= 2 && + "Expected irreducible CFG; -loop-info is likely invalid"); + if (Headers.size() == InSCC.size()) { + // Every block is a header. + llvm::sort(Headers); + return; + } + + // Look for extra headers from irreducible sub-SCCs. + for (const auto &I : InSCC) { + // Entry blocks are already headers. + if (I.second) + continue; + + auto &Irr = *I.first; + for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { + // Skip forward edges. + if (P->Node < Irr.Node) + continue; + + // Skip predecessors from entry blocks. These can have inverted + // ordering. + if (InSCC.lookup(P)) + continue; + + // Store the extra header. + Headers.push_back(Irr.Node); + LLVM_DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) + << "\n"); + break; + } + if (Headers.back() == Irr.Node) + // Added this as a header. + continue; + + // This is not a header. + Others.push_back(Irr.Node); + LLVM_DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); + } + llvm::sort(Headers); + llvm::sort(Others); +} + +static void createIrreducibleLoop( + BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, + LoopData *OuterLoop, std::list<LoopData>::iterator Insert, + const std::vector<const IrreducibleGraph::IrrNode *> &SCC) { + // Translate the SCC into RPO. + LLVM_DEBUG(dbgs() << " - found-scc\n"); + + LoopData::NodeList Headers; + LoopData::NodeList Others; + findIrreducibleHeaders(BFI, G, SCC, Headers, Others); + + auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), + Headers.end(), Others.begin(), Others.end()); + + // Update loop hierarchy. + for (const auto &N : Loop->Nodes) + if (BFI.Working[N.Index].isLoopHeader()) + BFI.Working[N.Index].Loop->Parent = &*Loop; + else + BFI.Working[N.Index].Loop = &*Loop; +} + +iterator_range<std::list<LoopData>::iterator> +BlockFrequencyInfoImplBase::analyzeIrreducible( + const IrreducibleGraph &G, LoopData *OuterLoop, + std::list<LoopData>::iterator Insert) { + assert((OuterLoop == nullptr) == (Insert == Loops.begin())); + auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); + + for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { + if (I->size() < 2) + continue; + + // Translate the SCC into RPO. + createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); + } + + if (OuterLoop) + return make_range(std::next(Prev), Insert); + return make_range(Loops.begin(), Insert); +} + +void +BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { + OuterLoop.Exits.clear(); + for (auto &Mass : OuterLoop.BackedgeMass) + Mass = BlockMass::getEmpty(); + auto O = OuterLoop.Nodes.begin() + 1; + for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) + if (!Working[I->Index].isPackaged()) + *O++ = *I; + OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); +} + +void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { + assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); + + // Since the loop has more than one header block, the mass flowing back into + // each header will be different. 
Adjust the mass in each header loop to + // reflect the masses flowing through back edges. + // + // To do this, we distribute the initial mass using the backedge masses + // as weights for the distribution. + BlockMass LoopMass = BlockMass::getFull(); + Distribution Dist; + + LLVM_DEBUG(dbgs() << "adjust-loop-header-mass:\n"); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &HeaderNode = Loop.Nodes[H]; + auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)]; + LLVM_DEBUG(dbgs() << " - Add back edge mass for node " + << getBlockName(HeaderNode) << ": " << BackedgeMass + << "\n"); + if (BackedgeMass.getMass() > 0) + Dist.addLocal(HeaderNode, BackedgeMass.getMass()); + else + LLVM_DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n"); + } + + DitheringDistributer D(Dist, LoopMass); + + LLVM_DEBUG(dbgs() << " Distribute loop mass " << LoopMass + << " to headers using above weights\n"); + for (const Weight &W : Dist.Weights) { + BlockMass Taken = D.takeMass(W.Amount); + assert(W.Type == Weight::Local && "all weights should be local"); + Working[W.TargetNode.Index].getMass() = Taken; + LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); + } +} + +void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) { + BlockMass LoopMass = BlockMass::getFull(); + DitheringDistributer D(Dist, LoopMass); + for (const Weight &W : Dist.Weights) { + BlockMass Taken = D.takeMass(W.Amount); + assert(W.Type == Weight::Local && "all weights should be local"); + Working[W.TargetNode.Index].getMass() = Taken; + LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); + } +} |
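The DitheringDistributer comment quoted in the diff above describes how a block's mass is split across successor edges without losing any mass: each portion is the remaining mass scaled by weight / remaining-weight, and both remainders are decremented after every portion so rounding error is dithered forward. The sketch below is not taken from the patch and does not use LLVM's BlockMass or BranchProbability types; it models the mass as a plain 64-bit integer, relies on the GCC/Clang `__uint128_t` extension for the intermediate product, and the names `SimpleDitheringDistributer`, `scaleByFraction`, and the example weights {3, 1, 1} are invented for illustration only.

```cpp
// Minimal standalone sketch of the dithering mass distribution idea, assuming
// "mass" can be modeled as a uint64_t out of UINT64_MAX. Not LLVM's implementation.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Compute Mass * (Num / Den) without overflow; __uint128_t is a GCC/Clang extension.
static uint64_t scaleByFraction(uint64_t Mass, uint32_t Num, uint64_t Den) {
  assert(Den != 0 && Num <= Den);
  return static_cast<uint64_t>((static_cast<__uint128_t>(Mass) * Num) / Den);
}

struct SimpleDitheringDistributer {
  uint64_t RemMass;   // mass still to hand out
  uint64_t RemWeight; // sum of weights not yet taken

  SimpleDitheringDistributer(uint64_t Mass, const std::vector<uint32_t> &Weights)
      : RemMass(Mass), RemWeight(0) {
    for (uint32_t W : Weights)
      RemWeight += W;
  }

  // Take the portion for one weight. Subtracting the portion from RemMass makes
  // later portions absorb the rounding error, so the total always sums back to Mass.
  uint64_t takeMass(uint32_t Weight) {
    assert(Weight && Weight <= RemWeight);
    uint64_t Portion = scaleByFraction(RemMass, Weight, RemWeight);
    RemWeight -= Weight;
    RemMass -= Portion;
    return Portion;
  }
};

int main() {
  const uint64_t FullMass = UINT64_MAX;      // analogous to a "full" block mass
  std::vector<uint32_t> Weights = {3, 1, 1}; // e.g. three successor edges

  SimpleDitheringDistributer D(FullMass, Weights);
  uint64_t Distributed = 0;
  for (uint32_t W : Weights) {
    uint64_t Portion = D.takeMass(W);
    Distributed += Portion;
    std::cout << "weight " << W << " -> mass " << Portion << "\n";
  }

  // The final call sees Weight == RemWeight, so it receives exactly RemMass:
  // no mass is lost to rounding.
  assert(Distributed == FullMass);
  return 0;
}
```

Because the last weight always equals the remaining weight, the last portion is exactly the remaining mass; this is the same reason the real distributer can assert that the distributed masses of a block sum back to the block's original mass.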