| author | vitalyisaev <vitalyisaev@yandex-team.com> | 2023-06-29 10:00:50 +0300 |
| --- | --- | --- |
| committer | vitalyisaev <vitalyisaev@yandex-team.com> | 2023-06-29 10:00:50 +0300 |
| commit | 6ffe9e53658409f212834330e13564e4952558f6 (patch) | |
| tree | 85b1e00183517648b228aafa7c8fb07f5276f419 /contrib/libs/llvm14/tools/llvm-profgen | |
| parent | 726057070f9c5a91fc10fde0d5024913d10f1ab9 (diff) | |
| download | ydb-6ffe9e53658409f212834330e13564e4952558f6.tar.gz | |
YQ Connector: support managed ClickHouse
From dqrun one can now reach a connector instance running on the streaming test stand and extract data from a cloud ClickHouse instance.
Diffstat (limited to 'contrib/libs/llvm14/tools/llvm-profgen')
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/CSPreInliner.cpp | 285 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/CSPreInliner.h | 95 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/CallContext.h | 59 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/ErrorHandling.h | 56 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/PerfReader.cpp | 1222 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/PerfReader.h | 728 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/ProfileGenerator.cpp | 979 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/ProfileGenerator.h | 312 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/ProfiledBinary.cpp | 790 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/ProfiledBinary.h | 541 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/llvm-profgen.cpp | 164 |
| -rw-r--r-- | contrib/libs/llvm14/tools/llvm-profgen/ya.make | 80 |
12 files changed, 5311 insertions, 0 deletions
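Before the file-by-file diff: the vendored tool is driven from the command line (per the upstream LLVM command guide, e.g. `llvm-profgen --binary=./app --perfscript=perf.script --output=app.prof`). The pre-inliner implemented in CSPreInliner.cpp below processes inline candidates through a hotness-ordered priority queue. The following is a minimal, self-contained C++ sketch of that ordering using simplified stand-in types (Candidate and Comparer are illustrative names, not the LLVM classes), mirroring ProfiledCandidateComparer from CSPreInliner.h below.

#include <cstdint>
#include <cstdio>
#include <queue>
#include <vector>

// Simplified stand-ins holding only the fields the comparator reads;
// illustrative types, not the LLVM classes.
struct Candidate {
  uint64_t CallsiteCount; // call site hotness
  uint64_t SizeCost;      // callee size proxy
  uint64_t GUID;          // deterministic tie breaker
};

// Same ordering as ProfiledCandidateComparer: hotter call sites pop first;
// among equally hot sites, smaller callees pop first; GUID breaks ties.
struct Comparer {
  bool operator()(const Candidate &L, const Candidate &R) const {
    if (L.CallsiteCount != R.CallsiteCount)
      return L.CallsiteCount < R.CallsiteCount;
    if (L.SizeCost != R.SizeCost)
      return L.SizeCost > R.SizeCost;
    return L.GUID < R.GUID;
  }
};

int main() {
  std::priority_queue<Candidate, std::vector<Candidate>, Comparer> Queue;
  Queue.push({500, 800, 1});   // hot, mid-sized callee
  Queue.push({500, 100, 2});   // equally hot, smaller callee
  Queue.push({9000, 4000, 3}); // hottest call site
  while (!Queue.empty()) {
    // Prints GUIDs in pop order: 3, 2, 1.
    std::printf("GUID %llu\n", (unsigned long long)Queue.top().GUID);
    Queue.pop();
  }
  return 0;
}

Ordering by call-site count first means hot call sites are considered before the size budget is exhausted, and the GUID tie breaker keeps inline decisions deterministic across runs.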
diff --git a/contrib/libs/llvm14/tools/llvm-profgen/CSPreInliner.cpp b/contrib/libs/llvm14/tools/llvm-profgen/CSPreInliner.cpp new file mode 100644 index 0000000000..1e64263990 --- /dev/null +++ b/contrib/libs/llvm14/tools/llvm-profgen/CSPreInliner.cpp @@ -0,0 +1,285 @@ +//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CSPreInliner.h" +#include "ProfiledBinary.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include <cstdint> +#include <queue> + +#define DEBUG_TYPE "cs-preinliner" + +using namespace llvm; +using namespace sampleprof; + +STATISTIC(PreInlNumCSInlined, + "Number of functions inlined with context sensitive profile"); +STATISTIC(PreInlNumCSNotInlined, + "Number of functions not inlined with context sensitive profile"); +STATISTIC(PreInlNumCSInlinedHitMinLimit, + "Number of functions with FDO inline stopped due to min size limit"); +STATISTIC(PreInlNumCSInlinedHitMaxLimit, + "Number of functions with FDO inline stopped due to max size limit"); +STATISTIC( + PreInlNumCSInlinedHitGrowthLimit, + "Number of functions with FDO inline stopped due to growth size limit"); + +// The switches specify inline thresholds used in SampleProfileLoader inlining. +// TODO: the actual threshold to be tuned here because the size here is based +// on machine code not LLVM IR. +extern cl::opt<int> SampleHotCallSiteThreshold; +extern cl::opt<int> SampleColdCallSiteThreshold; +extern cl::opt<int> ProfileInlineGrowthLimit; +extern cl::opt<int> ProfileInlineLimitMin; +extern cl::opt<int> ProfileInlineLimitMax; +extern cl::opt<bool> SortProfiledSCC; + +cl::opt<bool> EnableCSPreInliner( + "csspgo-preinliner", cl::Hidden, cl::init(true), + cl::desc("Run a global pre-inliner to merge context profile based on " + "estimated global top-down inline decisions")); + +cl::opt<bool> UseContextCostForPreInliner( + "use-context-cost-for-preinliner", cl::Hidden, cl::init(true), + cl::desc("Use context-sensitive byte size cost for preinliner decisions")); + +static cl::opt<bool> SamplePreInlineReplay( + "csspgo-replay-preinline", cl::Hidden, cl::init(false), + cl::desc( + "Replay previous inlining and adjust context profile accordingly")); + +CSPreInliner::CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary, + uint64_t HotThreshold, uint64_t ColdThreshold) + : UseContextCost(UseContextCostForPreInliner), + // TODO: Pass in a guid-to-name map in order for + // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes + // as their profile context. + ContextTracker(Profiles, nullptr), ProfileMap(Profiles), Binary(Binary), + HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) { + // Set default preinliner hot/cold call site threshold tuned with CSSPGO. + // for good performance with reasonable profile size. 
+ if (!SampleHotCallSiteThreshold.getNumOccurrences()) + SampleHotCallSiteThreshold = 1500; + if (!SampleColdCallSiteThreshold.getNumOccurrences()) + SampleColdCallSiteThreshold = 0; +} + +std::vector<StringRef> CSPreInliner::buildTopDownOrder() { + std::vector<StringRef> Order; + ProfiledCallGraph ProfiledCG(ContextTracker); + + // Now that we have a profiled call graph, construct top-down order + // by building up SCC and reversing SCC order. + scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG); + while (!I.isAtEnd()) { + auto Range = *I; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator<ProfiledCallGraph *> SI(*I); + Range = *SI; + } + for (auto *Node : Range) { + if (Node != ProfiledCG.getEntryNode()) + Order.push_back(Node->Name); + } + ++I; + } + std::reverse(Order.begin(), Order.end()); + + return Order; +} + +bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *CallerSamples) { + assert(CallerSamples && "Expect non-null caller samples"); + + // Ideally we want to consider everything a function calls, but as far as + // context profile is concerned, only those frames that are children of + // current one in the trie is relavent. So we walk the trie instead of call + // targets from function profile. + ContextTrieNode *CallerNode = + ContextTracker.getContextFor(CallerSamples->getContext()); + + bool HasNewCandidate = false; + for (auto &Child : CallerNode->getAllChildContext()) { + ContextTrieNode *CalleeNode = &Child.second; + FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); + if (!CalleeSamples) + continue; + + // Call site count is more reliable, so we look up the corresponding call + // target profile in caller's context profile to retrieve call site count. + uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples(); + uint64_t CallsiteCount = 0; + LineLocation Callsite = CalleeNode->getCallSiteLoc(); + if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { + SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); + auto It = TargetCounts.find(CalleeSamples->getName()); + if (It != TargetCounts.end()) + CallsiteCount = It->second; + } + + // TODO: call site and callee entry count should be mostly consistent, add + // check for that. + HasNewCandidate = true; + uint32_t CalleeSize = getFuncSize(*CalleeSamples); + CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), + CalleeSize); + } + + return HasNewCandidate; +} + +uint32_t CSPreInliner::getFuncSize(const FunctionSamples &FSamples) { + if (UseContextCost) { + return Binary.getFuncSizeForContext(FSamples.getContext()); + } + + return FSamples.getBodySamples().size(); +} + +bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { + // If replay inline is requested, simply follow the inline decision of the + // profiled binary. + if (SamplePreInlineReplay) + return Candidate.CalleeSamples->getContext().hasAttribute( + ContextWasInlined); + + // Adjust threshold based on call site hotness, only do this for callsite + // prioritized inliner because otherwise cost-benefit check is done earlier. + unsigned int SampleThreshold = SampleColdCallSiteThreshold; + if (Candidate.CallsiteCount > HotCountThreshold) + SampleThreshold = SampleHotCallSiteThreshold; + + // TODO: for small cold functions, we may inlined them and we need to keep + // context profile accordingly. 
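+  // Worked example (illustrative): with the CSSPGO-tuned defaults installed
+  // in the constructor above (SampleHotCallSiteThreshold = 1500,
+  // SampleColdCallSiteThreshold = 0), a candidate whose CallsiteCount is
+  // above HotCountThreshold is inlined only if its SizeCost stays under
+  // 1500, while a candidate below ColdCountThreshold is checked against 0
+  // and is therefore never inlined by size.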
+ if (Candidate.CallsiteCount < ColdCountThreshold) + SampleThreshold = SampleColdCallSiteThreshold; + + return (Candidate.SizeCost < SampleThreshold); +} + +void CSPreInliner::processFunction(const StringRef Name) { + FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); + if (!FSamples) + return; + + unsigned FuncSize = getFuncSize(*FSamples); + unsigned FuncFinalSize = FuncSize; + unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + + LLVM_DEBUG(dbgs() << "Process " << Name + << " for context-sensitive pre-inlining (pre-inline size: " + << FuncSize << ", size limit: " << SizeLimit << ")\n"); + + ProfiledCandidateQueue CQueue; + getInlineCandidates(CQueue, FSamples); + + while (!CQueue.empty() && FuncFinalSize < SizeLimit) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool ShouldInline = false; + if ((ShouldInline = shouldInline(Candidate))) { + // We mark context as inlined as the corresponding context profile + // won't be merged into that function's base profile. + ++PreInlNumCSInlined; + ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); + Candidate.CalleeSamples->getContext().setAttribute( + ContextShouldBeInlined); + FuncFinalSize += Candidate.SizeCost; + getInlineCandidates(CQueue, Candidate.CalleeSamples); + } else { + ++PreInlNumCSNotInlined; + } + LLVM_DEBUG(dbgs() << (ShouldInline ? " Inlined" : " Outlined") + << " context profile for: " + << Candidate.CalleeSamples->getContext().toString() + << " (callee size: " << Candidate.SizeCost + << ", call count:" << Candidate.CallsiteCount << ")\n"); + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++PreInlNumCSInlinedHitMaxLimit; + else if (SizeLimit == (unsigned)ProfileInlineLimitMin) + ++PreInlNumCSInlinedHitMinLimit; + else + ++PreInlNumCSInlinedHitGrowthLimit; + } + + LLVM_DEBUG({ + if (!CQueue.empty()) + dbgs() << " Inline candidates ignored due to size limit (inliner " + "original size: " + << FuncSize << ", inliner final size: " << FuncFinalSize + << ", size limit: " << SizeLimit << ")\n"; + + while (!CQueue.empty()) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + dbgs() << " " << Candidate.CalleeSamples->getContext().toString() + << " (candidate size:" << Candidate.SizeCost + << ", call count: " << Candidate.CallsiteCount << ", previously " + << (WasInlined ? "inlined)\n" : "not inlined)\n"); + } + }); +} + +void CSPreInliner::run() { +#ifndef NDEBUG + auto printProfileNames = [](SampleProfileMap &Profiles, bool IsInput) { + dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" + << Profiles.size() << " total):\n"; + for (auto &It : Profiles) { + const FunctionSamples &Samples = It.second; + dbgs() << " [" << Samples.getContext().toString() << "] " + << Samples.getTotalSamples() << ":" << Samples.getHeadSamples() + << "\n"; + } + }; +#endif + + LLVM_DEBUG(printProfileNames(ProfileMap, true)); + + // Execute global pre-inliner to estimate a global top-down inline + // decision and merge profiles accordingly. This helps with profile + // merge for ThinLTO otherwise we won't be able to merge profiles back + // to base profile across module/thin-backend boundaries. 
+ // It also helps better compress context profile to control profile + // size, as we now only need context profile for functions going to + // be inlined. + for (StringRef FuncName : buildTopDownOrder()) { + processFunction(FuncName); + } + + // Not inlined context profiles are merged into its base, so we can + // trim out such profiles from the output. + std::vector<SampleContext> ProfilesToBeRemoved; + for (auto &It : ProfileMap) { + SampleContext &Context = It.second.getContext(); + if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) { + assert(Context.hasState(MergedContext) && + "Not inlined context profile should be merged already"); + ProfilesToBeRemoved.push_back(It.first); + } + } + + for (auto &ContextName : ProfilesToBeRemoved) { + ProfileMap.erase(ContextName); + } + + // Make sure ProfileMap's key is consistent with FunctionSamples' name. + SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles(); + + LLVM_DEBUG(printProfileNames(ProfileMap, false)); +} diff --git a/contrib/libs/llvm14/tools/llvm-profgen/CSPreInliner.h b/contrib/libs/llvm14/tools/llvm-profgen/CSPreInliner.h new file mode 100644 index 0000000000..9f63f7ef7b --- /dev/null +++ b/contrib/libs/llvm14/tools/llvm-profgen/CSPreInliner.h @@ -0,0 +1,95 @@ +//===-- CSPreInliner.h - Profile guided preinliner ---------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H +#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H + +#include "ProfiledBinary.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Transforms/IPO/ProfiledCallGraph.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Inline candidate seen from profile +struct ProfiledInlineCandidate { + ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count, + uint32_t Size) + : CalleeSamples(Samples), CallsiteCount(Count), SizeCost(Size) {} + // Context-sensitive function profile for inline candidate + const FunctionSamples *CalleeSamples; + // Call site count for an inline candidate + // TODO: make sure entry count for context profile and call site + // target count for corresponding call are consistent. + uint64_t CallsiteCount; + // Size proxy for function under particular call context. 
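+  // Illustrative note: when UseContextCost is enabled this is the
+  // context-sensitive machine-code byte size queried from ProfiledBinary;
+  // otherwise CSPreInliner::getFuncSize falls back to the number of body
+  // samples in the profile.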
+ uint64_t SizeCost; +}; + +// Inline candidate comparer using call site weight +struct ProfiledCandidateComparer { + bool operator()(const ProfiledInlineCandidate &LHS, + const ProfiledInlineCandidate &RHS) { + if (LHS.CallsiteCount != RHS.CallsiteCount) + return LHS.CallsiteCount < RHS.CallsiteCount; + + if (LHS.SizeCost != RHS.SizeCost) + return LHS.SizeCost > RHS.SizeCost; + + // Tie breaker using GUID so we have stable/deterministic inlining order + assert(LHS.CalleeSamples && RHS.CalleeSamples && + "Expect non-null FunctionSamples"); + return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) < + RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName()); + } +}; + +using ProfiledCandidateQueue = + PriorityQueue<ProfiledInlineCandidate, std::vector<ProfiledInlineCandidate>, + ProfiledCandidateComparer>; + +// Pre-compilation inliner based on context-sensitive profile. +// The PreInliner estimates inline decision using hotness from profile +// and cost estimation from machine code size. It helps merges context +// profile globally and achieves better post-inine profile quality, which +// otherwise won't be possible for ThinLTO. It also reduce context profile +// size by only keep context that is estimated to be inlined. +class CSPreInliner { +public: + CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary, + uint64_t HotThreshold, uint64_t ColdThreshold); + void run(); + +private: + bool getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *FCallerContextSamples); + std::vector<StringRef> buildTopDownOrder(); + void processFunction(StringRef Name); + bool shouldInline(ProfiledInlineCandidate &Candidate); + uint32_t getFuncSize(const FunctionSamples &FSamples); + bool UseContextCost; + SampleContextTracker ContextTracker; + SampleProfileMap &ProfileMap; + ProfiledBinary &Binary; + + // Count thresholds to answer isHotCount and isColdCount queries. + // Mirrors the threshold in ProfileSummaryInfo. + uint64_t HotCountThreshold; + uint64_t ColdCountThreshold; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/contrib/libs/llvm14/tools/llvm-profgen/CallContext.h b/contrib/libs/llvm14/tools/llvm-profgen/CallContext.h new file mode 100644 index 0000000000..5e552130d0 --- /dev/null +++ b/contrib/libs/llvm14/tools/llvm-profgen/CallContext.h @@ -0,0 +1,59 @@ +//===-- CallContext.h - Call Context Handler ---------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H +#define LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H + +#include "llvm/ProfileData/SampleProf.h" +#include <sstream> +#include <string> +#include <vector> + +namespace llvm { +namespace sampleprof { + +inline std::string getCallSite(const SampleContextFrame &Callsite) { + std::string CallsiteStr = Callsite.FuncName.str(); + CallsiteStr += ":"; + CallsiteStr += Twine(Callsite.Location.LineOffset).str(); + if (Callsite.Location.Discriminator > 0) { + CallsiteStr += "."; + CallsiteStr += Twine(Callsite.Location.Discriminator).str(); + } + return CallsiteStr; +} + +// TODO: This operation is expansive. If it ever gets called multiple times we +// may think of making a class wrapper with internal states for it. 
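+// Worked example (illustrative): for a context vector {main:3, foo:2} this
+// yields "main:3 @ foo:2", and getReversedLocWithContext below yields
+// "foo:2 @ main:3"; a non-zero discriminator is appended as ".<disc>",
+// e.g. "foo:2.1".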
+inline std::string getLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : Context) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +// Reverse call context, i.e., in the order of callee frames to caller frames, +// is useful during instruction printing or pseudo probe printing. +inline std::string +getReversedLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : reverse(Context)) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/contrib/libs/llvm14/tools/llvm-profgen/ErrorHandling.h b/contrib/libs/llvm14/tools/llvm-profgen/ErrorHandling.h new file mode 100644 index 0000000000..b797add8a8 --- /dev/null +++ b/contrib/libs/llvm14/tools/llvm-profgen/ErrorHandling.h @@ -0,0 +1,56 @@ +//===-- ErrorHandling.h - Error handler -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H +#define LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H + +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/WithColor.h" +#include <system_error> + +using namespace llvm; + +[[noreturn]] inline void exitWithError(const Twine &Message, + StringRef Whence = StringRef(), + StringRef Hint = StringRef()) { + WithColor::error(errs(), "llvm-profgen"); + if (!Whence.empty()) + errs() << Whence.str() << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint.str() << "\n"; + ::exit(EXIT_FAILURE); +} + +[[noreturn]] inline void exitWithError(std::error_code EC, + StringRef Whence = StringRef()) { + exitWithError(EC.message(), Whence); +} + +[[noreturn]] inline void exitWithError(Error E, StringRef Whence) { + exitWithError(errorToErrorCode(std::move(E)), Whence); +} + +template <typename T, typename... Ts> +T unwrapOrError(Expected<T> EO, Ts &&... Args) { + if (EO) + return std::move(*EO); + exitWithError(EO.takeError(), std::forward<Ts>(Args)...); +} + +inline void emitWarningSummary(uint64_t Num, uint64_t Total, StringRef Msg) { + if (!Total || !Num) + return; + WithColor::warning() << format("%.2f", static_cast<double>(Num) * 100 / Total) + << "%(" << Num << "/" << Total << ") " << Msg << "\n"; +} + +#endif diff --git a/contrib/libs/llvm14/tools/llvm-profgen/PerfReader.cpp b/contrib/libs/llvm14/tools/llvm-profgen/PerfReader.cpp new file mode 100644 index 0000000000..98b4c7cdf1 --- /dev/null +++ b/contrib/libs/llvm14/tools/llvm-profgen/PerfReader.cpp @@ -0,0 +1,1222 @@ +//===-- PerfReader.cpp - perfscript reader ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "PerfReader.h" +#include "ProfileGenerator.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Process.h" + +#define DEBUG_TYPE "perf-reader" + +cl::opt<bool> SkipSymbolization("skip-symbolization", cl::init(false), + cl::ZeroOrMore, + cl::desc("Dump the unsymbolized profile to the " + "output file. It will show unwinder " + "output for CS profile generation.")); + +static cl::opt<bool> ShowMmapEvents("show-mmap-events", cl::init(false), + cl::ZeroOrMore, + cl::desc("Print binary load events.")); + +static cl::opt<bool> + UseOffset("use-offset", cl::init(true), cl::ZeroOrMore, + cl::desc("Work with `--skip-symbolization` or " + "`--unsymbolized-profile` to write/read the " + "offset instead of virtual address.")); + +static cl::opt<bool> UseLoadableSegmentAsBase( + "use-first-loadable-segment-as-base", cl::init(false), cl::ZeroOrMore, + cl::desc("Use first loadable segment address as base address " + "for offsets in unsymbolized profile. By default " + "first executable segment address is used")); + +static cl::opt<bool> + IgnoreStackSamples("ignore-stack-samples", cl::init(false), cl::ZeroOrMore, + cl::desc("Ignore call stack samples for hybrid samples " + "and produce context-insensitive profile.")); +cl::opt<bool> ShowDetailedWarning("show-detailed-warning", cl::init(false), + cl::ZeroOrMore, + cl::desc("Show detailed warning message.")); + +extern cl::opt<std::string> PerfTraceFilename; +extern cl::opt<bool> ShowDisassemblyOnly; +extern cl::opt<bool> ShowSourceLocations; +extern cl::opt<std::string> OutputFilename; + +namespace llvm { +namespace sampleprof { + +void VirtualUnwinder::unwindCall(UnwindState &State) { + uint64_t Source = State.getCurrentLBRSource(); + // An artificial return should push an external frame and an artificial call + // will match it and pop the external frame so that the context before and + // after the external call will be the same. + if (State.getCurrentLBR().IsArtificial) { + NumExtCallBranch++; + // A return is matched and pop the external frame. + if (State.getParentFrame()->isExternalFrame()) { + State.popFrame(); + } else { + // An artificial return is missing, it happens that the sample is just hit + // in the middle of the external code. In this case, the leading branch is + // a call to external, we just keep unwinding use a context-less stack. + if (State.getParentFrame() != State.getDummyRootPtr()) + NumMissingExternalFrame++; + State.clearCallStack(); + State.pushFrame(Source); + State.InstPtr.update(Source); + return; + } + } + + auto *ParentFrame = State.getParentFrame(); + // The 2nd frame after leaf could be missing if stack sample is + // taken when IP is within prolog/epilog, as frame chain isn't + // setup yet. Fill in the missing frame in that case. + // TODO: Currently we just assume all the addr that can't match the + // 2nd frame is in prolog/epilog. In the future, we will switch to + // pro/epi tracker(Dwarf CFI) for the precise check. 
+ if (ParentFrame == State.getDummyRootPtr() || + ParentFrame->Address != Source) { + State.switchToFrame(Source); + if (ParentFrame != State.getDummyRootPtr()) { + if (State.getCurrentLBR().IsArtificial) + NumMismatchedExtCallBranch++; + else + NumMismatchedProEpiBranch++; + } + } else { + State.popFrame(); + } + State.InstPtr.update(Source); +} + +void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) { + InstructionPointer &IP = State.InstPtr; + uint64_t Target = State.getCurrentLBRTarget(); + uint64_t End = IP.Address; + if (Binary->usePseudoProbes()) { + // We don't need to top frame probe since it should be extracted + // from the range. + // The outcome of the virtual unwinding with pseudo probes is a + // map from a context key to the address range being unwound. + // This means basically linear unwinding is not needed for pseudo + // probes. The range will be simply recorded here and will be + // converted to a list of pseudo probes to report in ProfileGenerator. + State.getParentFrame()->recordRangeCount(Target, End, Repeat); + } else { + // Unwind linear execution part. + // Split and record the range by different inline context. For example: + // [0x01] ... main:1 # Target + // [0x02] ... main:2 + // [0x03] ... main:3 @ foo:1 + // [0x04] ... main:3 @ foo:2 + // [0x05] ... main:3 @ foo:3 + // [0x06] ... main:4 + // [0x07] ... main:5 # End + // It will be recorded: + // [main:*] : [0x06, 0x07], [0x01, 0x02] + // [main:3 @ foo:*] : [0x03, 0x05] + while (IP.Address > Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee) { + State.switchToFrame(PrevIP); + State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat); + End = IP.Address; + } + } + assert(IP.Address == Target && "The last one must be the target address."); + // Record the remaining range, [0x01, 0x02] in the example + State.switchToFrame(IP.Address); + State.CurrentLeafFrame->recordRangeCount(IP.Address, End, Repeat); + } +} + +void VirtualUnwinder::unwindReturn(UnwindState &State) { + // Add extra frame as we unwind through the return + const LBREntry &LBR = State.getCurrentLBR(); + uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); + State.switchToFrame(CallAddr); + // Push an external frame for the case of returning to external + // address(callback), later if an aitificial call is matched and it will be + // popped up. This is to 1)avoid context being interrupted by callback, + // context before or after the callback should be the same. 2) the call stack + // of function called by callback should be truncated which is done during + // recording the context on trie. For example: + // main (call)--> foo (call)--> callback (call)--> bar (return)--> callback + // (return)--> foo (return)--> main + // Context for bar should not include main and foo. + // For the code of foo, the context of before and after callback should both + // be [foo, main]. + if (LBR.IsArtificial) + State.pushFrame(ExternalAddr); + State.pushFrame(LBR.Source); + State.InstPtr.update(LBR.Source); +} + +void VirtualUnwinder::unwindBranch(UnwindState &State) { + // TODO: Tolerate tail call for now, as we may see tail call from libraries. + // This is only for intra function branches, excluding tail calls. 
+ uint64_t Source = State.getCurrentLBRSource(); + State.switchToFrame(Source); + State.InstPtr.update(Source); +} + +std::shared_ptr<StringBasedCtxKey> FrameStack::getContextKey() { + std::shared_ptr<StringBasedCtxKey> KeyStr = + std::make_shared<StringBasedCtxKey>(); + KeyStr->Context = Binary->getExpandedContext(Stack, KeyStr->WasLeafInlined); + if (KeyStr->Context.empty()) + return nullptr; + return KeyStr; +} + +std::shared_ptr<ProbeBasedCtxKey> ProbeStack::getContextKey() { + std::shared_ptr<ProbeBasedCtxKey> ProbeBasedKey = + std::make_shared<ProbeBasedCtxKey>(); + for (auto CallProbe : Stack) { + ProbeBasedKey->Probes.emplace_back(CallProbe); + } + CSProfileGenerator::compressRecursionContext<const MCDecodedPseudoProbe *>( + ProbeBasedKey->Probes); + CSProfileGenerator::trimContext<const MCDecodedPseudoProbe *>( + ProbeBasedKey->Probes); + return ProbeBasedKey; +} + +template <typename T> +void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, + T &Stack) { + if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty()) + return; + + std::shared_ptr<ContextKey> Key = Stack.getContextKey(); + if (Key == nullptr) + return; + auto Ret = CtxCounterMap->emplace(Hashable<ContextKey>(Key), SampleCounter()); + SampleCounter &SCounter = Ret.first->second; + for (auto &Item : Cur->RangeSamples) { + uint64_t StartOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t EndOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordRangeCount(StartOffset, EndOffset, std::get<2>(Item)); + } + + for (auto &Item : Cur->BranchSamples) { + uint64_t SourceOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t TargetOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordBranchCount(SourceOffset, TargetOffset, std::get<2>(Item)); + } +} + +template <typename T> +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur, T &Stack) { + if (!Cur->isDummyRoot()) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see. + if (Cur->isExternalFrame() || !Stack.pushFrame(Cur)) { + // Process truncated context + // Start a new traversal ignoring its bottom context + T EmptyStack(Binary); + collectSamplesFromFrame(Cur, EmptyStack); + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), EmptyStack); + } + + // Keep note of untracked call site and deduplicate them + // for warning later. 
+ if (!Cur->isLeafFrame()) + UntrackedCallsites.insert(Cur->Address); + + return; + } + } + + collectSamplesFromFrame(Cur, Stack); + // Process children frame + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), Stack); + } + // Recover the call stack + Stack.popFrame(); +} + +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur) { + if (Binary->usePseudoProbes()) { + ProbeStack Stack(Binary); + collectSamplesFromFrameTrie<ProbeStack>(Cur, Stack); + } else { + FrameStack Stack(Binary); + collectSamplesFromFrameTrie<FrameStack>(Cur, Stack); + } +} + +void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, + UnwindState &State, uint64_t Repeat) { + if (Branch.IsArtificial || Branch.Target == ExternalAddr) + return; + + if (Binary->usePseudoProbes()) { + // Same as recordRangeCount, We don't need to top frame probe since we will + // extract it from branch's source address + State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } else { + State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } +} + +bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) { + // Capture initial state as starting point for unwinding. + UnwindState State(Sample, Binary); + + // Sanity check - making sure leaf of LBR aligns with leaf of stack sample + // Stack sample sometimes can be unreliable, so filter out bogus ones. + if (!State.validateInitialState()) + return false; + + // Now process the LBR samples in parrallel with stack sample + // Note that we do not reverse the LBR entry order so we can + // unwind the sample stack as we walk through LBR entries. + while (State.hasNextLBR()) { + State.checkStateConsistency(); + + // Do not attempt linear unwind for the leaf range as it's incomplete. + if (!State.IsLastLBR()) { + // Unwind implicit calls/returns from inlining, along the linear path, + // break into smaller sub section each with its own calling context. + unwindLinear(State, Repeat); + } + + // Save the LBR branch before it gets unwound. + const LBREntry &Branch = State.getCurrentLBR(); + + if (isCallState(State)) { + // Unwind calls - we know we encountered call if LBR overlaps with + // transition between leaf the 2nd frame. Note that for calls that + // were not in the original stack sample, we should have added the + // extra frame when processing the return paired with this call. + unwindCall(State); + } else if (isReturnState(State)) { + // Unwind returns - check whether the IP is indeed at a return instruction + unwindReturn(State); + } else { + // Unwind branches + // For regular intra function branches, we only need to record branch with + // context. For an artificial branch cross function boundaries, we got an + // issue with returning to external code. Take the two LBR enties for + // example: [foo:8(RETURN), ext:1] [ext:3(CALL), bar:1] After perf reader, + // we only get[foo:8(RETURN), bar:1], unwinder will be confused like foo + // return to bar. Here we detect and treat this case as BRANCH instead of + // RETURN which only update the source address. + unwindBranch(State); + } + State.advanceLBR(); + // Record `branch` with calling context after unwinding. 
+ recordBranchCount(Branch, State, Repeat); + } + // As samples are aggregated on trie, record them into counter map + collectSamplesFromFrameTrie(State.getDummyRootPtr()); + + return true; +} + +std::unique_ptr<PerfReaderBase> +PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput) { + std::unique_ptr<PerfReaderBase> PerfReader; + + if (PerfInput.Format == PerfFormat::UnsymbolizedProfile) { + PerfReader.reset( + new UnsymbolizedProfileReader(Binary, PerfInput.InputFile)); + return PerfReader; + } + + // For perf data input, we need to convert them into perf script first. + if (PerfInput.Format == PerfFormat::PerfData) + PerfInput = PerfScriptReader::convertPerfDataToTrace(Binary, PerfInput); + + assert((PerfInput.Format == PerfFormat::PerfScript) && + "Should be a perfscript!"); + + PerfInput.Content = + PerfScriptReader::checkPerfScriptType(PerfInput.InputFile); + if (PerfInput.Content == PerfContent::LBRStack) { + PerfReader.reset(new HybridPerfReader(Binary, PerfInput.InputFile)); + } else if (PerfInput.Content == PerfContent::LBR) { + PerfReader.reset(new LBRPerfReader(Binary, PerfInput.InputFile)); + } else { + exitWithError("Unsupported perfscript!"); + } + + return PerfReader; +} + +PerfInputFile PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, + PerfInputFile &File) { + StringRef PerfData = File.InputFile; + // Run perf script to retrieve PIDs matching binary we're interested in. + auto PerfExecutable = sys::Process::FindInEnvPath("PATH", "perf"); + if (!PerfExecutable) { + exitWithError("Perf not found."); + } + std::string PerfPath = *PerfExecutable; + std::string PerfTraceFile = PerfData.str() + ".script.tmp"; + StringRef ScriptMMapArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "comm,pid", "-i", + PerfData}; + Optional<StringRef> Redirects[] = {llvm::None, // Stdin + StringRef(PerfTraceFile), // Stdout + StringRef(PerfTraceFile)}; // Stderr + sys::ExecuteAndWait(PerfPath, ScriptMMapArgs, llvm::None, Redirects); + + // Collect the PIDs + TraceStream TraceIt(PerfTraceFile); + std::string PIDs; + std::unordered_set<uint32_t> PIDSet; + while (!TraceIt.isAtEoF()) { + MMapEvent MMap; + if (isMMap2Event(TraceIt.getCurrentLine()) && + extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) { + auto It = PIDSet.emplace(MMap.PID); + if (It.second) { + if (!PIDs.empty()) { + PIDs.append(","); + } + PIDs.append(utostr(MMap.PID)); + } + } + TraceIt.advance(); + } + + if (PIDs.empty()) { + exitWithError("No relevant mmap event is found in perf data."); + } + + // Run perf script again to retrieve events for PIDs collected above + StringRef ScriptSampleArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "ip,brstack", "--pid", + PIDs, "-i", PerfData}; + sys::ExecuteAndWait(PerfPath, ScriptSampleArgs, llvm::None, Redirects); + + return {PerfTraceFile, PerfFormat::PerfScript, PerfContent::UnknownContent}; +} + +void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) { + // Drop the event which doesn't belong to user-provided binary + StringRef BinaryName = llvm::sys::path::filename(Event.BinaryPath); + if (Binary->getName() != BinaryName) + return; + + // Drop the event if its image is loaded at the same address + if (Event.Address == Binary->getBaseAddress()) { + Binary->setIsLoadedByMMap(true); + return; + } + + if (Event.Offset == Binary->getTextSegmentOffset()) { + // A binary image could be unloaded and then reloaded at different + // place, so update binary load address. 
+ // Only update for the first executable segment and assume all other + // segments are loaded at consecutive memory addresses, which is the case on + // X64. + Binary->setBaseAddress(Event.Address); + Binary->setIsLoadedByMMap(true); + } else { + // Verify segments are loaded consecutively. + const auto &Offsets = Binary->getTextSegmentOffsets(); + auto It = std::lower_bound(Offsets.begin(), Offsets.end(), Event.Offset); + if (It != Offsets.end() && *It == Event.Offset) { + // The event is for loading a separate executable segment. + auto I = std::distance(Offsets.begin(), It); + const auto &PreferredAddrs = Binary->getPreferredTextSegmentAddresses(); + if (PreferredAddrs[I] - Binary->getPreferredBaseAddress() != + Event.Address - Binary->getBaseAddress()) + exitWithError("Executable segments not loaded consecutively"); + } else { + if (It == Offsets.begin()) + exitWithError("File offset not found"); + else { + // Find the segment the event falls in. A large segment could be loaded + // via multiple mmap calls with consecutive memory addresses. + --It; + assert(*It < Event.Offset); + if (Event.Offset - *It != Event.Address - Binary->getBaseAddress()) + exitWithError("Segment not loaded by consecutive mmaps"); + } + } + } +} + +static std::string getContextKeyStr(ContextKey *K, + const ProfiledBinary *Binary) { + if (const auto *CtxKey = dyn_cast<StringBasedCtxKey>(K)) { + return SampleContext::getContextString(CtxKey->Context); + } else if (const auto *CtxKey = dyn_cast<ProbeBasedCtxKey>(K)) { + SampleContextFrameVector ContextStack; + for (const auto *Probe : CtxKey->Probes) { + Binary->getInlineContextForProbe(Probe, ContextStack, true); + } + // Probe context key at this point does not have leaf probe, so do not + // include the leaf inline location. + return SampleContext::getContextString(ContextStack, true); + } else { + llvm_unreachable("unexpected key type"); + } +} + +void HybridPerfReader::unwindSamples() { + if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + VirtualUnwinder Unwinder(&SampleCounters, Binary); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + Unwinder.unwind(Sample, Item.second); + } + + // Warn about untracked frames due to missing probes. + if (ShowDetailedWarning) { + for (auto Address : Unwinder.getUntrackedCallsites()) + WithColor::warning() << "Profile context truncated due to missing probe " + << "for call instruction at " + << format("0x%" PRIx64, Address) << "\n"; + } + + emitWarningSummary(Unwinder.getUntrackedCallsites().size(), + SampleCounters.size(), + "of profiled contexts are truncated due to missing probe " + "for call instruction."); + + emitWarningSummary( + Unwinder.NumMismatchedExtCallBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to unwinding error of external frame."); + + emitWarningSummary( + Unwinder.NumMismatchedProEpiBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to frame in prolog/epilog."); + + emitWarningSummary(Unwinder.NumMissingExternalFrame, + Unwinder.NumExtCallBranch, + "of artificial call branches but doesn't have an external " + "frame to match."); +} + +bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl<LBREntry> &LBRStack) { + // The raw format of LBR stack is like: + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... 
+ // ... 0x4005c8/0x4005dc/P/-/-/0 + // It's in FIFO order and seperated by whitespace. + SmallVector<StringRef, 32> Records; + TraceIt.getCurrentLine().split(Records, " ", -1, false); + auto WarnInvalidLBR = [](TraceStream &TraceIt) { + WithColor::warning() << "Invalid address in LBR record at line " + << TraceIt.getLineNumber() << ": " + << TraceIt.getCurrentLine() << "\n"; + }; + + // Skip the leading instruction pointer. + size_t Index = 0; + uint64_t LeadingAddr; + if (!Records.empty() && !Records[0].contains('/')) { + if (Records[0].getAsInteger(16, LeadingAddr)) { + WarnInvalidLBR(TraceIt); + TraceIt.advance(); + return false; + } + Index = 1; + } + // Now extract LBR samples - note that we do not reverse the + // LBR entry order so we can unwind the sample stack as we walk + // through LBR entries. + uint64_t PrevTrDst = 0; + + while (Index < Records.size()) { + auto &Token = Records[Index++]; + if (Token.size() == 0) + continue; + + SmallVector<StringRef, 8> Addresses; + Token.split(Addresses, "/"); + uint64_t Src; + uint64_t Dst; + + // Stop at broken LBR records. + if (Addresses.size() < 2 || Addresses[0].substr(2).getAsInteger(16, Src) || + Addresses[1].substr(2).getAsInteger(16, Dst)) { + WarnInvalidLBR(TraceIt); + break; + } + + bool SrcIsInternal = Binary->addressIsCode(Src); + bool DstIsInternal = Binary->addressIsCode(Dst); + bool IsExternal = !SrcIsInternal && !DstIsInternal; + bool IsIncoming = !SrcIsInternal && DstIsInternal; + bool IsOutgoing = SrcIsInternal && !DstIsInternal; + bool IsArtificial = false; + + // Ignore branches outside the current binary. + if (IsExternal) { + if (!PrevTrDst && !LBRStack.empty()) { + WithColor::warning() + << "Invalid transfer to external code in LBR record at line " + << TraceIt.getLineNumber() << ": " << TraceIt.getCurrentLine() + << "\n"; + } + // Do not ignore the entire samples, the remaining LBR can still be + // unwound using a context-less stack. + continue; + } + + if (IsOutgoing) { + if (!PrevTrDst) { + // This is a leading outgoing LBR, we should keep processing the LBRs. + if (LBRStack.empty()) { + NumLeadingOutgoingLBR++; + // Record this LBR since current source and next LBR' target is still + // a valid range. + LBRStack.emplace_back(LBREntry(Src, ExternalAddr, false)); + continue; + } + // This is middle unpaired outgoing jump which is likely due to + // interrupt or incomplete LBR trace. Ignore current and subsequent + // entries since they are likely in different contexts. + break; + } + + // For transition to external code, group the Source with the next + // availabe transition target. + Dst = PrevTrDst; + PrevTrDst = 0; + IsArtificial = true; + } else { + if (PrevTrDst) { + // If we have seen an incoming transition from external code to internal + // code, but not a following outgoing transition, the incoming + // transition is likely due to interrupt which is usually unpaired. + // Ignore current and subsequent entries since they are likely in + // different contexts. + break; + } + + if (IsIncoming) { + // For transition from external code (such as dynamic libraries) to + // the current binary, keep track of the branch target which will be + // grouped with the Source of the last transition from the current + // binary. 
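+        // Worked example (illustrative): given the raw entries, newest
+        // first, [ext:0x100 -> foo:0x2000] then [foo:0x1f00 -> ext:0x80],
+        // the incoming entry stashes PrevTrDst = 0x2000 here and is
+        // dropped; the following outgoing entry is then rewritten in the
+        // branch above into the artificial branch 0x1f00 -> 0x2000, hiding
+        // the external round trip from the unwinder.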
+ PrevTrDst = Dst; + continue; + } + } + + // TODO: filter out buggy duplicate branches on Skylake + + LBRStack.emplace_back(LBREntry(Src, Dst, IsArtificial)); + } + TraceIt.advance(); + return !LBRStack.empty(); +} + +bool PerfScriptReader::extractCallstack(TraceStream &TraceIt, + SmallVectorImpl<uint64_t> &CallStack) { + // The raw format of call stack is like: + // 4005dc # leaf frame + // 400634 + // 400684 # root frame + // It's in bottom-up order with each frame in one line. + + // Extract stack frames from sample + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + StringRef FrameStr = TraceIt.getCurrentLine().ltrim(); + uint64_t FrameAddr = 0; + if (FrameStr.getAsInteger(16, FrameAddr)) { + // We might parse a non-perf sample line like empty line and comments, + // skip it + TraceIt.advance(); + return false; + } + TraceIt.advance(); + // Currently intermixed frame from different binaries is not supported. + if (!Binary->addressIsCode(FrameAddr)) { + if (CallStack.empty()) + NumLeafExternalFrame++; + // Push a special value(ExternalAddr) for the external frames so that + // unwinder can still work on this with artificial Call/Return branch. + // After unwinding, the context will be truncated for external frame. + // Also deduplicate the consecutive external addresses. + if (CallStack.empty() || CallStack.back() != ExternalAddr) + CallStack.emplace_back(ExternalAddr); + continue; + } + + // We need to translate return address to call address for non-leaf frames. + if (!CallStack.empty()) { + auto CallAddr = Binary->getCallAddrFromFrameAddr(FrameAddr); + if (!CallAddr) { + // Stop at an invalid return address caused by bad unwinding. This could + // happen to frame-pointer-based unwinding and the callee functions that + // do not have the frame pointer chain set up. + InvalidReturnAddresses.insert(FrameAddr); + break; + } + FrameAddr = CallAddr; + } + + CallStack.emplace_back(FrameAddr); + } + + // Strip out the bottom external addr. + if (CallStack.size() > 1 && CallStack.back() == ExternalAddr) + CallStack.pop_back(); + + // Skip other unrelated line, find the next valid LBR line + // Note that even for empty call stack, we should skip the address at the + // bottom, otherwise the following pass may generate a truncated callstack + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + TraceIt.advance(); + } + // Filter out broken stack sample. We may not have complete frame info + // if sample end up in prolog/epilog, the result is dangling context not + // connected to entry point. This should be relatively rare thus not much + // impact on overall profile quality. However we do want to filter them + // out to reduce the number of different calling contexts. One instance + // of such case - when sample landed in prolog/epilog, somehow stack + // walking will be broken in an unexpected way that higher frames will be + // missing. + return !CallStack.empty() && + !Binary->addressInPrologEpilog(CallStack.front()); +} + +void PerfScriptReader::warnIfMissingMMap() { + if (!Binary->getMissingMMapWarned() && !Binary->getIsLoadedByMMap()) { + WithColor::warning() << "No relevant mmap event is matched for " + << Binary->getName() + << ", will use preferred address (" + << format("0x%" PRIx64, + Binary->getPreferredBaseAddress()) + << ") as the base loading address!\n"; + // Avoid redundant warning, only warn at the first unmatched sample. 
+ Binary->setMissingMMapWarned(true); + } +} + +void HybridPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + // The raw hybird sample started with call stack in FILO order and followed + // intermediately by LBR sample + // e.g. + // 4005dc # call stack leaf + // 400634 + // 400684 # call stack root + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries + // + std::shared_ptr<PerfSample> Sample = std::make_shared<PerfSample>(); + + // Parsing call stack and populate into PerfSample.CallStack + if (!extractCallstack(TraceIt, Sample->CallStack)) { + // Skip the next LBR line matched current call stack + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) + TraceIt.advance(); + return; + } + + warnIfMissingMMap(); + + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) { + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + if (IgnoreStackSamples) { + Sample->CallStack.clear(); + } else { + // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR + // ranges + Sample->CallStack.front() = Sample->LBRStack[0].Target; + } + // Record samples by aggregation + AggregatedSamples[Hashable<PerfSample>(Sample)] += Count; + } + } else { + // LBR sample is encoded in single line after stack sample + exitWithError("'Hybrid perf sample is corrupted, No LBR sample line"); + } +} + +void PerfScriptReader::writeUnsymbolizedProfile(StringRef Filename) { + std::error_code EC; + raw_fd_ostream OS(Filename, EC, llvm::sys::fs::OF_TextWithCRLF); + if (EC) + exitWithError(EC, Filename); + writeUnsymbolizedProfile(OS); +} + +// Use ordered map to make the output deterministic +using OrderedCounterForPrint = std::map<std::string, SampleCounter *>; + +void PerfScriptReader::writeUnsymbolizedProfile(raw_fd_ostream &OS) { + OrderedCounterForPrint OrderedCounters; + for (auto &CI : SampleCounters) { + OrderedCounters[getContextKeyStr(CI.first.getPtr(), Binary)] = &CI.second; + } + + auto SCounterPrinter = [&](RangeSample &Counter, StringRef Separator, + uint32_t Indent) { + OS.indent(Indent); + OS << Counter.size() << "\n"; + for (auto &I : Counter) { + uint64_t Start = I.first.first; + uint64_t End = I.first.second; + + if (!UseOffset || (UseOffset && UseLoadableSegmentAsBase)) { + Start = Binary->offsetToVirtualAddr(Start); + End = Binary->offsetToVirtualAddr(End); + } + + if (UseOffset && UseLoadableSegmentAsBase) { + Start -= Binary->getFirstLoadableAddress(); + End -= Binary->getFirstLoadableAddress(); + } + + OS.indent(Indent); + OS << Twine::utohexstr(Start) << Separator << Twine::utohexstr(End) << ":" + << I.second << "\n"; + } + }; + + for (auto &CI : OrderedCounters) { + uint32_t Indent = 0; + if (ProfileIsCSFlat) { + // Context string key + OS << "[" << CI.first << "]\n"; + Indent = 2; + } + + SampleCounter &Counter = *CI.second; + SCounterPrinter(Counter.RangeCounter, "-", Indent); + SCounterPrinter(Counter.BranchCounter, "->", Indent); + } +} + +// Format of input: +// number of entries in RangeCounter +// from_1-to_1:count_1 +// from_2-to_2:count_2 +// ...... +// from_n-to_n:count_n +// number of entries in BranchCounter +// src_1->dst_1:count_1 +// src_2->dst_2:count_2 +// ...... +// src_n->dst_n:count_n +void UnsymbolizedProfileReader::readSampleCounters(TraceStream &TraceIt, + SampleCounter &SCounters) { + auto exitWithErrorForTraceLine = [](TraceStream &TraceIt) { + std::string Msg = TraceIt.isAtEoF() + ? 
"Invalid raw profile!" + : "Invalid raw profile at line " + + Twine(TraceIt.getLineNumber()).str() + ": " + + TraceIt.getCurrentLine().str(); + exitWithError(Msg); + }; + auto ReadNumber = [&](uint64_t &Num) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + if (TraceIt.getCurrentLine().ltrim().getAsInteger(10, Num)) + exitWithErrorForTraceLine(TraceIt); + TraceIt.advance(); + }; + + auto ReadCounter = [&](RangeSample &Counter, StringRef Separator) { + uint64_t Num = 0; + ReadNumber(Num); + while (Num--) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + StringRef Line = TraceIt.getCurrentLine().ltrim(); + + uint64_t Count = 0; + auto LineSplit = Line.split(":"); + if (LineSplit.second.empty() || LineSplit.second.getAsInteger(10, Count)) + exitWithErrorForTraceLine(TraceIt); + + uint64_t Source = 0; + uint64_t Target = 0; + auto Range = LineSplit.first.split(Separator); + if (Range.second.empty() || Range.first.getAsInteger(16, Source) || + Range.second.getAsInteger(16, Target)) + exitWithErrorForTraceLine(TraceIt); + + if (!UseOffset || (UseOffset && UseLoadableSegmentAsBase)) { + uint64_t BaseAddr = 0; + if (UseOffset && UseLoadableSegmentAsBase) + BaseAddr = Binary->getFirstLoadableAddress(); + + Source = Binary->virtualAddrToOffset(Source + BaseAddr); + Target = Binary->virtualAddrToOffset(Target + BaseAddr); + } + + Counter[{Source, Target}] += Count; + TraceIt.advance(); + } + }; + + ReadCounter(SCounters.RangeCounter, "-"); + ReadCounter(SCounters.BranchCounter, "->"); +} + +void UnsymbolizedProfileReader::readUnsymbolizedProfile(StringRef FileName) { + TraceStream TraceIt(FileName); + while (!TraceIt.isAtEoF()) { + std::shared_ptr<StringBasedCtxKey> Key = + std::make_shared<StringBasedCtxKey>(); + StringRef Line = TraceIt.getCurrentLine(); + // Read context stack for CS profile. + if (Line.startswith("[")) { + ProfileIsCSFlat = true; + auto I = ContextStrSet.insert(Line.str()); + SampleContext::createCtxVectorFromStr(*I.first, Key->Context); + TraceIt.advance(); + } + auto Ret = + SampleCounters.emplace(Hashable<ContextKey>(Key), SampleCounter()); + readSampleCounters(TraceIt, Ret.first->second); + } +} + +void UnsymbolizedProfileReader::parsePerfTraces() { + readUnsymbolizedProfile(PerfTraceFile); +} + +void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample, + uint64_t Repeat) { + SampleCounter &Counter = SampleCounters.begin()->second; + uint64_t EndOffeset = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + assert(LBR.Source != ExternalAddr && + "Branch' source should not be an external address, it should be " + "converted to aritificial branch."); + uint64_t SourceOffset = Binary->virtualAddrToOffset(LBR.Source); + uint64_t TargetOffset = LBR.Target == static_cast<uint64_t>(ExternalAddr) + ? static_cast<uint64_t>(ExternalAddr) + : Binary->virtualAddrToOffset(LBR.Target); + + if (!LBR.IsArtificial && TargetOffset != ExternalAddr) { + Counter.recordBranchCount(SourceOffset, TargetOffset, Repeat); + } + + // If this not the first LBR, update the range count between TO of current + // LBR and FROM of next LBR. 
+ uint64_t StartOffset = TargetOffset; + if (EndOffeset != 0) + Counter.recordRangeCount(StartOffset, EndOffeset, Repeat); + EndOffeset = SourceOffset; + } +} + +void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + std::shared_ptr<PerfSample> Sample = std::make_shared<PerfSample>(); + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + warnIfMissingMMap(); + // Record LBR only samples by aggregation + AggregatedSamples[Hashable<PerfSample>(Sample)] += Count; + } +} + +void PerfScriptReader::generateUnsymbolizedProfile() { + // There is no context for LBR only sample, so initialize one entry with + // fake "empty" context key. + assert(SampleCounters.empty() && + "Sample counter map should be empty before raw profile generation"); + std::shared_ptr<StringBasedCtxKey> Key = + std::make_shared<StringBasedCtxKey>(); + SampleCounters.emplace(Hashable<ContextKey>(Key), SampleCounter()); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + computeCounterFromLBR(Sample, Item.second); + } +} + +uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) { + // The aggregated count is optional, so do not skip the line and return 1 if + // it's unmatched + uint64_t Count = 1; + if (!TraceIt.getCurrentLine().getAsInteger(10, Count)) + TraceIt.advance(); + return Count; +} + +void PerfScriptReader::parseSample(TraceStream &TraceIt) { + NumTotalSample++; + uint64_t Count = parseAggregatedCount(TraceIt); + assert(Count >= 1 && "Aggregated count should be >= 1!"); + parseSample(TraceIt, Count); +} + +bool PerfScriptReader::extractMMap2EventForBinary(ProfiledBinary *Binary, + StringRef Line, + MMapEvent &MMap) { + // Parse a line like: + // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0 + // 08:04 19532229 3585508847]: r-xp /usr/lib64/libdl-2.17.so + constexpr static const char *const Pattern = + "PERF_RECORD_MMAP2 ([0-9]+)/[0-9]+: " + "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " + "(0x[a-f0-9]+|0) .*\\]: [-a-z]+ (.*)"; + // Field 0 - whole line + // Field 1 - PID + // Field 2 - base address + // Field 3 - mmapped size + // Field 4 - page offset + // Field 5 - binary path + enum EventIndex { + WHOLE_LINE = 0, + PID = 1, + MMAPPED_ADDRESS = 2, + MMAPPED_SIZE = 3, + PAGE_OFFSET = 4, + BINARY_PATH = 5 + }; + + Regex RegMmap2(Pattern); + SmallVector<StringRef, 6> Fields; + bool R = RegMmap2.match(Line, &Fields); + if (!R) { + std::string ErrorMsg = "Cannot parse mmap event: " + Line.str() + " \n"; + exitWithError(ErrorMsg); + } + Fields[PID].getAsInteger(10, MMap.PID); + Fields[MMAPPED_ADDRESS].getAsInteger(0, MMap.Address); + Fields[MMAPPED_SIZE].getAsInteger(0, MMap.Size); + Fields[PAGE_OFFSET].getAsInteger(0, MMap.Offset); + MMap.BinaryPath = Fields[BINARY_PATH]; + if (ShowMmapEvents) { + outs() << "Mmap: Binary " << MMap.BinaryPath << " loaded at " + << format("0x%" PRIx64 ":", MMap.Address) << " \n"; + } + + StringRef BinaryName = llvm::sys::path::filename(MMap.BinaryPath); + return Binary->getName() == BinaryName; +} + +void PerfScriptReader::parseMMap2Event(TraceStream &TraceIt) { + MMapEvent MMap; + if (extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) + updateBinaryAddress(MMap); + TraceIt.advance(); +} + +void PerfScriptReader::parseEventOrSample(TraceStream &TraceIt) { + if (isMMap2Event(TraceIt.getCurrentLine())) + parseMMap2Event(TraceIt); + else + parseSample(TraceIt); +} + +void PerfScriptReader::parseAndAggregateTrace() { + // 
Trace line iterator + TraceStream TraceIt(PerfTraceFile); + while (!TraceIt.isAtEoF()) + parseEventOrSample(TraceIt); +} + +// A LBR sample is like: +// 40062f 0x5c6313f/0x5c63170/P/-/-/0 0x5c630e7/0x5c63130/P/-/-/0 ... +// A heuristic for fast detection by checking whether a +// leading " 0x" and the '/' exist. +bool PerfScriptReader::isLBRSample(StringRef Line) { + // Skip the leading instruction pointer + SmallVector<StringRef, 32> Records; + Line.trim().split(Records, " ", 2, false); + if (Records.size() < 2) + return false; + if (Records[1].startswith("0x") && Records[1].contains('/')) + return true; + return false; +} + +bool PerfScriptReader::isMMap2Event(StringRef Line) { + // Short cut to avoid string find is possible. + if (Line.empty() || Line.size() < 50) + return false; + + if (std::isdigit(Line[0])) + return false; + + // PERF_RECORD_MMAP2 does not appear at the beginning of the line + // for ` perf script --show-mmap-events -i ...` + return Line.contains("PERF_RECORD_MMAP2"); +} + +// The raw hybird sample is like +// e.g. +// 4005dc # call stack leaf +// 400634 +// 400684 # call stack root +// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... +// ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries +// Determine the perfscript contains hybrid samples(call stack + LBRs) by +// checking whether there is a non-empty call stack immediately followed by +// a LBR sample +PerfContent PerfScriptReader::checkPerfScriptType(StringRef FileName) { + TraceStream TraceIt(FileName); + uint64_t FrameAddr = 0; + while (!TraceIt.isAtEoF()) { + // Skip the aggregated count + if (!TraceIt.getCurrentLine().getAsInteger(10, FrameAddr)) + TraceIt.advance(); + + // Detect sample with call stack + int32_t Count = 0; + while (!TraceIt.isAtEoF() && + !TraceIt.getCurrentLine().ltrim().getAsInteger(16, FrameAddr)) { + Count++; + TraceIt.advance(); + } + if (!TraceIt.isAtEoF()) { + if (isLBRSample(TraceIt.getCurrentLine())) { + if (Count > 0) + return PerfContent::LBRStack; + else + return PerfContent::LBR; + } + TraceIt.advance(); + } + } + + exitWithError("Invalid perf script input!"); + return PerfContent::UnknownContent; +} + +void HybridPerfReader::generateUnsymbolizedProfile() { + ProfileIsCSFlat = !IgnoreStackSamples; + if (ProfileIsCSFlat) + unwindSamples(); + else + PerfScriptReader::generateUnsymbolizedProfile(); +} + +void PerfScriptReader::warnTruncatedStack() { + if (ShowDetailedWarning) { + for (auto Address : InvalidReturnAddresses) { + WithColor::warning() + << "Truncated stack sample due to invalid return address at " + << format("0x%" PRIx64, Address) + << ", likely caused by frame pointer omission\n"; + } + } + emitWarningSummary( + InvalidReturnAddresses.size(), AggregatedSamples.size(), + "of truncated stack samples due to invalid return address, " + "likely caused by frame pointer omission."); +} + +void PerfScriptReader::warnInvalidRange() { + std::unordered_map<std::pair<uint64_t, uint64_t>, uint64_t, + pair_hash<uint64_t, uint64_t>> + Ranges; + + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + uint64_t Count = Item.second; + uint64_t EndOffeset = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + uint64_t SourceOffset = Binary->virtualAddrToOffset(LBR.Source); + uint64_t StartOffset = Binary->virtualAddrToOffset(LBR.Target); + if (EndOffeset != 0) + Ranges[{StartOffset, EndOffeset}] += Count; + EndOffeset = SourceOffset; + } + } + + if (Ranges.empty()) { + WithColor::warning() << "No samples in perf script!\n"; + return; + } + + 
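// For reference, the fall-through stitching used above (and in
// computeCounterFromLBR) can be written as a detached sketch; simplified
// types, raw addresses with the offset translation elided, not part of this
// file:
//
//   std::map<std::pair<uint64_t, uint64_t>, uint64_t>
//   stitchRanges(ArrayRef<LBREntry> LBRStack, uint64_t Count) {
//     std::map<std::pair<uint64_t, uint64_t>, uint64_t> Ranges;
//     uint64_t EndOffset = 0;
//     for (const LBREntry &LBR : LBRStack) {
//       // Execution between two taken branches runs from where the current
//       // (older) entry landed to where the previous (younger) one departed.
//       if (EndOffset != 0)
//         Ranges[{LBR.Target, EndOffset}] += Count;
//       EndOffset = LBR.Source;
//     }
//     return Ranges;
//   }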
auto WarnInvalidRange = + [&](uint64_t StartOffset, uint64_t EndOffset, StringRef Msg) { + if (!ShowDetailedWarning) + return; + WithColor::warning() + << "[" + << format("%8" PRIx64, Binary->offsetToVirtualAddr(StartOffset)) + << "," + << format("%8" PRIx64, Binary->offsetToVirtualAddr(EndOffset)) + << "]: " << Msg << "\n"; + }; + + const char *EndNotBoundaryMsg = "Range is not on instruction boundary, " + "likely due to profile and binary mismatch."; + const char *DanglingRangeMsg = "Range does not belong to any functions, " + "likely from PLT, .init or .fini section."; + const char *RangeCrossFuncMsg = + "Fall through range should not cross function boundaries, likely due to " + "profile and binary mismatch."; + + uint64_t InstNotBoundary = 0; + uint64_t UnmatchedRange = 0; + uint64_t RangeCrossFunc = 0; + + for (auto &I : Ranges) { + uint64_t StartOffset = I.first.first; + uint64_t EndOffset = I.first.second; + + if (!Binary->offsetIsCode(StartOffset) || + !Binary->offsetIsTransfer(EndOffset)) { + InstNotBoundary++; + WarnInvalidRange(StartOffset, EndOffset, EndNotBoundaryMsg); + } + + auto *FRange = Binary->findFuncRangeForOffset(StartOffset); + if (!FRange) { + UnmatchedRange++; + WarnInvalidRange(StartOffset, EndOffset, DanglingRangeMsg); + continue; + } + + if (EndOffset >= FRange->EndOffset) { + RangeCrossFunc++; + WarnInvalidRange(StartOffset, EndOffset, RangeCrossFuncMsg); + } + } + + uint64_t TotalRangeNum = Ranges.size(); + emitWarningSummary(InstNotBoundary, TotalRangeNum, + "of profiled ranges are not on instruction boundary."); + emitWarningSummary(UnmatchedRange, TotalRangeNum, + "of profiled ranges do not belong to any functions."); + emitWarningSummary(RangeCrossFunc, TotalRangeNum, + "of profiled ranges do cross function boundaries."); +} + +void PerfScriptReader::parsePerfTraces() { + // Parse perf traces and do aggregation. + parseAndAggregateTrace(); + + emitWarningSummary(NumLeafExternalFrame, NumTotalSample, + "of samples have leaf external frame in call stack."); + emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample, + "of samples have leading external LBR."); + + // Generate unsymbolized profile. + warnTruncatedStack(); + warnInvalidRange(); + generateUnsymbolizedProfile(); + AggregatedSamples.clear(); + + if (SkipSymbolization) + writeUnsymbolizedProfile(OutputFilename); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/contrib/libs/llvm14/tools/llvm-profgen/PerfReader.h b/contrib/libs/llvm14/tools/llvm-profgen/PerfReader.h new file mode 100644 index 0000000000..9d84ad34bb --- /dev/null +++ b/contrib/libs/llvm14/tools/llvm-profgen/PerfReader.h @@ -0,0 +1,728 @@ +//===-- PerfReader.h - perfscript reader -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#include "ErrorHandling.h" +#include "ProfiledBinary.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" +#include <cstdint> +#include <fstream> +#include <list> +#include <map> +#include <vector> + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Stream based trace line iterator +class TraceStream { + std::string CurrentLine; + std::ifstream Fin; + bool IsAtEoF = false; + uint64_t LineNumber = 0; + +public: + TraceStream(StringRef Filename) : Fin(Filename.str()) { + if (!Fin.good()) + exitWithError("Error read input perf script file", Filename); + advance(); + } + + StringRef getCurrentLine() { + assert(!IsAtEoF && "Line iterator reaches the End-of-File!"); + return CurrentLine; + } + + uint64_t getLineNumber() { return LineNumber; } + + bool isAtEoF() { return IsAtEoF; } + + // Read the next line + void advance() { + if (!std::getline(Fin, CurrentLine)) { + IsAtEoF = true; + return; + } + LineNumber++; + } +}; + +// The type of input format. +enum PerfFormat { + UnknownFormat = 0, + PerfData = 1, // Raw linux perf.data. + PerfScript = 2, // Perf script create by `perf script` command. + UnsymbolizedProfile = 3, // Unsymbolized profile generated by llvm-profgen. + +}; + +// The type of perfscript content. +enum PerfContent { + UnknownContent = 0, + LBR = 1, // Only LBR sample. + LBRStack = 2, // Hybrid sample including call stack and LBR stack. +}; + +struct PerfInputFile { + std::string InputFile; + PerfFormat Format = PerfFormat::UnknownFormat; + PerfContent Content = PerfContent::UnknownContent; +}; + +// The parsed LBR sample entry. +struct LBREntry { + uint64_t Source = 0; + uint64_t Target = 0; + // An artificial branch stands for a series of consecutive branches starting + // from the current binary with a transition through external code and + // eventually landing back in the current binary. + bool IsArtificial = false; + LBREntry(uint64_t S, uint64_t T, bool I) + : Source(S), Target(T), IsArtificial(I) {} + +#ifndef NDEBUG + void print() const { + dbgs() << "from " << format("%#010x", Source) << " to " + << format("%#010x", Target); + if (IsArtificial) + dbgs() << " Artificial"; + } +#endif +}; + +#ifndef NDEBUG +static inline void printLBRStack(const SmallVectorImpl<LBREntry> &LBRStack) { + for (size_t I = 0; I < LBRStack.size(); I++) { + dbgs() << "[" << I << "] "; + LBRStack[I].print(); + dbgs() << "\n"; + } +} + +static inline void printCallStack(const SmallVectorImpl<uint64_t> &CallStack) { + for (size_t I = 0; I < CallStack.size(); I++) { + dbgs() << "[" << I << "] " << format("%#010x", CallStack[I]) << "\n"; + } +} +#endif + +// Hash interface for generic data of type T +// Data should implement a \fn getHashCode and a \fn isEqual +// Currently getHashCode is non-virtual to avoid the overhead of calling vtable, +// i.e we explicitly calculate hash of derived class, assign to base class's +// HashCode. This also provides the flexibility for calculating the hash code +// incrementally(like rolling hash) during frame stack unwinding since unwinding +// only changes the leaf of frame stack. \fn isEqual is a virtual function, +// which will have perf overhead. 
In the future, if we redesign a better hash
+// function, then we can just skip this or switch to a non-virtual function
+// (e.g., just skip the comparison when the probability of hash collisions is
+// low).
+template <class T> class Hashable {
+public:
+  std::shared_ptr<T> Data;
+  Hashable(const std::shared_ptr<T> &D) : Data(D) {}
+
+  // Hash code generation
+  struct Hash {
+    uint64_t operator()(const Hashable<T> &Key) const {
+      // Don't make it virtual for getHashCode
+      uint64_t Hash = Key.Data->getHashCode();
+      assert(Hash && "Should generate HashCode for it!");
+      return Hash;
+    }
+  };
+
+  // Hash equal
+  struct Equal {
+    bool operator()(const Hashable<T> &LHS, const Hashable<T> &RHS) const {
+      // Precisely compare the data; going through the vtable has overhead.
+      return LHS.Data->isEqual(RHS.Data.get());
+    }
+  };
+
+  T *getPtr() const { return Data.get(); }
+};
+
+struct PerfSample {
+  // LBR stack recorded in FIFO order.
+  SmallVector<LBREntry, 16> LBRStack;
+  // Call stack recorded in FILO (leaf to root) order; it's used for
+  // CS-profile generation.
+  SmallVector<uint64_t, 16> CallStack;
+
+  virtual ~PerfSample() = default;
+  uint64_t getHashCode() const {
+    // Use simple DJB2 hash
+    auto HashCombine = [](uint64_t H, uint64_t V) {
+      return ((H << 5) + H) + V;
+    };
+    uint64_t Hash = 5381;
+    for (const auto &Value : CallStack) {
+      Hash = HashCombine(Hash, Value);
+    }
+    for (const auto &Entry : LBRStack) {
+      Hash = HashCombine(Hash, Entry.Source);
+      Hash = HashCombine(Hash, Entry.Target);
+    }
+    return Hash;
+  }
+
+  bool isEqual(const PerfSample *Other) const {
+    const SmallVector<uint64_t, 16> &OtherCallStack = Other->CallStack;
+    const SmallVector<LBREntry, 16> &OtherLBRStack = Other->LBRStack;
+
+    if (CallStack.size() != OtherCallStack.size() ||
+        LBRStack.size() != OtherLBRStack.size())
+      return false;
+
+    if (!std::equal(CallStack.begin(), CallStack.end(), OtherCallStack.begin()))
+      return false;
+
+    for (size_t I = 0; I < OtherLBRStack.size(); I++) {
+      if (LBRStack[I].Source != OtherLBRStack[I].Source ||
+          LBRStack[I].Target != OtherLBRStack[I].Target)
+        return false;
+    }
+    return true;
+  }
+
+#ifndef NDEBUG
+  void print() const {
+    dbgs() << "LBR stack\n";
+    printLBRStack(LBRStack);
+    dbgs() << "Call stack\n";
+    printCallStack(CallStack);
+  }
+#endif
+};
+// After parsing a sample, we record it by aggregating into this counter.
+// The key stores the sample data and the value is the number of times the
+// sample repeats.
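getHashCode above is the classic DJB2 combine: ((H << 5) + H) + V is H * 33 + V, seeded with 5381. A minimal detached sketch over plain std types (not part of this header):

#include <cstdint>
#include <vector>

// DJB2 step: (H << 5) + H equals H * 33; adding V folds in the next value.
inline uint64_t djb2Combine(uint64_t H, uint64_t V) {
  return ((H << 5) + H) + V;
}

inline uint64_t djb2Hash(const std::vector<uint64_t> &Values) {
  uint64_t Hash = 5381; // canonical DJB2 seed
  for (uint64_t V : Values)
    Hash = djb2Combine(Hash, V);
  return Hash;
}

The unordered map that aggregates samples through Hashable's Hash and Equal functors follows.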
+using AggregatedCounter = + std::unordered_map<Hashable<PerfSample>, uint64_t, + Hashable<PerfSample>::Hash, Hashable<PerfSample>::Equal>; + +using SampleVector = SmallVector<std::tuple<uint64_t, uint64_t, uint64_t>, 16>; + +// The state for the unwinder, it doesn't hold the data but only keep the +// pointer/index of the data, While unwinding, the CallStack is changed +// dynamicially and will be recorded as the context of the sample +struct UnwindState { + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; + // Call stack trie node + struct ProfiledFrame { + const uint64_t Address = DummyRoot; + ProfiledFrame *Parent; + SampleVector RangeSamples; + SampleVector BranchSamples; + std::unordered_map<uint64_t, std::unique_ptr<ProfiledFrame>> Children; + + ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr) + : Address(Addr), Parent(P) {} + ProfiledFrame *getOrCreateChildFrame(uint64_t Address) { + assert(Address && "Address can't be zero!"); + auto Ret = Children.emplace( + Address, std::make_unique<ProfiledFrame>(Address, this)); + return Ret.first->second.get(); + } + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) { + RangeSamples.emplace_back(std::make_tuple(Start, End, Count)); + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) { + BranchSamples.emplace_back(std::make_tuple(Source, Target, Count)); + } + bool isDummyRoot() { return Address == DummyRoot; } + bool isExternalFrame() { return Address == ExternalAddr; } + bool isLeafFrame() { return Children.empty(); } + }; + + ProfiledFrame DummyTrieRoot; + ProfiledFrame *CurrentLeafFrame; + // Used to fall through the LBR stack + uint32_t LBRIndex = 0; + // Reference to PerfSample.LBRStack + const SmallVector<LBREntry, 16> &LBRStack; + // Used to iterate the address range + InstructionPointer InstPtr; + UnwindState(const PerfSample *Sample, const ProfiledBinary *Binary) + : Binary(Binary), LBRStack(Sample->LBRStack), + InstPtr(Binary, Sample->CallStack.front()) { + initFrameTrie(Sample->CallStack); + } + + bool validateInitialState() { + uint64_t LBRLeaf = LBRStack[LBRIndex].Target; + uint64_t LeafAddr = CurrentLeafFrame->Address; + assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) && + "External leading LBR should match the leaf frame."); + + // When we take a stack sample, ideally the sampling distance between the + // leaf IP of stack and the last LBR target shouldn't be very large. + // Use a heuristic size (0x100) to filter out broken records. 
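// Worked example (hypothetical addresses): a stack tip of 0x4006b0 against an
// LBR tip of 0x4006a0 is 0x10 bytes apart and passes the check below; a stack
// tip of 0x5006a0 (or anything at or beyond 0x4006a0 + 0x100) is rejected as
// a bogus record.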
+ if (LeafAddr < LBRLeaf || LeafAddr >= LBRLeaf + 0x100) { + WithColor::warning() << "Bogus trace: stack tip = " + << format("%#010x", LeafAddr) + << ", LBR tip = " << format("%#010x\n", LBRLeaf); + return false; + } + return true; + } + + void checkStateConsistency() { + assert(InstPtr.Address == CurrentLeafFrame->Address && + "IP should align with context leaf"); + } + + bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } + uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } + uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } + const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + bool IsLastLBR() const { return LBRIndex == 0; } + bool getLBRStackSize() const { return LBRStack.size(); } + void advanceLBR() { LBRIndex++; } + ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; } + + void pushFrame(uint64_t Address) { + CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address); + } + + void switchToFrame(uint64_t Address) { + if (CurrentLeafFrame->Address == Address) + return; + CurrentLeafFrame = CurrentLeafFrame->Parent->getOrCreateChildFrame(Address); + } + + void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; } + + void clearCallStack() { CurrentLeafFrame = &DummyTrieRoot; } + + void initFrameTrie(const SmallVectorImpl<uint64_t> &CallStack) { + ProfiledFrame *Cur = &DummyTrieRoot; + for (auto Address : reverse(CallStack)) { + Cur = Cur->getOrCreateChildFrame(Address); + } + CurrentLeafFrame = Cur; + } + + ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; } +}; + +// Base class for sample counter key with context +struct ContextKey { + uint64_t HashCode = 0; + virtual ~ContextKey() = default; + uint64_t getHashCode() { + if (HashCode == 0) + genHashCode(); + return HashCode; + } + virtual void genHashCode() = 0; + virtual bool isEqual(const ContextKey *K) const { + return HashCode == K->HashCode; + }; + + // Utilities for LLVM-style RTTI + enum ContextKind { CK_StringBased, CK_ProbeBased }; + const ContextKind Kind; + ContextKind getKind() const { return Kind; } + ContextKey(ContextKind K) : Kind(K){}; +}; + +// String based context id +struct StringBasedCtxKey : public ContextKey { + SampleContextFrameVector Context; + + bool WasLeafInlined; + StringBasedCtxKey() : ContextKey(CK_StringBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_StringBased; + } + + bool isEqual(const ContextKey *K) const override { + const StringBasedCtxKey *Other = dyn_cast<StringBasedCtxKey>(K); + return Context == Other->Context; + } + + void genHashCode() override { + HashCode = hash_value(SampleContextFrames(Context)); + } +}; + +// Probe based context key as the intermediate key of context +// String based context key will introduce redundant string handling +// since the callee context is inferred from the context string which +// need to be splitted by '@' to get the last location frame, so we +// can just use probe instead and generate the string in the end. 
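To make the cost concrete: a string-based context joins its frames with an ' @ ' separator, and recovering the callee frame means splitting the string back apart; the frame spelling below is illustrative and the helper is a detached sketch, not part of this header:

#include <string>
#include <vector>

// Split a context such as "main:2 @ foo:3 @ bar" into
// {"main:2", "foo:3", "bar"}. Every split copies a substring,
// which is exactly the overhead the probe-based key avoids.
inline std::vector<std::string> splitContext(const std::string &Context) {
  std::vector<std::string> Frames;
  const std::string Sep = " @ ";
  size_t Pos = 0;
  while (true) {
    size_t Next = Context.find(Sep, Pos);
    Frames.push_back(Context.substr(Pos, Next - Pos));
    if (Next == std::string::npos)
      break;
    Pos = Next + Sep.size();
  }
  return Frames;
}

The probe-based key below instead carries decoded probe pointers and renders the string only once, at profile generation time.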
+struct ProbeBasedCtxKey : public ContextKey { + SmallVector<const MCDecodedPseudoProbe *, 16> Probes; + + ProbeBasedCtxKey() : ContextKey(CK_ProbeBased) {} + static bool classof(const ContextKey *K) { + return K->getKind() == CK_ProbeBased; + } + + bool isEqual(const ContextKey *K) const override { + const ProbeBasedCtxKey *O = dyn_cast<ProbeBasedCtxKey>(K); + assert(O != nullptr && "Probe based key shouldn't be null in isEqual"); + return std::equal(Probes.begin(), Probes.end(), O->Probes.begin(), + O->Probes.end()); + } + + void genHashCode() override { + for (const auto *P : Probes) { + HashCode = hash_combine(HashCode, P); + } + if (HashCode == 0) { + // Avoid zero value of HashCode when it's an empty list + HashCode = 1; + } + } +}; + +// The counter of branch samples for one function indexed by the branch, +// which is represented as the source and target offset pair. +using BranchSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>; +// The counter of range samples for one function indexed by the range, +// which is represented as the start and end offset pair. +using RangeSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>; +// Wrapper for sample counters including range counter and branch counter +struct SampleCounter { + RangeSample RangeCounter; + BranchSample BranchCounter; + + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) { + assert(Start <= End && "Invalid instruction range"); + RangeCounter[{Start, End}] += Repeat; + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) { + BranchCounter[{Source, Target}] += Repeat; + } +}; + +// Sample counter with context to support context-sensitive profile +using ContextSampleCounterMap = + std::unordered_map<Hashable<ContextKey>, SampleCounter, + Hashable<ContextKey>::Hash, Hashable<ContextKey>::Equal>; + +struct FrameStack { + SmallVector<uint64_t, 16> Stack; + ProfiledBinary *Binary; + FrameStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr<StringBasedCtxKey> getContextKey(); +}; + +struct ProbeStack { + SmallVector<const MCDecodedPseudoProbe *, 16> Stack; + ProfiledBinary *Binary; + ProbeStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(Cur->Address); + // We may not find a probe for a merged or external callsite. + // Callsite merging may cause the loss of original probe IDs. + // Cutting off the context from here since the inliner will + // not know how to consume a context with unknown callsites. + if (!CallProbe) + return false; + Stack.push_back(CallProbe); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + // Use pseudo probe based context key to get the sample counter + // A context stands for a call path from 'main' to an uninlined + // callee with all inline frames recovered on that path. The probes + // belonging to that call path is the probes either originated from + // the callee or from any functions inlined into the callee. 
Since
+  // pseudo probes are organized in a trie style after decoding,
+  // the tree path from the trie root (which is the uninlined
+  // callee) to the probe node forms an inline context.
+  // Here we use a list of probes (pointers) as the context key to speed up
+  // aggregation, and the final context string will be generated in
+  // ProfileGenerator.
+  std::shared_ptr<ProbeBasedCtxKey> getContextKey();
+};
+
+/*
+As a hybrid sample carries a group of LBRs plus the most recently sampled call
+stack, we can walk through those LBRs to infer more call stacks, which are then
+used as contexts for the profile. VirtualUnwinder is the class that does the
+call stack unwinding based on LBR state. Two types of unwinding are processed
+here: 1) LBR unwinding and 2) linear range unwinding.
+Specifically, for each LBR entry (which can be classified as a call, a return,
+or a regular branch), LBR unwinding replays the operation by pushing, popping
+or switching the leaf frame of the call stack. Since the initial call stack is
+the most recently sampled one, the replay runs in anti-execution order, i.e.
+for the regular case, pop a frame off the call stack when the LBR is a call and
+push a frame onto it when the LBR is a return. After each LBR is processed, the
+unwinder also needs to align with the next LBR by walking through the
+instructions from the previous LBR's target to the current LBR's source, which
+is the linear unwinding. As instructions in a linear range can come from
+different functions due to inlining, linear unwinding splits the range and
+records counters per range with the same inline context. Throughout this
+unwinding process we record each call stack as a context id and each LBR/linear
+range as a sample counter for further CS profile generation.
+*/
+class VirtualUnwinder {
+public:
+  VirtualUnwinder(ContextSampleCounterMap *Counter, ProfiledBinary *B)
+      : CtxCounterMap(Counter), Binary(B) {}
+  bool unwind(const PerfSample *Sample, uint64_t Repeat);
+  std::set<uint64_t> &getUntrackedCallsites() { return UntrackedCallsites; }
+
+  uint64_t NumTotalBranches = 0;
+  uint64_t NumExtCallBranch = 0;
+  uint64_t NumMissingExternalFrame = 0;
+  uint64_t NumMismatchedProEpiBranch = 0;
+  uint64_t NumMismatchedExtCallBranch = 0;
+
+private:
+  bool isCallState(UnwindState &State) const {
+    // The tail call frame is always missing here in the stack sample; we will
+    // use a specific tail call tracker to infer it.
+    return Binary->addressIsCall(State.getCurrentLBRSource());
+  }
+
+  bool isReturnState(UnwindState &State) const {
+    // Simply check addressIsReturn, as ret is always reliable, both for
+    // regular calls and tail calls.
+    if (!Binary->addressIsReturn(State.getCurrentLBRSource()))
+      return false;
+
+    // In a callback case, a return from internal code, say A, to the external
+    // runtime can happen. The external runtime can then call back into
+    // another internal routine, say B. Making an artificial branch that
+    // looks like a return from A to B can confuse the unwinder into treating
+    // the instruction before B as the call instruction. We detect this case
+    // here: if the return target is not the instruction right after a call,
+    // we just do not treat it as a return.
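// Concrete illustration (hypothetical addresses): internal function A at
// 0x400500 returns into the external runtime, which later calls back into
// internal function B at 0x400800. The LBR surfaces this as an artificial
// "return" 0x400500 -> 0x400800; because 0x400800 is not right after a call
// instruction, getCallAddrFromFrameAddr below yields 0 and the entry is not
// unwound as a return.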
+ uint64_t CallAddr = + Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()); + return (CallAddr != 0); + } + + void unwindCall(UnwindState &State); + void unwindLinear(UnwindState &State, uint64_t Repeat); + void unwindReturn(UnwindState &State); + void unwindBranch(UnwindState &State); + + template <typename T> + void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack); + // Collect each samples on trie node by DFS traversal + template <typename T> + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack); + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur); + + void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, + uint64_t Repeat); + void recordBranchCount(const LBREntry &Branch, UnwindState &State, + uint64_t Repeat); + + ContextSampleCounterMap *CtxCounterMap; + // Profiled binary that current frame address belongs to + ProfiledBinary *Binary; + // Keep track of all untracked callsites + std::set<uint64_t> UntrackedCallsites; +}; + +// Read perf trace to parse the events and samples. +class PerfReaderBase { +public: + PerfReaderBase(ProfiledBinary *B, StringRef PerfTrace) + : Binary(B), PerfTraceFile(PerfTrace) { + // Initialize the base address to preferred address. + Binary->setBaseAddress(Binary->getPreferredBaseAddress()); + }; + virtual ~PerfReaderBase() = default; + static std::unique_ptr<PerfReaderBase> create(ProfiledBinary *Binary, + PerfInputFile &PerfInput); + + // Entry of the reader to parse multiple perf traces + virtual void parsePerfTraces() = 0; + const ContextSampleCounterMap &getSampleCounters() const { + return SampleCounters; + } + bool profileIsCSFlat() { return ProfileIsCSFlat; } + +protected: + ProfiledBinary *Binary = nullptr; + StringRef PerfTraceFile; + + ContextSampleCounterMap SampleCounters; + bool ProfileIsCSFlat = false; + + uint64_t NumTotalSample = 0; + uint64_t NumLeafExternalFrame = 0; + uint64_t NumLeadingOutgoingLBR = 0; +}; + +// Read perf script to parse the events and samples. +class PerfScriptReader : public PerfReaderBase { +public: + PerfScriptReader(ProfiledBinary *B, StringRef PerfTrace) + : PerfReaderBase(B, PerfTrace){}; + + // Entry of the reader to parse multiple perf traces + virtual void parsePerfTraces() override; + // Generate perf script from perf data + static PerfInputFile convertPerfDataToTrace(ProfiledBinary *Binary, + PerfInputFile &File); + // Extract perf script type by peaking at the input + static PerfContent checkPerfScriptType(StringRef FileName); + +protected: + // The parsed MMap event + struct MMapEvent { + uint64_t PID = 0; + uint64_t Address = 0; + uint64_t Size = 0; + uint64_t Offset = 0; + StringRef BinaryPath; + }; + + // Check whether a given line is LBR sample + static bool isLBRSample(StringRef Line); + // Check whether a given line is MMAP event + static bool isMMap2Event(StringRef Line); + // Parse a single line of a PERF_RECORD_MMAP2 event looking for a + // mapping between the binary name and its memory layout. + static bool extractMMap2EventForBinary(ProfiledBinary *Binary, StringRef Line, + MMapEvent &MMap); + // Update base address based on mmap events + void updateBinaryAddress(const MMapEvent &Event); + // Parse mmap event and update binary address + void parseMMap2Event(TraceStream &TraceIt); + // Parse perf events/samples and do aggregation + void parseAndAggregateTrace(); + // Parse either an MMAP event or a perf sample + void parseEventOrSample(TraceStream &TraceIt); + // Warn if the relevant mmap event is missing. 
+ void warnIfMissingMMap(); + // Emit accumulate warnings. + void warnTruncatedStack(); + // Warn if range is invalid. + void warnInvalidRange(); + // Extract call stack from the perf trace lines + bool extractCallstack(TraceStream &TraceIt, + SmallVectorImpl<uint64_t> &CallStack); + // Extract LBR stack from one perf trace line + bool extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl<LBREntry> &LBRStack); + uint64_t parseAggregatedCount(TraceStream &TraceIt); + // Parse one sample from multiple perf lines, override this for different + // sample type + void parseSample(TraceStream &TraceIt); + // An aggregated count is given to indicate how many times the sample is + // repeated. + virtual void parseSample(TraceStream &TraceIt, uint64_t Count){}; + void computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat); + // Post process the profile after trace aggregation, we will do simple range + // overlap computation for AutoFDO, or unwind for CSSPGO(hybrid sample). + virtual void generateUnsymbolizedProfile(); + void writeUnsymbolizedProfile(StringRef Filename); + void writeUnsymbolizedProfile(raw_fd_ostream &OS); + + // Samples with the repeating time generated by the perf reader + AggregatedCounter AggregatedSamples; + // Keep track of all invalid return addresses + std::set<uint64_t> InvalidReturnAddresses; +}; + +/* + The reader of LBR only perf script. + A typical LBR sample is like: + 40062f 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 0x4005c8/0x4005dc/P/-/-/0 +*/ +class LBRPerfReader : public PerfScriptReader { +public: + LBRPerfReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfScriptReader(Binary, PerfTrace){}; + // Parse the LBR only sample. + virtual void parseSample(TraceStream &TraceIt, uint64_t Count) override; +}; + +/* + Hybrid perf script includes a group of hybrid samples(LBRs + call stack), + which is used to generate CS profile. An example of hybrid sample: + 4005dc # call stack leaf + 400634 + 400684 # call stack root + 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries +*/ +class HybridPerfReader : public PerfScriptReader { +public: + HybridPerfReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfScriptReader(Binary, PerfTrace){}; + // Parse the hybrid sample including the call and LBR line + void parseSample(TraceStream &TraceIt, uint64_t Count) override; + void generateUnsymbolizedProfile() override; + +private: + // Unwind the hybrid samples after aggregration + void unwindSamples(); +}; + +/* + Format of unsymbolized profile: + + [frame1 @ frame2 @ ...] # If it's a CS profile + number of entries in RangeCounter + from_1-to_1:count_1 + from_2-to_2:count_2 + ...... + from_n-to_n:count_n + number of entries in BranchCounter + src_1->dst_1:count_1 + src_2->dst_2:count_2 + ...... + src_n->dst_n:count_n + [frame1 @ frame2 @ ...] # Next context + ...... + +Note that non-CS profile doesn't have the empty `[]` context. 
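A hypothetical CS instance of this format (frames, offsets and counts are
made up purely for illustration):

  [main:1 @ foo]
  2
  5c8-5dc:100
  5e0-5f4:42
  1
  5dc->630:100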
+*/ +class UnsymbolizedProfileReader : public PerfReaderBase { +public: + UnsymbolizedProfileReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfReaderBase(Binary, PerfTrace){}; + void parsePerfTraces() override; + +private: + void readSampleCounters(TraceStream &TraceIt, SampleCounter &SCounters); + void readUnsymbolizedProfile(StringRef Filename); + + std::unordered_set<std::string> ContextStrSet; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/contrib/libs/llvm14/tools/llvm-profgen/ProfileGenerator.cpp b/contrib/libs/llvm14/tools/llvm-profgen/ProfileGenerator.cpp new file mode 100644 index 0000000000..1248e37dc5 --- /dev/null +++ b/contrib/libs/llvm14/tools/llvm-profgen/ProfileGenerator.cpp @@ -0,0 +1,979 @@ +//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfileGenerator.h" +#include "ErrorHandling.h" +#include "ProfiledBinary.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include <float.h> +#include <unordered_set> + +cl::opt<std::string> OutputFilename("output", cl::value_desc("output"), + cl::Required, + cl::desc("Output profile file")); +static cl::alias OutputA("o", cl::desc("Alias for --output"), + cl::aliasopt(OutputFilename)); + +static cl::opt<SampleProfileFormat> OutputFormat( + "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), + cl::values( + clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"), + clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"), + clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), + clEnumValN(SPF_Text, "text", "Text encoding"), + clEnumValN(SPF_GCC, "gcc", + "GCC encoding (only meaningful for -sample)"))); + +cl::opt<bool> UseMD5( + "use-md5", cl::init(false), cl::Hidden, + cl::desc("Use md5 to represent function names in the output profile (only " + "meaningful for -extbinary)")); + +static cl::opt<bool> PopulateProfileSymbolList( + "populate-profile-symbol-list", cl::init(false), cl::Hidden, + cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); + +static cl::opt<bool> FillZeroForAllFuncs( + "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, + cl::desc("Attribute all functions' range with zero count " + "even it's not hit by any samples.")); + +static cl::opt<int32_t, true> RecursionCompression( + "compress-recursion", + cl::desc("Compressing recursion by deduplicating adjacent frame " + "sequences up to the specified size. -1 means no size limit."), + cl::Hidden, + cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); + +static cl::opt<bool> + TrimColdProfile("trim-cold-profile", cl::init(false), cl::ZeroOrMore, + cl::desc("If the total count of the profile is smaller " + "than threshold, it will be trimmed.")); + +static cl::opt<bool> CSProfMergeColdContext( + "csprof-merge-cold-context", cl::init(true), cl::ZeroOrMore, + cl::desc("If the total count of context profile is smaller than " + "the threshold, it will be merged into context-less base " + "profile.")); + +static cl::opt<uint32_t> CSProfMaxColdContextDepth( + "csprof-max-cold-context-depth", cl::init(1), cl::ZeroOrMore, + cl::desc("Keep the last K contexts while merging cold profile. 
1 means the " + "context-less base profile")); + +static cl::opt<int, true> CSProfMaxContextDepth( + "csprof-max-context-depth", cl::ZeroOrMore, + cl::desc("Keep the last K contexts while merging profile. -1 means no " + "depth limit."), + cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); + +static cl::opt<double> HotFunctionDensityThreshold( + "hot-function-density-threshold", llvm::cl::init(1000), + llvm::cl::desc( + "specify density threshold for hot functions (default: 1000)"), + llvm::cl::Optional); +static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false), + llvm::cl::desc("show profile density details"), + llvm::cl::Optional); + +static cl::opt<bool> UpdateTotalSamples( + "update-total-samples", llvm::cl::init(false), + llvm::cl::desc( + "Update total samples by accumulating all its body samples."), + llvm::cl::Optional); + +extern cl::opt<int> ProfileSummaryCutoffHot; + +static cl::opt<bool> GenCSNestedProfile( + "gen-cs-nested-profile", cl::Hidden, cl::init(false), + cl::desc("Generate nested function profiles for CSSPGO")); + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Initialize the MaxCompressionSize to -1 which means no size limit +int32_t CSProfileGenerator::MaxCompressionSize = -1; + +int CSProfileGenerator::MaxContextDepth = -1; + +bool ProfileGeneratorBase::UseFSDiscriminator = false; + +std::unique_ptr<ProfileGeneratorBase> +ProfileGeneratorBase::create(ProfiledBinary *Binary, + const ContextSampleCounterMap &SampleCounters, + bool ProfileIsCSFlat) { + std::unique_ptr<ProfileGeneratorBase> Generator; + if (ProfileIsCSFlat) { + if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); + } else { + Generator.reset(new ProfileGenerator(Binary, SampleCounters)); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + return Generator; +} + +void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer, + SampleProfileMap &ProfileMap) { + // Populate profile symbol list if extended binary format is used. + ProfileSymbolList SymbolList; + + if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { + Binary->populateSymbolListFromDWARF(SymbolList); + Writer->setProfileSymbolList(&SymbolList); + } + + if (std::error_code EC = Writer->write(ProfileMap)) + exitWithError(std::move(EC)); +} + +void ProfileGeneratorBase::write() { + auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); + if (std::error_code EC = WriterOrErr.getError()) + exitWithError(EC, OutputFilename); + + if (UseMD5) { + if (OutputFormat != SPF_Ext_Binary) + WithColor::warning() << "-use-md5 is ignored. Specify " + "--format=extbinary to enable it\n"; + else + WriterOrErr.get()->setUseMD5(); + } + + write(std::move(WriterOrErr.get()), ProfileMap); +} + +void ProfileGeneratorBase::showDensitySuggestion(double Density) { + if (Density == 0.0) + WithColor::warning() << "The --profile-summary-cutoff-hot option may be " + "set too low. Please check your command.\n"; + else if (Density < HotFunctionDensityThreshold) + WithColor::warning() + << "AutoFDO is estimated to optimize better with " + << format("%.1f", HotFunctionDensityThreshold / Density) + << "x more samples. 
Please consider increasing sampling rate or " + "profiling for longer duration to get more samples.\n"; + + if (ShowDensity) + outs() << "Minimum profile density for hot functions with top " + << format("%.2f", + static_cast<double>(ProfileSummaryCutoffHot.getValue()) / + 10000) + << "% total samples: " << format("%.1f", Density) << "\n"; +} + +double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold) { + double Density = DBL_MAX; + std::vector<const FunctionSamples *> HotFuncs; + for (auto &I : Profiles) { + auto &FuncSamples = I.second; + if (FuncSamples.getTotalSamples() < HotCntThreshold) + continue; + HotFuncs.emplace_back(&FuncSamples); + } + + for (auto *FuncSamples : HotFuncs) { + auto *Func = Binary->getBinaryFunction(FuncSamples->getName()); + if (!Func) + continue; + uint64_t FuncSize = Func->getFuncSize(); + if (FuncSize == 0) + continue; + Density = + std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) / + FuncSize); + } + + return Density == DBL_MAX ? 0.0 : Density; +} + +void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges) { + + /* + Regions may overlap with each other. Using the boundary info, find all + disjoint ranges and their sample count. BoundaryPoint contains the count + multiple samples begin/end at this points. + + |<--100-->| Sample1 + |<------200------>| Sample2 + A B C + + In the example above, + Sample1 begins at A, ends at B, its value is 100. + Sample2 beings at A, ends at C, its value is 200. + For A, BeginCount is the sum of sample begins at A, which is 300 and no + samples ends at A, so EndCount is 0. + Then boundary points A, B, and C with begin/end counts are: + A: (300, 0) + B: (0, 100) + C: (0, 200) + */ + struct BoundaryPoint { + // Sum of sample counts beginning at this point + uint64_t BeginCount = UINT64_MAX; + // Sum of sample counts ending at this point + uint64_t EndCount = UINT64_MAX; + // Is the begin point of a zero range. + bool IsZeroRangeBegin = false; + // Is the end point of a zero range. + bool IsZeroRangeEnd = false; + + void addBeginCount(uint64_t Count) { + if (BeginCount == UINT64_MAX) + BeginCount = 0; + BeginCount += Count; + } + + void addEndCount(uint64_t Count) { + if (EndCount == UINT64_MAX) + EndCount = 0; + EndCount += Count; + } + }; + + /* + For the above example. With boundary points, follwing logic finds two + disjoint region of + + [A,B]: 300 + [B+1,C]: 200 + + If there is a boundary point that both begin and end, the point itself + becomes a separate disjoint region. For example, if we have original + ranges of + + |<--- 100 --->| + |<--- 200 --->| + A B C + + there are three boundary points with their begin/end counts of + + A: (100, 0) + B: (200, 100) + C: (0, 200) + + the disjoint ranges would be + + [A, B-1]: 100 + [B, B]: 300 + [B+1, C]: 200. 
+ + Example for zero value range: + + |<--- 100 --->| + |<--- 200 --->| + |<--------------- 0 ----------------->| + A B C D E F + + [A, B-1] : 0 + [B, C] : 100 + [C+1, D-1]: 0 + [D, E] : 200 + [E+1, F] : 0 + */ + std::map<uint64_t, BoundaryPoint> Boundaries; + + for (const auto &Item : Ranges) { + assert(Item.first.first <= Item.first.second && + "Invalid instruction range"); + auto &BeginPoint = Boundaries[Item.first.first]; + auto &EndPoint = Boundaries[Item.first.second]; + uint64_t Count = Item.second; + + BeginPoint.addBeginCount(Count); + EndPoint.addEndCount(Count); + if (Count == 0) { + BeginPoint.IsZeroRangeBegin = true; + EndPoint.IsZeroRangeEnd = true; + } + } + + // Use UINT64_MAX to indicate there is no existing range between BeginAddress + // and the next valid address + uint64_t BeginAddress = UINT64_MAX; + int ZeroRangeDepth = 0; + uint64_t Count = 0; + for (const auto &Item : Boundaries) { + uint64_t Address = Item.first; + const BoundaryPoint &Point = Item.second; + if (Point.BeginCount != UINT64_MAX) { + if (BeginAddress != UINT64_MAX) + DisjointRanges[{BeginAddress, Address - 1}] = Count; + Count += Point.BeginCount; + BeginAddress = Address; + ZeroRangeDepth += Point.IsZeroRangeBegin; + } + if (Point.EndCount != UINT64_MAX) { + assert((BeginAddress != UINT64_MAX) && + "First boundary point cannot be 'end' point"); + DisjointRanges[{BeginAddress, Address}] = Count; + assert(Count >= Point.EndCount && "Mismatched live ranges"); + Count -= Point.EndCount; + BeginAddress = Address + 1; + ZeroRangeDepth -= Point.IsZeroRangeEnd; + // If the remaining count is zero and it's no longer in a zero range, this + // means we consume all the ranges before, thus mark BeginAddress as + // UINT64_MAX. e.g. supposing we have two non-overlapping ranges: + // [<---- 10 ---->] + // [<---- 20 ---->] + // A B C D + // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't + // have the [B+1, C-1] zero range. + if (Count == 0 && ZeroRangeDepth == 0) + BeginAddress = UINT64_MAX; + } + } +} + +void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( + FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, + uint64_t Count) { + // Use the maximum count of samples with same line location + uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator); + + // Use duplication factor to compensated for loop unroll/vectorization. + // Note that this is only needed when we're taking MAX of the counts at + // the location instead of SUM. + Count *= getDuplicationFactor(LeafLoc.Location.Discriminator); + + ErrorOr<uint64_t> R = + FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator); + + uint64_t PreviousCount = R ? 
R.get() : 0; + if (PreviousCount <= Count) { + FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, + Count - PreviousCount); + } +} + +void ProfileGeneratorBase::updateTotalSamples() { + if (!UpdateTotalSamples) + return; + + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateTotalSamples(); + } +} + +FunctionSamples & +ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) { + SampleContext Context(FuncName); + auto Ret = ProfileMap.emplace(Context, FunctionSamples()); + if (Ret.second) { + FunctionSamples &FProfile = Ret.first->second; + FProfile.setContext(Context); + } + return Ret.first->second; +} + +void ProfileGenerator::generateProfile() { + if (Binary->usePseudoProbes()) { + // TODO: Support probe based profile generation + exitWithError("Probe based profile generation not supported for AutoFDO, " + "consider dropping `--ignore-stack-samples` or adding `--use-dwarf-correlation`."); + } else { + generateLineNumBasedProfile(); + } + postProcessProfiles(); +} + +void ProfileGenerator::postProcessProfiles() { + computeSummaryAndThreshold(); + trimColdProfiles(ProfileMap, ColdCountThreshold); + calculateAndShowDensity(ProfileMap); +} + +void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, + uint64_t ColdCntThreshold) { + if (!TrimColdProfile) + return; + + // Move cold profiles into a tmp container. + std::vector<SampleContext> ColdProfiles; + for (const auto &I : ProfileMap) { + if (I.second.getTotalSamples() < ColdCntThreshold) + ColdProfiles.emplace_back(I.first); + } + + // Remove the cold profile from ProfileMap. + for (const auto &I : ColdProfiles) + ProfileMap.erase(I); +} + +void ProfileGenerator::generateLineNumBasedProfile() { + assert(SampleCounters.size() == 1 && + "Must have one entry for profile generation."); + const SampleCounter &SC = SampleCounters.begin()->second; + // Fill in function body samples + populateBodySamplesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForAllFunctions(SC.BranchCounter); + + updateTotalSamples(); +} + +FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( + const SampleContextFrameVector &FrameVec, uint64_t Count) { + // Get top level profile + FunctionSamples *FunctionProfile = + &getTopLevelFunctionProfile(FrameVec[0].FuncName); + FunctionProfile->addTotalSamples(Count); + + for (size_t I = 1; I < FrameVec.size(); I++) { + LineLocation Callsite( + FrameVec[I - 1].Location.LineOffset, + getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); + FunctionSamplesMap &SamplesMap = + FunctionProfile->functionSamplesAt(Callsite); + auto Ret = + SamplesMap.emplace(FrameVec[I].FuncName.str(), FunctionSamples()); + if (Ret.second) { + SampleContext Context(FrameVec[I].FuncName); + Ret.first->second.setContext(Context); + } + FunctionProfile = &Ret.first->second; + FunctionProfile->addTotalSamples(Count); + } + + return *FunctionProfile; +} + +RangeSample +ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { + RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); + if (FillZeroForAllFuncs) { + for (auto &FuncI : Binary->getAllBinaryFunctions()) { + for (auto &R : FuncI.second.Ranges) { + Ranges[{R.first, R.second - 1}] += 0; + } + } + } else { + // For each range, we search for all ranges of the function it belongs to + // and initialize it with zero count, so it remains zero if doesn't hit any + // 
samples. This is to be consistent with compiler that interpret zero count + // as unexecuted(cold). + for (const auto &I : RangeCounter) { + uint64_t StartOffset = I.first.first; + for (const auto &Range : Binary->getRangesForOffset(StartOffset)) + Ranges[{Range.first, Range.second - 1}] += 0; + } + } + RangeSample DisjointRanges; + findDisjointRanges(DisjointRanges, Ranges); + return DisjointRanges; +} + +void ProfileGenerator::populateBodySamplesForAllFunctions( + const RangeSample &RangeCounter) { + for (const auto &Range : preprocessRangeCounter(RangeCounter)) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); + const SampleContextFrameVector &FrameVec = + Binary->getFrameLocationStack(Offset); + if (!FrameVec.empty()) { + // FIXME: As accumulating total count per instruction caused some + // regression, we changed to accumulate total count per byte as a + // workaround. Tuning hotness threshold on the compiler side might be + // necessary in the future. + FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( + FrameVec, Count * Binary->getInstSize(Offset)); + updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(), + Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +StringRef ProfileGeneratorBase::getCalleeNameForOffset(uint64_t TargetOffset) { + // Get the function range by branch target if it's a call branch. + auto *FRange = Binary->findFuncRangeForStartOffset(TargetOffset); + + // We won't accumulate sample count for a range whose start is not the real + // function entry such as outlined function or inner labels. + if (!FRange || !FRange->IsFuncEntry) + return StringRef(); + + return FunctionSamples::getCanonicalFnName(FRange->getFuncName()); +} + +void ProfileGenerator::populateBoundarySamplesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceOffset = Entry.first.first; + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + // Record called target sample and its count. + const SampleContextFrameVector &FrameVec = + Binary->getFrameLocationStack(SourceOffset); + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, + getBaseDiscriminator(FrameVec.back().Location.Discriminator), + CalleeName, Count); + } + // Add head samples for callee. 
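// Illustration (hypothetical counts): a branch 0x5dc -> 0x630 observed 100
// times adds a called-target sample of 100 at the caller's line above, plus
// 100 head samples to the top-level profile of the callee entered at 0x630.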
+ FunctionSamples &CalleeProfile = getTopLevelFunctionProfile(CalleeName); + CalleeProfile.addHeadSamples(Count); + } +} + +void ProfileGeneratorBase::calculateAndShowDensity( + const SampleProfileMap &Profiles) { + double Density = calculateDensity(Profiles, HotCountThreshold); + showDensitySuggestion(Density); +} + +FunctionSamples &CSProfileGenerator::getFunctionProfileForContext( + const SampleContextFrameVector &Context, bool WasLeafInlined) { + auto I = ProfileMap.find(SampleContext(Context)); + if (I == ProfileMap.end()) { + // Save the new context for future references. + SampleContextFrames NewContext = *Contexts.insert(Context).first; + SampleContext FContext(NewContext, RawContext); + auto Ret = ProfileMap.emplace(FContext, FunctionSamples()); + if (WasLeafInlined) + FContext.setAttribute(ContextWasInlined); + FunctionSamples &FProfile = Ret.first->second; + FProfile.setContext(FContext); + return Ret.first->second; + } + return I->second; +} + +void CSProfileGenerator::generateProfile() { + FunctionSamples::ProfileIsCSFlat = true; + + if (Binary->getTrackFuncContextSize()) + computeSizeForProfiledFunctions(); + + if (Binary->usePseudoProbes()) { + // Enable pseudo probe functionalities in SampleProf + FunctionSamples::ProfileIsProbeBased = true; + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + postProcessProfiles(); +} + +void CSProfileGenerator::computeSizeForProfiledFunctions() { + // Hash map to deduplicate the function range and the item is a pair of + // function start and end offset. + std::unordered_map<uint64_t, uint64_t> AggregatedRanges; + // Go through all the ranges in the CS counters, use the start of the range to + // look up the function it belongs and record the function range. + for (const auto &CI : SampleCounters) { + for (const auto &Item : CI.second.RangeCounter) { + // FIXME: Filter the bogus crossing function range. + uint64_t StartOffset = Item.first.first; + // Note that a function can be spilt into multiple ranges, so get all + // ranges of the function. + for (const auto &Range : Binary->getRangesForOffset(StartOffset)) + AggregatedRanges[Range.first] = Range.second; + } + } + + for (const auto &I : AggregatedRanges) { + uint64_t StartOffset = I.first; + uint64_t EndOffset = I.second; + Binary->computeInlinedContextSizeForRange(StartOffset, EndOffset); + } +} + +void CSProfileGenerator::generateLineNumBasedProfile() { + for (const auto &CI : SampleCounters) { + const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr()); + + // Get or create function profile for the range + FunctionSamples &FunctionProfile = + getFunctionProfileForContext(CtxKey->Context, CtxKey->WasLeafInlined); + + // Fill in function body samples + populateBodySamplesForFunction(FunctionProfile, CI.second.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForFunction(CtxKey->Context, FunctionProfile, + CI.second.BranchCounter); + } + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. + populateInferredFunctionSamples(); + + updateTotalSamples(); +} + +void CSProfileGenerator::populateBodySamplesForFunction( + FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { + // Compute disjoint ranges first, so we can use MAX + // for calculating count for each location. 
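// Illustration (hypothetical counts): overlapping ranges [A,C]:100 and
// [B,C]:200 become disjoint [A,B-1]:100 and [B,C]:300; if two disjoint
// ranges later map to the same source line (e.g. code duplicated by
// unrolling), updateBodySamplesforFunctionProfile keeps the MAX (300)
// rather than the sum (400).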
+ RangeSample Ranges; + findDisjointRanges(Ranges, RangeCounter); + for (const auto &Range : Ranges) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. + if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); + auto LeafLoc = Binary->getInlineLeafFrameLoc(Offset); + if (LeafLoc.hasValue()) { + // Recording body sample for this specific context + updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); + FunctionProfile.addTotalSamples(Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +void CSProfileGenerator::populateBoundarySamplesForFunction( + SampleContextFrames ContextId, FunctionSamples &FunctionProfile, + const BranchSample &BranchCounters) { + + for (const auto &Entry : BranchCounters) { + uint64_t SourceOffset = Entry.first.first; + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + + // Record called target sample and its count + auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset); + if (!LeafLoc.hasValue()) + continue; + FunctionProfile.addCalledTargetSamples( + LeafLoc->Location.LineOffset, + getBaseDiscriminator(LeafLoc->Location.Discriminator), CalleeName, + Count); + + // Record head sample for called target(callee) + SampleContextFrameVector CalleeCtx(ContextId.begin(), ContextId.end()); + assert(CalleeCtx.back().FuncName == LeafLoc->FuncName && + "Leaf function name doesn't match"); + CalleeCtx.back() = *LeafLoc; + CalleeCtx.emplace_back(CalleeName, LineLocation(0, 0)); + FunctionSamples &CalleeProfile = getFunctionProfileForContext(CalleeCtx); + CalleeProfile.addHeadSamples(Count); + } +} + +static SampleContextFrame +getCallerContext(SampleContextFrames CalleeContext, + SampleContextFrameVector &CallerContext) { + assert(CalleeContext.size() > 1 && "Unexpected empty context"); + CalleeContext = CalleeContext.drop_back(); + CallerContext.assign(CalleeContext.begin(), CalleeContext.end()); + SampleContextFrame CallerFrame = CallerContext.back(); + CallerContext.back().Location = LineLocation(0, 0); + return CallerFrame; +} + +void CSProfileGenerator::populateInferredFunctionSamples() { + for (const auto &Item : ProfileMap) { + const auto &CalleeContext = Item.first; + const FunctionSamples &CalleeProfile = Item.second; + + // If we already have head sample counts, we must have value profile + // for call sites added already. Skip to avoid double counting. + if (CalleeProfile.getHeadSamples()) + continue; + // If we don't have context, nothing to do for caller's call site. + // This could happen for entry point function. 
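// Illustration (hypothetical context): for a callee context
// [main:2 @ foo:3 @ bar], getCallerContext above yields the caller context
// [main:2 @ foo] and returns foo:3 as the caller's leaf frame location,
// which is where the inferred call-site samples get attributed below.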
+    if (CalleeContext.isBaseContext())
+      continue;
+
+    // Infer the caller's frame location and context ID by dropping the leaf
+    // frame from the callee context
+    SampleContextFrameVector CallerContextId;
+    SampleContextFrame &&CallerLeafFrameLoc =
+        getCallerContext(CalleeContext.getContextFrames(), CallerContextId);
+    SampleContextFrames CallerContext(CallerContextId);
+
+    // It's possible that we haven't seen any sample directly in the caller,
+    // in which case CallerProfile will not exist. But we can't modify
+    // ProfileMap while iterating it.
+    // TODO: create function profiles for those callers too
+    if (ProfileMap.find(CallerContext) == ProfileMap.end())
+      continue;
+    FunctionSamples &CallerProfile = ProfileMap[CallerContext];
+
+    // Since we don't have call count for inlined functions, we
+    // estimate it from inlinee's profile using entry body sample.
+    uint64_t EstimatedCallCount = CalleeProfile.getEntrySamples();
+    // If we don't have samples with location, use 1 to indicate live.
+    if (!EstimatedCallCount && !CalleeProfile.getBodySamples().size())
+      EstimatedCallCount = 1;
+    CallerProfile.addCalledTargetSamples(
+        CallerLeafFrameLoc.Location.LineOffset,
+        CallerLeafFrameLoc.Location.Discriminator,
+        CalleeProfile.getContext().getName(), EstimatedCallCount);
+    CallerProfile.addBodySamples(CallerLeafFrameLoc.Location.LineOffset,
+                                 CallerLeafFrameLoc.Location.Discriminator,
+                                 EstimatedCallCount);
+    CallerProfile.addTotalSamples(EstimatedCallCount);
+  }
+}
+
+void CSProfileGenerator::postProcessProfiles() {
+  // Compute hot/cold threshold based on profile. This will be used for cold
+  // context profile merging/trimming.
+  computeSummaryAndThreshold();
+
+  // Run global pre-inliner to adjust/merge context profile based on estimated
+  // inline decisions.
+  if (EnableCSPreInliner) {
+    CSPreInliner(ProfileMap, *Binary, HotCountThreshold, ColdCountThreshold)
+        .run();
+    // Turn off the profile merger by default unless it is explicitly enabled.
+    if (!CSProfMergeColdContext.getNumOccurrences())
+      CSProfMergeColdContext = false;
+  }
+
+  // Trim and merge cold context profile using cold threshold above.
+  if (TrimColdProfile || CSProfMergeColdContext) {
+    SampleContextTrimmer(ProfileMap)
+        .trimAndMergeColdContextProfiles(
+            HotCountThreshold, TrimColdProfile, CSProfMergeColdContext,
+            CSProfMaxColdContextDepth, EnableCSPreInliner);
+  }
+
+  // Merge function samples of CS profile to calculate profile density.
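+  // For example (hypothetical contexts), the profiles for "main:1 @ foo" and
+  // "bar:3 @ foo" are both merged into one context-less profile keyed by
+  // "foo", so the density below is computed per function rather than per
+  // context.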
+  sampleprof::SampleProfileMap ContextLessProfiles;
+  for (const auto &I : ProfileMap) {
+    ContextLessProfiles[I.second.getName()].merge(I.second);
+  }
+
+  calculateAndShowDensity(ContextLessProfiles);
+  if (GenCSNestedProfile) {
+    CSProfileConverter CSConverter(ProfileMap);
+    CSConverter.convertProfiles();
+    FunctionSamples::ProfileIsCSFlat = false;
+    FunctionSamples::ProfileIsCSNested = EnableCSPreInliner;
+  }
+}
+
+void ProfileGeneratorBase::computeSummaryAndThreshold() {
+  SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
+  auto Summary = Builder.computeSummaryForProfiles(ProfileMap);
+  HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
+      (Summary->getDetailedSummary()));
+  ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
+      (Summary->getDetailedSummary()));
+}
+
+// Helper function to extract the context prefix frame stack. The prefix
+// stack is extracted once so it can be reused; the leaf part of the context
+// is appended, in compressed form, when the function profile is looked up.
+static void extractPrefixContextStack(
+    SampleContextFrameVector &ContextStack,
+    const SmallVectorImpl<const MCDecodedPseudoProbe *> &Probes,
+    ProfiledBinary *Binary) {
+  for (const auto *P : Probes) {
+    Binary->getInlineContextForProbe(P, ContextStack, true);
+  }
+}
+
+void CSProfileGenerator::generateProbeBasedProfile() {
+  for (const auto &CI : SampleCounters) {
+    const ProbeBasedCtxKey *CtxKey =
+        dyn_cast<ProbeBasedCtxKey>(CI.first.getPtr());
+    SampleContextFrameVector ContextStack;
+    extractPrefixContextStack(ContextStack, CtxKey->Probes, Binary);
+    // Fill in function body samples from probes, also infer caller's samples
+    // from callee's probe
+    populateBodySamplesWithProbes(CI.second.RangeCounter, ContextStack);
+    // Fill in boundary samples for a call probe
+    populateBoundarySamplesWithProbes(CI.second.BranchCounter, ContextStack);
+  }
+}
+
+void CSProfileGenerator::extractProbesFromRange(const RangeSample &RangeCounter,
+                                                ProbeCounterMap &ProbeCounter) {
+  RangeSample Ranges;
+  findDisjointRanges(Ranges, RangeCounter);
+  for (const auto &Range : Ranges) {
+    uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
+    uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
+    uint64_t Count = Range.second;
+    // Disjoint ranges may introduce zero-filled gaps that don't belong to
+    // the current context; filter them out.
+    if (Count == 0)
+      continue;
+
+    InstructionPointer IP(Binary, RangeBegin, true);
+    // A disjoint range may fall between two instructions, e.g. with Instr1
+    // at Addr1 and Instr2 at Addr2, a disjoint range can be Addr1+1 to
+    // Addr2-1. Such ranges should be ignored.
+    if (IP.Address > RangeEnd)
+      continue;
+
+    do {
+      const AddressProbesMap &Address2ProbesMap =
+          Binary->getAddress2ProbesMap();
+      auto It = Address2ProbesMap.find(IP.Address);
+      if (It != Address2ProbesMap.end()) {
+        for (const auto &Probe : It->second) {
+          if (!Probe.isBlock())
+            continue;
+          ProbeCounter[&Probe] += Count;
+        }
+      }
+    } while (IP.advance() && IP.Address <= RangeEnd);
+  }
+}
+
+void CSProfileGenerator::populateBodySamplesWithProbes(
+    const RangeSample &RangeCounter, SampleContextFrames ContextStack) {
+  ProbeCounterMap ProbeCounter;
+  // Extract the top frame probes by looking up each address among the range in
+  // the Address2ProbeMap
+  extractProbesFromRange(RangeCounter, ProbeCounter);
+  std::unordered_map<MCDecodedPseudoProbeInlineTree *,
+                     std::unordered_set<FunctionSamples *>>
+      FrameSamples;
+  for (const auto &PI : ProbeCounter) {
+    const MCDecodedPseudoProbe *Probe = PI.first;
+    uint64_t Count = PI.second;
+    FunctionSamples &FunctionProfile =
+        getFunctionProfileForLeafProbe(ContextStack, Probe);
+    // Record the current frame and FunctionProfile whenever samples are
+    // collected for non-dangling probes. This is for reporting all of the
+    // zero count probes of the frame later.
+    FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile);
+    FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count);
+    FunctionProfile.addTotalSamples(Count);
+    if (Probe->isEntry()) {
+      FunctionProfile.addHeadSamples(Count);
+      // Look up the caller's function profile
+      const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
+      SampleContextFrames CalleeContextId =
+          FunctionProfile.getContext().getContextFrames();
+      if (InlinerDesc != nullptr && CalleeContextId.size() > 1) {
+        // Since the context id will be compressed, we have to use callee's
+        // context id to infer caller's context id to ensure they share the
+        // same context prefix.
+        SampleContextFrameVector CallerContextId;
+        SampleContextFrame &&CallerLeafFrameLoc =
+            getCallerContext(CalleeContextId, CallerContextId);
+        uint64_t CallerIndex = CallerLeafFrameLoc.Location.LineOffset;
+        assert(CallerIndex &&
+               "Inferred caller's location index shouldn't be zero!");
+        FunctionSamples &CallerProfile =
+            getFunctionProfileForContext(CallerContextId);
+        CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
+        CallerProfile.addBodySamples(CallerIndex, 0, Count);
+        CallerProfile.addTotalSamples(Count);
+        CallerProfile.addCalledTargetSamples(
+            CallerIndex, 0, FunctionProfile.getContext().getName(), Count);
+      }
+    }
+  }
+
+  // Assign zero count for remaining probes without sample hits to
+  // differentiate from probes optimized away, of which the counts are unknown
+  // and will be inferred by the compiler.
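+  // For example (hypothetical counts), if a frame has block probes {1, 2, 3}
+  // and only probe 1 was hit, probes 2 and 3 get an explicit zero count
+  // below; a probe missing from the profile entirely stays unknown and is
+  // left for the compiler to infer.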
+  for (auto &I : FrameSamples) {
+    for (auto *FunctionProfile : I.second) {
+      for (auto *Probe : I.first->getProbes()) {
+        FunctionProfile->addBodySamplesForProbe(Probe->getIndex(), 0);
+      }
+    }
+  }
+}
+
+void CSProfileGenerator::populateBoundarySamplesWithProbes(
+    const BranchSample &BranchCounter, SampleContextFrames ContextStack) {
+  for (const auto &BI : BranchCounter) {
+    uint64_t SourceOffset = BI.first.first;
+    uint64_t TargetOffset = BI.first.second;
+    uint64_t Count = BI.second;
+    uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset);
+    const MCDecodedPseudoProbe *CallProbe =
+        Binary->getCallProbeForAddr(SourceAddress);
+    if (CallProbe == nullptr)
+      continue;
+    FunctionSamples &FunctionProfile =
+        getFunctionProfileForLeafProbe(ContextStack, CallProbe);
+    FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count);
+    FunctionProfile.addTotalSamples(Count);
+    StringRef CalleeName = getCalleeNameForOffset(TargetOffset);
+    if (CalleeName.size() == 0)
+      continue;
+    FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 0, CalleeName,
+                                           Count);
+  }
+}
+
+FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe(
+    SampleContextFrames ContextStack, const MCDecodedPseudoProbe *LeafProbe) {
+
+  // Explicitly copy the context for appending the leaf context
+  SampleContextFrameVector NewContextStack(ContextStack.begin(),
+                                           ContextStack.end());
+  Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true);
+  // For leaf inlined context with the top frame, we should strip off the top
+  // frame's probe id, like:
+  // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
+  auto LeafFrame = NewContextStack.back();
+  LeafFrame.Location = LineLocation(0, 0);
+  NewContextStack.pop_back();
+  // Compress the context string except for the leaf frame
+  CSProfileGenerator::compressRecursionContext(NewContextStack);
+  CSProfileGenerator::trimContext(NewContextStack);
+  NewContextStack.push_back(LeafFrame);
+
+  const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid());
+  bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
+  FunctionSamples &FunctionProfile =
+      getFunctionProfileForContext(NewContextStack, WasLeafInlined);
+  FunctionProfile.setFunctionHash(FuncDesc->FuncHash);
+  return FunctionProfile;
+}
+
+} // end namespace sampleprof
+} // end namespace llvm
diff --git a/contrib/libs/llvm14/tools/llvm-profgen/ProfileGenerator.h b/contrib/libs/llvm14/tools/llvm-profgen/ProfileGenerator.h
new file mode 100644
index 0000000000..af349ac991
--- /dev/null
+++ b/contrib/libs/llvm14/tools/llvm-profgen/ProfileGenerator.h
@@ -0,0 +1,312 @@
+//===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H
+#define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H
+#include "CSPreInliner.h"
+#include "ErrorHandling.h"
+#include "PerfReader.h"
+#include "ProfiledBinary.h"
+#include "llvm/ProfileData/SampleProfWriter.h"
+#include <memory>
+#include <unordered_set>
+
+using namespace llvm;
+using namespace sampleprof;
+
+namespace llvm {
+namespace sampleprof {
+
+// This is the base class for profile generation of sample-based PGO. We reuse
+// all structures relating to function profiles and profile writers as seen in
+// /ProfileData/SampleProf.h.
+class ProfileGeneratorBase {
+
+public:
+  ProfileGeneratorBase(ProfiledBinary *Binary,
+                       const ContextSampleCounterMap &Counters)
+      : Binary(Binary), SampleCounters(Counters){};
+  virtual ~ProfileGeneratorBase() = default;
+  static std::unique_ptr<ProfileGeneratorBase>
+  create(ProfiledBinary *Binary, const ContextSampleCounterMap &SampleCounters,
+         bool ProfileIsCSFlat);
+  virtual void generateProfile() = 0;
+  void write();
+
+  static uint32_t
+  getDuplicationFactor(unsigned Discriminator,
+                       bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) {
+    return UseFSD ? 1
+                  : llvm::DILocation::getDuplicationFactorFromDiscriminator(
+                        Discriminator);
+  }
+
+  static uint32_t
+  getBaseDiscriminator(unsigned Discriminator,
+                       bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) {
+    return UseFSD ? Discriminator
+                  : DILocation::getBaseDiscriminatorFromDiscriminator(
+                        Discriminator, /* IsFSDiscriminator */ false);
+  }
+
+  static bool UseFSDiscriminator;
+
+protected:
+  // Use SampleProfileWriter to serialize profile map
+  void write(std::unique_ptr<SampleProfileWriter> Writer,
+             SampleProfileMap &ProfileMap);
+  /*
+  For each region boundary point, mark if it is begin or end (or both) of
+  the region. Boundary points are inclusive. Log the sample count as well
+  so we can use it when we compute the sample count of each disjoint region
+  later. Note that there might be multiple ranges with different sample
+  count that share same begin/end point. We need to accumulate the sample
+  count for the boundary point for such case, because for the example
+  below,
+
+  |<--100-->|
+  |<------200------>|
+  A         B       C
+
+  sample count for disjoint region [A,B] would be 300.
+  */
+  void findDisjointRanges(RangeSample &DisjointRanges,
+                          const RangeSample &Ranges);
+  // Helper function for updating body sample for a leaf location in
+  // FunctionProfile
+  void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile,
+                                           const SampleContextFrame &LeafLoc,
+                                           uint64_t Count);
+  void updateTotalSamples();
+
+  StringRef getCalleeNameForOffset(uint64_t TargetOffset);
+
+  void computeSummaryAndThreshold();
+
+  void calculateAndShowDensity(const SampleProfileMap &Profiles);
+
+  double calculateDensity(const SampleProfileMap &Profiles,
+                          uint64_t HotCntThreshold);
+
+  void showDensitySuggestion(double Density);
+
+  // Thresholds from profile summary to answer isHotCount/isColdCount queries.
+  uint64_t HotCountThreshold;
+
+  uint64_t ColdCountThreshold;
+
+  // Used by SampleProfileWriter
+  SampleProfileMap ProfileMap;
+
+  ProfiledBinary *Binary = nullptr;
+
+  const ContextSampleCounterMap &SampleCounters;
+};
+
+class ProfileGenerator : public ProfileGeneratorBase {
+
+public:
+  ProfileGenerator(ProfiledBinary *Binary,
+                   const ContextSampleCounterMap &Counters)
+      : ProfileGeneratorBase(Binary, Counters){};
+  void generateProfile() override;
+
+private:
+  void generateLineNumBasedProfile();
+  RangeSample preprocessRangeCounter(const RangeSample &RangeCounter);
+  FunctionSamples &getTopLevelFunctionProfile(StringRef FuncName);
+  // Helper function to get the leaf frame's FunctionProfile by traversing the
+  // inline stack; meanwhile it adds the total samples for each frame's
+  // function profile.
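+  // For example (hypothetical frames), for the stack [main:2, foo:5, bar]
+  // this returns the FunctionSamples of bar nested under main's callsite 2
+  // and foo's callsite 5, adding Count to the total samples of all three
+  // frames along the way.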
+  FunctionSamples &
+  getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec,
+                                   uint64_t Count);
+  void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter);
+  void
+  populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters);
+  void postProcessProfiles();
+  void trimColdProfiles(const SampleProfileMap &Profiles,
+                        uint64_t ColdCntThreshold);
+};
+
+using ProbeCounterMap =
+    std::unordered_map<const MCDecodedPseudoProbe *, uint64_t>;
+
+class CSProfileGenerator : public ProfileGeneratorBase {
+public:
+  CSProfileGenerator(ProfiledBinary *Binary,
+                     const ContextSampleCounterMap &Counters)
+      : ProfileGeneratorBase(Binary, Counters){};
+
+  void generateProfile() override;
+
+  // Trim the context stack at a given depth.
+  template <typename T>
+  static void trimContext(SmallVectorImpl<T> &S, int Depth = MaxContextDepth) {
+    if (Depth < 0 || static_cast<size_t>(Depth) >= S.size())
+      return;
+    std::copy(S.begin() + S.size() - static_cast<size_t>(Depth), S.end(),
+              S.begin());
+    S.resize(Depth);
+  }
+
+  // Remove adjacent repeated context sequences up to a given sequence length,
+  // -1 means no size limit. Note that repeated sequences are identified based
+  // on the exact call site, this is finer granularity than function recursion.
+  template <typename T>
+  static void compressRecursionContext(SmallVectorImpl<T> &Context,
+                                       int32_t CSize = MaxCompressionSize) {
+    uint32_t I = 1;
+    uint32_t HS = static_cast<uint32_t>(Context.size() / 2);
+    uint32_t MaxDedupSize =
+        CSize == -1 ? HS : std::min(static_cast<uint32_t>(CSize), HS);
+    auto BeginIter = Context.begin();
+    // Use an in-place algorithm to save memory copy
+    // End indicates the end location of current iteration's data
+    uint32_t End = 0;
+    // Deduplicate from length 1 to the max possible size of a repeated
+    // sequence.
+    while (I <= MaxDedupSize) {
+      // This is a linear algorithm that deduplicates adjacent repeated
+      // sequences of size I. The deduplication detection runs on a sliding
+      // window whose size is 2*I and it keeps sliding the window to
+      // deduplicate the data inside. Once duplication is detected,
+      // deduplicate it by skipping the right half part of the window;
+      // otherwise just copy back the new data by appending it at the End
+      // pointer (for the next iteration).
+      //
+      // For example:
+      // Input: [a1, a2, b1, b2]
+      // (Indexes are added to distinguish the same char; the origin is
+      // [a, a, b, b], and the size of the dedup window is 2 (I = 1) at the
+      // beginning)
+      //
+      // 1) The initial status is a dummy window [null, a1], so just copy the
+      // right half of the window (End = 0), then slide the window.
+      // Result: [a1], a2, b1, b2 (End points to the element right before ],
+      // after ] is the data of the previous iteration)
+      //
+      // 2) Next window is [a1, a2]. Since a1 == a2, a duplication is found,
+      // so skip the right half of the window. Only slide the window.
+      // Result: [a1], a2, b1, b2
+      //
+      // 3) Next window is [a2, b1]; copy the right half of the window (b1 is
+      // new) to the End and slide the window.
+      // Result: [a1, b1], b1, b2
+      //
+      // 4) Next window is [b1, b2], same as 2), skip b2.
+      // Result: [a1, b1], b1, b2
+      // After resize, it will be [a, b]
+
+      // Use pointers like below to do comparison inside the window
+      //   [a b c a b c]
+      //    |     |     |     |     |
+      //    LeftBoundary Left Right Left+I Right+I
+      // A duplication is found if Left < LeftBoundary.
+
+      int32_t Right = I - 1;
+      End = I;
+      int32_t LeftBoundary = 0;
+      while (Right + I < Context.size()) {
+        // To avoid scanning a part of a sequence repeatedly, find the common
+        // suffix of the two halves in the window. The common suffix will
+        // serve as the common prefix of the next possible pair of duplicate
+        // sequences. The non-common part will be ignored and never scanned
+        // again.
+
+        // For example.
+        // Input: [a, b1], c1, b2, c2
+        // I = 2
+        //
+        // 1) For the window [a, b1, c1, b2], the non-common-suffix for the
+        // right part is 'c1'; copy it and only slide the window 1 step.
+        // Result: [a, b1, c1], b2, c2
+        //
+        // 2) Next window is [b1, c1, b2, c2], so a duplication happens.
+        // Result after resize: [a, b, c]
+
+        int32_t Left = Right;
+        while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) {
+          // Find the longest suffix inside the window. When it stops, Left
+          // points at the diverging point in the current sequence.
+          Left--;
+        }
+
+        bool DuplicationFound = (Left < LeftBoundary);
+        // Don't need to recheck the data before Right
+        LeftBoundary = Right + 1;
+        if (DuplicationFound) {
+          // Duplication found, skip right half of the window.
+          Right += I;
+        } else {
+          // Copy the non-common-suffix part of the adjacent sequence.
+          std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1,
+                    BeginIter + End);
+          End += Left + I - Right;
+          // Only slide the window by the size of the non-common-suffix
+          Right = Left + I;
+        }
+      }
+      // Don't forget the remaining part that's not scanned.
+      std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End);
+      End += Context.size() - Right - 1;
+      I++;
+      Context.resize(End);
+      MaxDedupSize = std::min(static_cast<uint32_t>(End / 2), MaxDedupSize);
+    }
+  }
+
+private:
+  void generateLineNumBasedProfile();
+  // Lookup or create FunctionSamples for the context
+  FunctionSamples &
+  getFunctionProfileForContext(const SampleContextFrameVector &Context,
+                               bool WasLeafInlined = false);
+  // For profiled-only functions, compute their inline context function byte
+  // size on demand, which is used by the pre-inliner.
+  void computeSizeForProfiledFunctions();
+  // Post processing for profiles before writing out, such as merging and
+  // trimming cold profiles, and running the preinliner on profiles.
+  void postProcessProfiles();
+
+  void populateBodySamplesForFunction(FunctionSamples &FunctionProfile,
+                                      const RangeSample &RangeCounters);
+  void populateBoundarySamplesForFunction(SampleContextFrames ContextId,
+                                          FunctionSamples &FunctionProfile,
+                                          const BranchSample &BranchCounters);
+  void populateInferredFunctionSamples();
+
+  void generateProbeBasedProfile();
+  // Go through each address from range to extract the top frame probe by
+  // looking up in the Address2ProbeMap
+  void extractProbesFromRange(const RangeSample &RangeCounter,
+                              ProbeCounterMap &ProbeCounter);
+  // Fill in function body samples from probes
+  void populateBodySamplesWithProbes(const RangeSample &RangeCounter,
+                                     SampleContextFrames ContextStack);
+  // Fill in boundary samples for a call probe
+  void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter,
+                                         SampleContextFrames ContextStack);
+  // Helper function to get FunctionSamples for the leaf probe
+  FunctionSamples &
+  getFunctionProfileForLeafProbe(SampleContextFrames ContextStack,
+                                 const MCDecodedPseudoProbe *LeafProbe);
+
+  // Underlying context table, serving the sample profile writer.
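+  // Note: the SampleContextFrames handed out by getFunctionProfileForContext
+  // point into this set, so it owns the context storage for the lifetime of
+  // the ProfileMap built from it.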
+ std::unordered_set<SampleContextFrameVector, SampleContextFrameHash> Contexts; + +public: + // Deduplicate adjacent repeated context sequences up to a given sequence + // length. -1 means no size limit. + static int32_t MaxCompressionSize; + static int MaxContextDepth; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/contrib/libs/llvm14/tools/llvm-profgen/ProfiledBinary.cpp b/contrib/libs/llvm14/tools/llvm-profgen/ProfiledBinary.cpp new file mode 100644 index 0000000000..a773a3c98d --- /dev/null +++ b/contrib/libs/llvm14/tools/llvm-profgen/ProfiledBinary.cpp @@ -0,0 +1,790 @@ +//===-- ProfiledBinary.cpp - Binary decoder ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfiledBinary.h" +#include "ErrorHandling.h" +#include "ProfileGenerator.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Demangle/Demangle.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/TargetSelect.h" + +#define DEBUG_TYPE "load-binary" + +using namespace llvm; +using namespace sampleprof; + +cl::opt<bool> ShowDisassemblyOnly("show-disassembly-only", cl::init(false), + cl::ZeroOrMore, + cl::desc("Print disassembled code.")); + +cl::opt<bool> ShowSourceLocations("show-source-locations", cl::init(false), + cl::ZeroOrMore, + cl::desc("Print source locations.")); + +static cl::opt<bool> + ShowCanonicalFnName("show-canonical-fname", cl::init(false), cl::ZeroOrMore, + cl::desc("Print canonical function name.")); + +static cl::opt<bool> ShowPseudoProbe( + "show-pseudo-probe", cl::init(false), cl::ZeroOrMore, + cl::desc("Print pseudo probe section and disassembled info.")); + +static cl::opt<bool> UseDwarfCorrelation( + "use-dwarf-correlation", cl::init(false), cl::ZeroOrMore, + cl::desc("Use dwarf for profile correlation even when binary contains " + "pseudo probe.")); + +static cl::list<std::string> DisassembleFunctions( + "disassemble-functions", cl::CommaSeparated, + cl::desc("List of functions to print disassembly for. Accept demangled " + "names only. Only work with show-disassembly-only")); + +extern cl::opt<bool> ShowDetailedWarning; + +namespace llvm { +namespace sampleprof { + +static const Target *getTarget(const ObjectFile *Obj) { + Triple TheTriple = Obj->makeTriple(); + std::string Error; + std::string ArchName; + const Target *TheTarget = + TargetRegistry::lookupTarget(ArchName, TheTriple, Error); + if (!TheTarget) + exitWithError(Error, Obj->getFileName()); + return TheTarget; +} + +void BinarySizeContextTracker::addInstructionForContext( + const SampleContextFrameVector &Context, uint32_t InstrSize) { + ContextTrieNode *CurNode = &RootContext; + bool IsLeaf = true; + for (const auto &Callsite : reverse(Context)) { + StringRef CallerName = Callsite.FuncName; + LineLocation CallsiteLoc = IsLeaf ? 
LineLocation(0, 0) : Callsite.Location; + CurNode = CurNode->getOrCreateChildContext(CallsiteLoc, CallerName); + IsLeaf = false; + } + + CurNode->addFunctionSize(InstrSize); +} + +uint32_t +BinarySizeContextTracker::getFuncSizeForContext(const SampleContext &Context) { + ContextTrieNode *CurrNode = &RootContext; + ContextTrieNode *PrevNode = nullptr; + SampleContextFrames Frames = Context.getContextFrames(); + int32_t I = Frames.size() - 1; + Optional<uint32_t> Size; + + // Start from top-level context-less function, traverse down the reverse + // context trie to find the best/longest match for given context, then + // retrieve the size. + + while (CurrNode && I >= 0) { + // Process from leaf function to callers (added to context). + const auto &ChildFrame = Frames[I--]; + PrevNode = CurrNode; + CurrNode = + CurrNode->getChildContext(ChildFrame.Location, ChildFrame.FuncName); + if (CurrNode && CurrNode->getFunctionSize().hasValue()) + Size = CurrNode->getFunctionSize().getValue(); + } + + // If we traversed all nodes along the path of the context and haven't + // found a size yet, pivot to look for size from sibling nodes, i.e size + // of inlinee under different context. + if (!Size.hasValue()) { + if (!CurrNode) + CurrNode = PrevNode; + while (!Size.hasValue() && CurrNode && + !CurrNode->getAllChildContext().empty()) { + CurrNode = &CurrNode->getAllChildContext().begin()->second; + if (CurrNode->getFunctionSize().hasValue()) + Size = CurrNode->getFunctionSize().getValue(); + } + } + + assert(Size.hasValue() && "We should at least find one context size."); + return Size.getValue(); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder) { + ProbeFrameStack ProbeContext; + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) + trackInlineesOptimizedAway(ProbeDecoder, *Child.second.get(), ProbeContext); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, ProbeFrameStack &ProbeContext) { + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(ProbeNode.Guid)->FuncName; + ProbeContext.emplace_back(FuncName, 0); + + // This ProbeContext has a probe, so it has code before inlining and + // optimization. Make sure we mark its size as known. + if (!ProbeNode.getProbes().empty()) { + ContextTrieNode *SizeContext = &RootContext; + for (auto &ProbeFrame : reverse(ProbeContext)) { + StringRef CallerName = ProbeFrame.first; + LineLocation CallsiteLoc(ProbeFrame.second, 0); + SizeContext = + SizeContext->getOrCreateChildContext(CallsiteLoc, CallerName); + } + // Add 0 size to make known. 
+ SizeContext->addFunctionSize(0); + } + + // DFS down the probe inline tree + for (const auto &ChildNode : ProbeNode.getChildren()) { + InlineSite Location = ChildNode.first; + ProbeContext.back().second = std::get<1>(Location); + trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second.get(), ProbeContext); + } + + ProbeContext.pop_back(); +} + +void ProfiledBinary::warnNoFuncEntry() { + uint64_t NoFuncEntryNum = 0; + for (auto &F : BinaryFunctions) { + if (F.second.Ranges.empty()) + continue; + bool hasFuncEntry = false; + for (auto &R : F.second.Ranges) { + if (FuncRange *FR = findFuncRangeForStartOffset(R.first)) { + if (FR->IsFuncEntry) { + hasFuncEntry = true; + break; + } + } + } + + if (!hasFuncEntry) { + NoFuncEntryNum++; + if (ShowDetailedWarning) + WithColor::warning() + << "Failed to determine function entry for " << F.first + << " due to inconsistent name from symbol table and dwarf info.\n"; + } + } + emitWarningSummary(NoFuncEntryNum, BinaryFunctions.size(), + "of functions failed to determine function entry due to " + "inconsistent name from symbol table and dwarf info."); +} + +void ProfiledBinary::load() { + // Attempt to open the binary. + OwningBinary<Binary> OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + + auto *Obj = dyn_cast<ELFObjectFileBase>(&ExeBinary); + if (!Obj) + exitWithError("not a valid Elf image", Path); + + TheTriple = Obj->makeTriple(); + // Current only support X86 + if (!TheTriple.isX86()) + exitWithError("unsupported target", TheTriple.getTriple()); + LLVM_DEBUG(dbgs() << "Loading " << Path << "\n"); + + // Find the preferred load address for text sections. + setPreferredTextSegmentAddresses(Obj); + + // Decode pseudo probe related section + decodePseudoProbe(Obj); + + // Load debug info of subprograms from DWARF section. + // If path of debug info binary is specified, use the debug info from it, + // otherwise use the debug info from the executable binary. + if (!DebugBinaryPath.empty()) { + OwningBinary<Binary> DebugPath = + unwrapOrError(createBinary(DebugBinaryPath), DebugBinaryPath); + loadSymbolsFromDWARF(*dyn_cast<ObjectFile>(DebugPath.getBinary())); + } else { + loadSymbolsFromDWARF(*dyn_cast<ObjectFile>(&ExeBinary)); + } + + // Disassemble the text sections. + disassemble(Obj); + + // Track size for optimized inlinees when probe is available + if (UsePseudoProbes && TrackFuncContextSize) + FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder); + + // Use function start and return address to infer prolog and epilog + ProEpilogTracker.inferPrologOffsets(StartOffset2FuncRangeMap); + ProEpilogTracker.inferEpilogOffsets(RetOffsets); + + warnNoFuncEntry(); + + // TODO: decode other sections. 
+}
+
+bool ProfiledBinary::inlineContextEqual(uint64_t Address1, uint64_t Address2) {
+  uint64_t Offset1 = virtualAddrToOffset(Address1);
+  uint64_t Offset2 = virtualAddrToOffset(Address2);
+  const SampleContextFrameVector &Context1 = getFrameLocationStack(Offset1);
+  const SampleContextFrameVector &Context2 = getFrameLocationStack(Offset2);
+  if (Context1.size() != Context2.size())
+    return false;
+  if (Context1.empty())
+    return false;
+  // The leaf frame contains the location within the leaf itself, which is
+  // not part of the calling context, so it needs to be removed from the
+  // comparison.
+  return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1,
+                    Context2.begin(), Context2.begin() + Context2.size() - 1);
+}
+
+SampleContextFrameVector
+ProfiledBinary::getExpandedContext(const SmallVectorImpl<uint64_t> &Stack,
+                                   bool &WasLeafInlined) {
+  SampleContextFrameVector ContextVec;
+  // Process from frame root to leaf
+  for (auto Address : Stack) {
+    uint64_t Offset = virtualAddrToOffset(Address);
+    const SampleContextFrameVector &ExpandedContext =
+        getFrameLocationStack(Offset);
+    // An instruction without a valid debug line will be ignored by sample
+    // processing
+    if (ExpandedContext.empty())
+      return SampleContextFrameVector();
+    // Set WasLeafInlined based on the inlined frame count for the last
+    // address, which is the leaf
+    WasLeafInlined = (ExpandedContext.size() > 1);
+    ContextVec.append(ExpandedContext);
+  }
+
+  // Replace with decoded base discriminator
+  for (auto &Frame : ContextVec) {
+    Frame.Location.Discriminator = ProfileGeneratorBase::getBaseDiscriminator(
+        Frame.Location.Discriminator, UseFSDiscriminator);
+  }
+
+  assert(ContextVec.size() && "Context length should be at least 1");
+
+  // Compress the context string except for the leaf frame
+  auto LeafFrame = ContextVec.back();
+  LeafFrame.Location = LineLocation(0, 0);
+  ContextVec.pop_back();
+  CSProfileGenerator::compressRecursionContext(ContextVec);
+  CSProfileGenerator::trimContext(ContextVec);
+  ContextVec.push_back(LeafFrame);
+  return ContextVec;
+}
+
+template <class ELFT>
+void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile<ELFT> &Obj,
+                                                      StringRef FileName) {
+  const auto &PhdrRange = unwrapOrError(Obj.program_headers(), FileName);
+  // FIXME: This should be the page size of the system running profiling.
+  // However such info isn't available at post-processing time, assuming
+  // 4K page now. Note that we don't use EXEC_PAGESIZE from <linux/param.h>
+  // because we may build the tools on non-linux.
+  uint32_t PageSize = 0x1000;
+  for (const typename ELFT::Phdr &Phdr : PhdrRange) {
+    if (Phdr.p_type == ELF::PT_LOAD) {
+      if (!FirstLoadableAddress)
+        FirstLoadableAddress = Phdr.p_vaddr & ~(PageSize - 1U);
+      if (Phdr.p_flags & ELF::PF_X) {
+        // Segments will always be loaded at a page boundary.
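+        // For example (assuming the 4K page above), a segment with p_vaddr
+        // 0x401a80 is recorded with the page-aligned preferred address
+        // 0x401a80 & ~0xfff == 0x401000.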
+        PreferredTextSegmentAddresses.push_back(Phdr.p_vaddr &
+                                                ~(PageSize - 1U));
+        TextSegmentOffsets.push_back(Phdr.p_offset & ~(PageSize - 1U));
+      }
+    }
+  }
+
+  if (PreferredTextSegmentAddresses.empty())
+    exitWithError("no executable segment found", FileName);
+}
+
+void ProfiledBinary::setPreferredTextSegmentAddresses(
+    const ELFObjectFileBase *Obj) {
+  if (const auto *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
+    setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
+  else if (const auto *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
+    setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
+  else if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
+    setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
+  else if (const auto *ELFObj = cast<ELF64BEObjectFile>(Obj))
+    setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
+  else
+    llvm_unreachable("invalid ELF object format");
+}
+
+void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) {
+  if (UseDwarfCorrelation)
+    return;
+
+  StringRef FileName = Obj->getFileName();
+  for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end();
+       SI != SE; ++SI) {
+    const SectionRef &Section = *SI;
+    StringRef SectionName = unwrapOrError(Section.getName(), FileName);
+
+    if (SectionName == ".pseudo_probe_desc") {
+      StringRef Contents = unwrapOrError(Section.getContents(), FileName);
+      if (!ProbeDecoder.buildGUID2FuncDescMap(
+              reinterpret_cast<const uint8_t *>(Contents.data()),
+              Contents.size()))
+        exitWithError(
+            "Pseudo Probe decoder failed in .pseudo_probe_desc section");
+    } else if (SectionName == ".pseudo_probe") {
+      StringRef Contents = unwrapOrError(Section.getContents(), FileName);
+      if (!ProbeDecoder.buildAddress2ProbeMap(
+              reinterpret_cast<const uint8_t *>(Contents.data()),
+              Contents.size()))
+        exitWithError("Pseudo Probe decoder failed in .pseudo_probe section");
+      // Set the UsePseudoProbes flag, used by PerfReader
+      UsePseudoProbes = true;
+    }
+  }
+
+  if (ShowPseudoProbe)
+    ProbeDecoder.printGUID2FuncDescMap(outs());
+}
+
+void ProfiledBinary::setIsFuncEntry(uint64_t Offset, StringRef RangeSymName) {
+  // Note that the start offset of each ELF section can be a non-function
+  // symbol, we need to binary search for the start of a real function range.
+  auto *FuncRange = findFuncRangeForOffset(Offset);
+  // Skip external function symbol.
+  if (!FuncRange)
+    return;
+
+  // Set IsFuncEntry to true if there is only one range in the function or the
+  // RangeSymName from ELF is equal to its DWARF-based function name.
+  if (FuncRange->Func->Ranges.size() == 1 ||
+      (!FuncRange->IsFuncEntry && FuncRange->getFuncName() == RangeSymName))
+    FuncRange->IsFuncEntry = true;
+}
+
+bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
+                                        SectionSymbolsTy &Symbols,
+                                        const SectionRef &Section) {
+  std::size_t SE = Symbols.size();
+  uint64_t SectionOffset = Section.getAddress() - getPreferredBaseAddress();
+  uint64_t SectSize = Section.getSize();
+  uint64_t StartOffset = Symbols[SI].Addr - getPreferredBaseAddress();
+  uint64_t NextStartOffset =
+      (SI + 1 < SE) ? Symbols[SI + 1].Addr - getPreferredBaseAddress()
+                    : SectionOffset + SectSize;
+  setIsFuncEntry(StartOffset,
+                 FunctionSamples::getCanonicalFnName(Symbols[SI].Name));
+
+  StringRef SymbolName =
+      ShowCanonicalFnName
+          ?
FunctionSamples::getCanonicalFnName(Symbols[SI].Name) + : Symbols[SI].Name; + bool ShowDisassembly = + ShowDisassemblyOnly && (DisassembleFunctionSet.empty() || + DisassembleFunctionSet.count(SymbolName)); + if (ShowDisassembly) + outs() << '<' << SymbolName << ">:\n"; + + auto WarnInvalidInsts = [](uint64_t Start, uint64_t End) { + WithColor::warning() << "Invalid instructions at " + << format("%8" PRIx64, Start) << " - " + << format("%8" PRIx64, End) << "\n"; + }; + + uint64_t Offset = StartOffset; + // Size of a consecutive invalid instruction range starting from Offset -1 + // backwards. + uint64_t InvalidInstLength = 0; + while (Offset < NextStartOffset) { + MCInst Inst; + uint64_t Size; + // Disassemble an instruction. + bool Disassembled = + DisAsm->getInstruction(Inst, Size, Bytes.slice(Offset - SectionOffset), + Offset + getPreferredBaseAddress(), nulls()); + if (Size == 0) + Size = 1; + + if (ShowDisassembly) { + if (ShowPseudoProbe) { + ProbeDecoder.printProbeForAddress(outs(), + Offset + getPreferredBaseAddress()); + } + outs() << format("%8" PRIx64 ":", Offset + getPreferredBaseAddress()); + size_t Start = outs().tell(); + if (Disassembled) + IPrinter->printInst(&Inst, Offset + Size, "", *STI.get(), outs()); + else + outs() << "\t<unknown>"; + if (ShowSourceLocations) { + unsigned Cur = outs().tell() - Start; + if (Cur < 40) + outs().indent(40 - Cur); + InstructionPointer IP(this, Offset); + outs() << getReversedLocWithContext( + symbolize(IP, ShowCanonicalFnName, ShowPseudoProbe)); + } + outs() << "\n"; + } + + if (Disassembled) { + const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode()); + + // Record instruction size. + Offset2InstSizeMap[Offset] = Size; + + // Populate address maps. + CodeAddrOffsets.push_back(Offset); + if (MCDesc.isCall()) + CallOffsets.insert(Offset); + else if (MCDesc.isReturn()) + RetOffsets.insert(Offset); + else if (MCDesc.isBranch()) + BranchOffsets.insert(Offset); + + if (InvalidInstLength) { + WarnInvalidInsts(Offset - InvalidInstLength, Offset - 1); + InvalidInstLength = 0; + } + } else { + InvalidInstLength += Size; + } + + Offset += Size; + } + + if (InvalidInstLength) + WarnInvalidInsts(Offset - InvalidInstLength, Offset - 1); + + if (ShowDisassembly) + outs() << "\n"; + + return true; +} + +void ProfiledBinary::setUpDisassembler(const ELFObjectFileBase *Obj) { + const Target *TheTarget = getTarget(Obj); + std::string TripleName = TheTriple.getTriple(); + StringRef FileName = Obj->getFileName(); + + MRI.reset(TheTarget->createMCRegInfo(TripleName)); + if (!MRI) + exitWithError("no register info for target " + TripleName, FileName); + + MCTargetOptions MCOptions; + AsmInfo.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + if (!AsmInfo) + exitWithError("no assembly info for target " + TripleName, FileName); + + SubtargetFeatures Features = Obj->getFeatures(); + STI.reset( + TheTarget->createMCSubtargetInfo(TripleName, "", Features.getString())); + if (!STI) + exitWithError("no subtarget info for target " + TripleName, FileName); + + MII.reset(TheTarget->createMCInstrInfo()); + if (!MII) + exitWithError("no instruction info for target " + TripleName, FileName); + + MCContext Ctx(Triple(TripleName), AsmInfo.get(), MRI.get(), STI.get()); + std::unique_ptr<MCObjectFileInfo> MOFI( + TheTarget->createMCObjectFileInfo(Ctx, /*PIC=*/false)); + Ctx.setObjectFileInfo(MOFI.get()); + DisAsm.reset(TheTarget->createMCDisassembler(*STI, Ctx)); + if (!DisAsm) + exitWithError("no disassembler for target " + TripleName, FileName); + + 
MIA.reset(TheTarget->createMCInstrAnalysis(MII.get()));
+
+  int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
+  IPrinter.reset(TheTarget->createMCInstPrinter(
+      Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI));
+  IPrinter->setPrintBranchImmAsAddress(true);
+}
+
+void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
+  // Set up disassembler and related components.
+  setUpDisassembler(Obj);
+
+  // Create a mapping from virtual address to symbol name. The symbols in text
+  // sections are the candidates to disassemble.
+  std::map<SectionRef, SectionSymbolsTy> AllSymbols;
+  StringRef FileName = Obj->getFileName();
+  for (const SymbolRef &Symbol : Obj->symbols()) {
+    const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName);
+    const StringRef Name = unwrapOrError(Symbol.getName(), FileName);
+    section_iterator SecI = unwrapOrError(Symbol.getSection(), FileName);
+    if (SecI != Obj->section_end())
+      AllSymbols[*SecI].push_back(SymbolInfoTy(Addr, Name, ELF::STT_NOTYPE));
+  }
+
+  // Sort all the symbols. Use a stable sort to stabilize the output.
+  for (std::pair<const SectionRef, SectionSymbolsTy> &SecSyms : AllSymbols)
+    stable_sort(SecSyms.second);
+
+  DisassembleFunctionSet.insert(DisassembleFunctions.begin(),
+                                DisassembleFunctions.end());
+  assert((DisassembleFunctionSet.empty() || ShowDisassemblyOnly) &&
+         "Functions to disassemble should be only specified together with "
+         "--show-disassembly-only");
+
+  if (ShowDisassemblyOnly)
+    outs() << "\nDisassembly of " << FileName << ":\n";
+
+  // Disassemble a text section.
+  for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end();
+       SI != SE; ++SI) {
+    const SectionRef &Section = *SI;
+    if (!Section.isText())
+      continue;
+
+    uint64_t ImageLoadAddr = getPreferredBaseAddress();
+    uint64_t SectionOffset = Section.getAddress() - ImageLoadAddr;
+    uint64_t SectSize = Section.getSize();
+    if (!SectSize)
+      continue;
+
+    // Register the text section.
+    TextSections.insert({SectionOffset, SectSize});
+
+    StringRef SectionName = unwrapOrError(Section.getName(), FileName);
+
+    if (ShowDisassemblyOnly) {
+      outs() << "\nDisassembly of section " << SectionName;
+      outs() << " [" << format("0x%" PRIx64, Section.getAddress()) << ", "
+             << format("0x%" PRIx64, Section.getAddress() + SectSize)
+             << "]:\n\n";
+    }
+
+    if (SectionName == ".plt")
+      continue;
+
+    // Get the section data.
+    ArrayRef<uint8_t> Bytes =
+        arrayRefFromStringRef(unwrapOrError(Section.getContents(), FileName));
+
+    // Get the list of all the symbols in this section.
+    SectionSymbolsTy &Symbols = AllSymbols[Section];
+
+    // Disassemble symbol by symbol.
+    for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) {
+      if (!dissassembleSymbol(SI, Bytes, Symbols, Section))
+        exitWithError("disassembling error", FileName);
+    }
+  }
+
+  // Disassemble the rodata section to check if the FS discriminator symbol
+  // exists.
+  checkUseFSDiscriminator(Obj, AllSymbols);
+}
+
+void ProfiledBinary::checkUseFSDiscriminator(
+    const ELFObjectFileBase *Obj,
+    std::map<SectionRef, SectionSymbolsTy> &AllSymbols) {
+  const char *FSDiscriminatorVar = "__llvm_fs_discriminator__";
+  for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end();
+       SI != SE; ++SI) {
+    const SectionRef &Section = *SI;
+    if (!Section.isData() || Section.getSize() == 0)
+      continue;
+    SectionSymbolsTy &Symbols = AllSymbols[Section];
+
+    for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) {
+      if (Symbols[SI].Name == FSDiscriminatorVar) {
+        UseFSDiscriminator = true;
+        return;
+      }
+    }
+  }
+}
+
+void ProfiledBinary::loadSymbolsFromDWARF(ObjectFile &Obj) {
+  auto DebugContext = llvm::DWARFContext::create(Obj);
+  if (!DebugContext)
+    exitWithError("Missing debug info.", Path);
+
+  for (const auto &CompilationUnit : DebugContext->compile_units()) {
+    for (const auto &DieInfo : CompilationUnit->dies()) {
+      llvm::DWARFDie Die(CompilationUnit.get(), &DieInfo);
+
+      if (!Die.isSubprogramDIE())
+        continue;
+      auto Name = Die.getName(llvm::DINameKind::LinkageName);
+      if (!Name)
+        Name = Die.getName(llvm::DINameKind::ShortName);
+      if (!Name)
+        continue;
+
+      auto RangesOrError = Die.getAddressRanges();
+      if (!RangesOrError)
+        continue;
+      const DWARFAddressRangesVector &Ranges = RangesOrError.get();
+
+      if (Ranges.empty())
+        continue;
+
+      // Different DWARF symbols can have the same function name, search or
+      // create a BinaryFunction indexed by the name.
+      auto Ret = BinaryFunctions.emplace(Name, BinaryFunction());
+      auto &Func = Ret.first->second;
+      if (Ret.second)
+        Func.FuncName = Ret.first->first;
+
+      for (const auto &Range : Ranges) {
+        uint64_t FuncStart = Range.LowPC;
+        uint64_t FuncSize = Range.HighPC - FuncStart;
+
+        if (FuncSize == 0 || FuncStart < getPreferredBaseAddress())
+          continue;
+
+        uint64_t StartOffset = FuncStart - getPreferredBaseAddress();
+        uint64_t EndOffset = Range.HighPC - getPreferredBaseAddress();
+
+        // We may want to know all ranges for one function. Here group the
+        // ranges and store them into BinaryFunction.
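+        // For example (hypothetical), with hot/cold splitting a subprogram's
+        // DWARF entry can carry several disjoint address ranges for one
+        // function name; all of them end up in the same BinaryFunction here.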
+ Func.Ranges.emplace_back(StartOffset, EndOffset); + + auto R = StartOffset2FuncRangeMap.emplace(StartOffset, FuncRange()); + if (R.second) { + FuncRange &FRange = R.first->second; + FRange.Func = &Func; + FRange.StartOffset = StartOffset; + FRange.EndOffset = EndOffset; + } else { + WithColor::warning() + << "Duplicated symbol start address at " + << format("%8" PRIx64, StartOffset + getPreferredBaseAddress()) + << " " << R.first->second.getFuncName() << " and " << Name + << "\n"; + } + } + } + } + assert(!StartOffset2FuncRangeMap.empty() && "Misssing debug info."); +} + +void ProfiledBinary::populateSymbolListFromDWARF( + ProfileSymbolList &SymbolList) { + for (auto &I : StartOffset2FuncRangeMap) + SymbolList.add(I.second.getFuncName()); +} + +void ProfiledBinary::setupSymbolizer() { + symbolize::LLVMSymbolizer::Options SymbolizerOpts; + SymbolizerOpts.PrintFunctions = + DILineInfoSpecifier::FunctionNameKind::LinkageName; + SymbolizerOpts.Demangle = false; + SymbolizerOpts.DefaultArch = TheTriple.getArchName().str(); + SymbolizerOpts.UseSymbolTable = false; + SymbolizerOpts.RelativeAddresses = false; + Symbolizer = std::make_unique<symbolize::LLVMSymbolizer>(SymbolizerOpts); +} + +SampleContextFrameVector ProfiledBinary::symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName, + bool UseProbeDiscriminator) { + assert(this == IP.Binary && + "Binary should only symbolize its own instruction"); + auto Addr = object::SectionedAddress{IP.Offset + getPreferredBaseAddress(), + object::SectionedAddress::UndefSection}; + DIInliningInfo InlineStack = unwrapOrError( + Symbolizer->symbolizeInlinedCode(SymbolizerPath.str(), Addr), + SymbolizerPath); + + SampleContextFrameVector CallStack; + for (int32_t I = InlineStack.getNumberOfFrames() - 1; I >= 0; I--) { + const auto &CallerFrame = InlineStack.getFrame(I); + if (CallerFrame.FunctionName == "<invalid>") + break; + + StringRef FunctionName(CallerFrame.FunctionName); + if (UseCanonicalFnName) + FunctionName = FunctionSamples::getCanonicalFnName(FunctionName); + + uint32_t Discriminator = CallerFrame.Discriminator; + uint32_t LineOffset = (CallerFrame.Line - CallerFrame.StartLine) & 0xffff; + if (UseProbeDiscriminator) { + LineOffset = + PseudoProbeDwarfDiscriminator::extractProbeIndex(Discriminator); + Discriminator = 0; + } + + LineLocation Line(LineOffset, Discriminator); + auto It = NameStrings.insert(FunctionName.str()); + CallStack.emplace_back(*It.first, Line); + } + + return CallStack; +} + +void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t StartOffset, + uint64_t EndOffset) { + uint64_t RangeBegin = offsetToVirtualAddr(StartOffset); + uint64_t RangeEnd = offsetToVirtualAddr(EndOffset); + InstructionPointer IP(this, RangeBegin, true); + + if (IP.Address != RangeBegin) + WithColor::warning() << "Invalid start instruction at " + << format("%8" PRIx64, RangeBegin) << "\n"; + + if (IP.Address >= RangeEnd) + return; + + do { + uint64_t Offset = virtualAddrToOffset(IP.Address); + const SampleContextFrameVector &SymbolizedCallStack = + getFrameLocationStack(Offset, UsePseudoProbes); + uint64_t Size = Offset2InstSizeMap[Offset]; + + // Record instruction size for the corresponding context + FuncSizeTracker.addInstructionForContext(SymbolizedCallStack, Size); + + } while (IP.advance() && IP.Address < RangeEnd); +} + +InstructionPointer::InstructionPointer(const ProfiledBinary *Binary, + uint64_t Address, bool RoundToNext) + : Binary(Binary), Address(Address) { + Index = Binary->getIndexForAddr(Address); + if (RoundToNext) { + 
// we might get address which is not the code + // it should round to the next valid address + if (Index >= Binary->getCodeOffsetsSize()) + this->Address = UINT64_MAX; + else + this->Address = Binary->getAddressforIndex(Index); + } +} + +bool InstructionPointer::advance() { + Index++; + if (Index >= Binary->getCodeOffsetsSize()) { + Address = UINT64_MAX; + return false; + } + Address = Binary->getAddressforIndex(Index); + return true; +} + +bool InstructionPointer::backward() { + if (Index == 0) { + Address = 0; + return false; + } + Index--; + Address = Binary->getAddressforIndex(Index); + return true; +} + +void InstructionPointer::update(uint64_t Addr) { + Address = Addr; + Index = Binary->getIndexForAddr(Address); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/contrib/libs/llvm14/tools/llvm-profgen/ProfiledBinary.h b/contrib/libs/llvm14/tools/llvm-profgen/ProfiledBinary.h new file mode 100644 index 0000000000..d3d1c6f1fd --- /dev/null +++ b/contrib/libs/llvm14/tools/llvm-profgen/ProfiledBinary.h @@ -0,0 +1,541 @@ +//===-- ProfiledBinary.h - Binary decoder -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H +#define LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H + +#include "CallContext.h" +#include "ErrorHandling.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCPseudoProbe.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Path.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" +#include <list> +#include <map> +#include <set> +#include <sstream> +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <vector> + +extern cl::opt<bool> EnableCSPreInliner; +extern cl::opt<bool> UseContextCostForPreInliner; + +using namespace llvm; +using namespace sampleprof; +using namespace llvm::object; + +namespace llvm { +namespace sampleprof { + +class ProfiledBinary; + +struct InstructionPointer { + const ProfiledBinary *Binary; + union { + // Offset of the executable segment of the binary. + uint64_t Offset = 0; + // Also used as address in unwinder + uint64_t Address; + }; + // Index to the sorted code address array of the binary. + uint64_t Index = 0; + InstructionPointer(const ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext = false); + bool advance(); + bool backward(); + void update(uint64_t Addr); +}; + +// The special frame addresses. +enum SpecialFrameAddr { + // Dummy root of frame trie. + DummyRoot = 0, + // Represent all the addresses outside of current binary. 
+  // This is also used to indicate that the call stack should be truncated,
+  // since this isn't a real call context the compiler will see.
+  ExternalAddr = 1,
+};
+
+using RangesTy = std::vector<std::pair<uint64_t, uint64_t>>;
+
+struct BinaryFunction {
+  StringRef FuncName;
+  // End of range is an exclusive bound.
+  RangesTy Ranges;
+
+  uint64_t getFuncSize() {
+    uint64_t Sum = 0;
+    for (auto &R : Ranges) {
+      Sum += R.second - R.first;
+    }
+    return Sum;
+  }
+};
+
+// Info about function range. A function can be split into multiple
+// non-continuous ranges, each range corresponds to one FuncRange.
+struct FuncRange {
+  uint64_t StartOffset;
+  // EndOffset is an exclusive bound.
+  uint64_t EndOffset;
+  // Function the range belongs to
+  BinaryFunction *Func;
+  // Whether the start offset is the real entry of the function.
+  bool IsFuncEntry = false;
+
+  StringRef getFuncName() { return Func->FuncName; }
+};
+
+// PrologEpilog offset tracker, used to filter out broken stack samples.
+// Currently we use a heuristic size (two) to infer prolog and epilog
+// based on the start address and return address. In the future,
+// we will switch to a Dwarf CFI based tracker.
+struct PrologEpilogTracker {
+  // A set of prolog and epilog offsets. Used by virtual unwinding.
+  std::unordered_set<uint64_t> PrologEpilogSet;
+  ProfiledBinary *Binary;
+  PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){};
+
+  // Take the two addresses from the start of function as prolog
+  void inferPrologOffsets(std::map<uint64_t, FuncRange> &FuncStartOffsetMap) {
+    for (auto I : FuncStartOffsetMap) {
+      PrologEpilogSet.insert(I.first);
+      InstructionPointer IP(Binary, I.first);
+      if (!IP.advance())
+        break;
+      PrologEpilogSet.insert(IP.Offset);
+    }
+  }
+
+  // Take the last two addresses before the return address as epilog
+  void inferEpilogOffsets(std::unordered_set<uint64_t> &RetAddrs) {
+    for (auto Addr : RetAddrs) {
+      PrologEpilogSet.insert(Addr);
+      InstructionPointer IP(Binary, Addr);
+      if (!IP.backward())
+        break;
+      PrologEpilogSet.insert(IP.Offset);
+    }
+  }
+};
+
+// Track function byte size under different contexts (the outlined version as
+// well as various inlined versions). It also provides query support to get
+// function size with the best matching context, which is used to help the
+// pre-inliner use accurate post-optimization size to make decisions.
+// TODO: If an inlinee is completely optimized away, ideally we should have
+// zero for its context size, but currently we would miss such contexts since
+// they don't have instructions. To fix this, we need to mark all inlinees
+// with an entry probe but without instructions as having zero size.
+class BinarySizeContextTracker {
+public:
+  // Add instruction with given size to a context
+  void addInstructionForContext(const SampleContextFrameVector &Context,
+                                uint32_t InstrSize);
+
+  // Get function size with a specific context. When there's no exact match
+  // for the given context, try to retrieve the size of that function from
+  // the closest matching context.
+  uint32_t getFuncSizeForContext(const SampleContext &Context);
+
+  // For inlinees that are fully optimized away, we can establish zero size
+  // using their remaining probes.
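+  // For example (hypothetical), if bar was inlined into foo and then its
+  // body was optimized away completely, bar's surviving entry probe lets us
+  // record an explicit zero size for the context "foo @ bar" instead of
+  // having no size at all.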
+ void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder); + + void dump() { RootContext.dumpTree(); } + +private: + using ProbeFrameStack = SmallVector<std::pair<StringRef, uint32_t>>; + void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, + ProbeFrameStack &Context); + + // Root node for context trie tree, node that this is a reverse context trie + // with callee as parent and caller as child. This way we can traverse from + // root to find the best/longest matching context if an exact match does not + // exist. It gives us the best possible estimate for function's post-inline, + // post-optimization byte size. + ContextTrieNode RootContext; +}; + +using OffsetRange = std::pair<uint64_t, uint64_t>; + +class ProfiledBinary { + // Absolute path of the executable binary. + std::string Path; + // Path of the debug info binary. + std::string DebugBinaryPath; + // Path of symbolizer path which should be pointed to binary with debug info. + StringRef SymbolizerPath; + // The target triple. + Triple TheTriple; + // The runtime base address that the first executable segment is loaded at. + uint64_t BaseAddress = 0; + // The runtime base address that the first loadabe segment is loaded at. + uint64_t FirstLoadableAddress = 0; + // The preferred load address of each executable segment. + std::vector<uint64_t> PreferredTextSegmentAddresses; + // The file offset of each executable segment. + std::vector<uint64_t> TextSegmentOffsets; + + // Mutiple MC component info + std::unique_ptr<const MCRegisterInfo> MRI; + std::unique_ptr<const MCAsmInfo> AsmInfo; + std::unique_ptr<const MCSubtargetInfo> STI; + std::unique_ptr<const MCInstrInfo> MII; + std::unique_ptr<MCDisassembler> DisAsm; + std::unique_ptr<const MCInstrAnalysis> MIA; + std::unique_ptr<MCInstPrinter> IPrinter; + // A list of text sections sorted by start RVA and size. Used to check + // if a given RVA is a valid code address. + std::set<std::pair<uint64_t, uint64_t>> TextSections; + + // A map of mapping function name to BinaryFunction info. + std::unordered_map<std::string, BinaryFunction> BinaryFunctions; + + // An ordered map of mapping function's start offset to function range + // relevant info. Currently to determine if the offset of ELF is the start of + // a real function, we leverage the function range info from DWARF. + std::map<uint64_t, FuncRange> StartOffset2FuncRangeMap; + + // Offset to context location map. Used to expand the context. + std::unordered_map<uint64_t, SampleContextFrameVector> Offset2LocStackMap; + + // Offset to instruction size map. Also used for quick offset lookup. + std::unordered_map<uint64_t, uint64_t> Offset2InstSizeMap; + + // An array of offsets of all instructions sorted in increasing order. The + // sorting is needed to fast advance to the next forward/backward instruction. + std::vector<uint64_t> CodeAddrOffsets; + // A set of call instruction offsets. Used by virtual unwinding. + std::unordered_set<uint64_t> CallOffsets; + // A set of return instruction offsets. Used by virtual unwinding. + std::unordered_set<uint64_t> RetOffsets; + // A set of branch instruction offsets. + std::unordered_set<uint64_t> BranchOffsets; + + // Estimate and track function prolog and epilog ranges. + PrologEpilogTracker ProEpilogTracker; + + // Track function sizes under different context + BinarySizeContextTracker FuncSizeTracker; + + // The symbolizer used to get inline context for an instruction. 
+  std::unique_ptr<symbolize::LLVMSymbolizer> Symbolizer;
+
+  // String table owning function name strings created from the symbolizer.
+  std::unordered_set<std::string> NameStrings;
+
+  // A collection of functions to print disassembly for.
+  StringSet<> DisassembleFunctionSet;
+
+  // Pseudo probe decoder.
+  MCPseudoProbeDecoder ProbeDecoder;
+
+  bool UsePseudoProbes = false;
+
+  bool UseFSDiscriminator = false;
+
+  // Whether we need to symbolize all instructions to get function context
+  // size.
+  bool TrackFuncContextSize = false;
+
+  // Indicates whether the base loading address was parsed from an mmap event
+  // or falls back to the preferred address.
+  bool IsLoadedByMMap = false;
+  // Used to avoid redundant warnings.
+  bool MissingMMapWarned = false;
+
+  void setPreferredTextSegmentAddresses(const ELFObjectFileBase *O);
+
+  template <class ELFT>
+  void setPreferredTextSegmentAddresses(const ELFFile<ELFT> &Obj,
+                                        StringRef FileName);
+
+  void decodePseudoProbe(const ELFObjectFileBase *Obj);
+
+  void
+  checkUseFSDiscriminator(const ELFObjectFileBase *Obj,
+                          std::map<SectionRef, SectionSymbolsTy> &AllSymbols);
+
+  // Set up the disassembler and related components.
+  void setUpDisassembler(const ELFObjectFileBase *Obj);
+  void setupSymbolizer();
+
+  // Load debug info of subprograms from the DWARF section.
+  void loadSymbolsFromDWARF(ObjectFile &Obj);
+
+  // A function may be split into multiple non-contiguous address ranges. We
+  // use this to mark whether the start offset of a function is its real
+  // entry, and to mark non-function labels as false.
+  void setIsFuncEntry(uint64_t Offset, StringRef RangeSymName);
+
+  // Warn if no entry range exists in the function.
+  void warnNoFuncEntry();
+
+  /// Disassemble the text section and build various address maps.
+  void disassemble(const ELFObjectFileBase *O);
+
+  /// Helper function to disassemble a symbol and extract info for unwinding.
+  bool dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
+                          SectionSymbolsTy &Symbols, const SectionRef &Section);
+  /// Symbolize a given instruction pointer and return a full call context.
+  SampleContextFrameVector symbolize(const InstructionPointer &IP,
+                                     bool UseCanonicalFnName = false,
+                                     bool UseProbeDiscriminator = false);
+  /// Decode the interesting parts of the binary and build internal data
+  /// structures. At a high level, the parts of interest are:
+  /// 1. Text sections, including the main code section and the PLT
+  /// entries that will be used to handle cross-module call transitions.
+  /// 2. The .debug_line section, used by DWARF-based profile generation.
+  /// 3. Pseudo probe related sections, used by probe-based profile
+  /// generation.
+  void load();
+
+public:
+  ProfiledBinary(const StringRef ExeBinPath, const StringRef DebugBinPath)
+      : Path(ExeBinPath), DebugBinaryPath(DebugBinPath), ProEpilogTracker(this),
+        TrackFuncContextSize(EnableCSPreInliner &&
+                             UseContextCostForPreInliner) {
+    // Point to the executable binary if a debug info binary is not specified.
+    SymbolizerPath = DebugBinPath.empty() ? ExeBinPath : DebugBinPath;
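+    // Note: setupSymbolizer() must precede load(), since disassembly inside
+    // load() symbolizes instructions when TrackFuncContextSize is enabled.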
+    setupSymbolizer();
+    load();
+  }
+  uint64_t virtualAddrToOffset(uint64_t VirtualAddress) const {
+    return VirtualAddress - BaseAddress;
+  }
+  uint64_t offsetToVirtualAddr(uint64_t Offset) const {
+    return Offset + BaseAddress;
+  }
+  StringRef getPath() const { return Path; }
+  StringRef getName() const { return llvm::sys::path::filename(Path); }
+  uint64_t getBaseAddress() const { return BaseAddress; }
+  void setBaseAddress(uint64_t Address) { BaseAddress = Address; }
+
+  // Return the preferred load address for the first executable segment.
+  uint64_t getPreferredBaseAddress() const {
+    return PreferredTextSegmentAddresses[0];
+  }
+  // Return the preferred load address for the first loadable segment.
+  uint64_t getFirstLoadableAddress() const { return FirstLoadableAddress; }
+  // Return the file offset for the first executable segment.
+  uint64_t getTextSegmentOffset() const { return TextSegmentOffsets[0]; }
+  const std::vector<uint64_t> &getPreferredTextSegmentAddresses() const {
+    return PreferredTextSegmentAddresses;
+  }
+  const std::vector<uint64_t> &getTextSegmentOffsets() const {
+    return TextSegmentOffsets;
+  }
+
+  uint64_t getInstSize(uint64_t Offset) const {
+    auto I = Offset2InstSizeMap.find(Offset);
+    if (I == Offset2InstSizeMap.end())
+      return 0;
+    return I->second;
+  }
+
+  bool offsetIsCode(uint64_t Offset) const {
+    return Offset2InstSizeMap.find(Offset) != Offset2InstSizeMap.end();
+  }
+  bool addressIsCode(uint64_t Address) const {
+    uint64_t Offset = virtualAddrToOffset(Address);
+    return offsetIsCode(Offset);
+  }
+  bool addressIsCall(uint64_t Address) const {
+    uint64_t Offset = virtualAddrToOffset(Address);
+    return CallOffsets.count(Offset);
+  }
+  bool addressIsReturn(uint64_t Address) const {
+    uint64_t Offset = virtualAddrToOffset(Address);
+    return RetOffsets.count(Offset);
+  }
+  bool addressInPrologEpilog(uint64_t Address) const {
+    uint64_t Offset = virtualAddrToOffset(Address);
+    return ProEpilogTracker.PrologEpilogSet.count(Offset);
+  }
+
+  bool offsetIsTransfer(uint64_t Offset) {
+    return BranchOffsets.count(Offset) || RetOffsets.count(Offset) ||
+           CallOffsets.count(Offset);
+  }
+
+  uint64_t getAddressforIndex(uint64_t Index) const {
+    return offsetToVirtualAddr(CodeAddrOffsets[Index]);
+  }
+
+  size_t getCodeOffsetsSize() const { return CodeAddrOffsets.size(); }
+
+  bool usePseudoProbes() const { return UsePseudoProbes; }
+  bool useFSDiscriminator() const { return UseFSDiscriminator; }
+  // Get the index in CodeAddrOffsets for the given offset. The input might
+  // not be a valid code address, in which case lower_bound rounds it up to
+  // the next valid code address.
+  uint32_t getIndexForOffset(uint64_t Offset) const {
+    auto Low = llvm::lower_bound(CodeAddrOffsets, Offset);
+    return Low - CodeAddrOffsets.begin();
+  }
+  uint32_t getIndexForAddr(uint64_t Address) const {
+    uint64_t Offset = virtualAddrToOffset(Address);
+    return getIndexForOffset(Offset);
+  }
+
+  uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const {
+    if (FrameAddr == ExternalAddr)
+      return ExternalAddr;
+    auto I = getIndexForAddr(FrameAddr);
+    FrameAddr = I ? getAddressforIndex(I - 1) : 0;
+    if (FrameAddr && addressIsCall(FrameAddr))
+      return FrameAddr;
+    return 0;
+  }
+
+  FuncRange *findFuncRangeForStartOffset(uint64_t Offset) {
+    auto I = StartOffset2FuncRangeMap.find(Offset);
+    if (I == StartOffset2FuncRangeMap.end())
+      return nullptr;
+    return &I->second;
+  }
+
+  // Binary search for the function range that contains the given offset.
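+  // E.g. (illustrative), with ranges starting at {0x10, 0x40, 0x80} and
+  // Offset = 0x52, upper_bound points at the 0x80 entry; stepping back one
+  // yields the range starting at 0x40, which is returned only if 0x52 lies
+  // below that range's EndOffset.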
+  FuncRange *findFuncRangeForOffset(uint64_t Offset) {
+    auto I = StartOffset2FuncRangeMap.upper_bound(Offset);
+    if (I == StartOffset2FuncRangeMap.begin())
+      return nullptr;
+    I--;
+
+    if (Offset >= I->second.EndOffset)
+      return nullptr;
+
+    return &I->second;
+  }
+
+  // Get all ranges of one function.
+  RangesTy getRangesForOffset(uint64_t Offset) {
+    auto *FRange = findFuncRangeForOffset(Offset);
+    // Ignore ranges that fall into the PLT section or a system library.
+    if (!FRange)
+      return RangesTy();
+
+    return FRange->Func->Ranges;
+  }
+
+  const std::unordered_map<std::string, BinaryFunction> &
+  getAllBinaryFunctions() {
+    return BinaryFunctions;
+  }
+
+  BinaryFunction *getBinaryFunction(StringRef FName) {
+    auto I = BinaryFunctions.find(FName.str());
+    if (I == BinaryFunctions.end())
+      return nullptr;
+    return &I->second;
+  }
+
+  uint32_t getFuncSizeForContext(SampleContext &Context) {
+    return FuncSizeTracker.getFuncSizeForContext(Context);
+  }
+
+  // Load the symbols from the debug table and populate them into the symbol
+  // list.
+  void populateSymbolListFromDWARF(ProfileSymbolList &SymbolList);
+
+  const SampleContextFrameVector &
+  getFrameLocationStack(uint64_t Offset, bool UseProbeDiscriminator = false) {
+    auto I = Offset2LocStackMap.emplace(Offset, SampleContextFrameVector());
+    if (I.second) {
+      InstructionPointer IP(this, Offset);
+      I.first->second = symbolize(IP, true, UseProbeDiscriminator);
+    }
+    return I.first->second;
+  }
+
+  Optional<SampleContextFrame> getInlineLeafFrameLoc(uint64_t Offset) {
+    const auto &Stack = getFrameLocationStack(Offset);
+    if (Stack.empty())
+      return {};
+    return Stack.back();
+  }
+
+  // Compare the inline contexts of two addresses.
+  bool inlineContextEqual(uint64_t Add1, uint64_t Add2);
+
+  // Get the full context of the current stack with the inline context filled
+  // in. It searches the disassembly info stored in Offset2LocStackMap and is
+  // used as the key of the function sample map.
+  SampleContextFrameVector
+  getExpandedContext(const SmallVectorImpl<uint64_t> &Stack,
+                     bool &WasLeafInlined);
+  // Go through the instructions in the given range and record their sizes
+  // for the inline context.
+  void computeInlinedContextSizeForRange(uint64_t StartOffset,
+                                         uint64_t EndOffset);
+
+  const MCDecodedPseudoProbe *getCallProbeForAddr(uint64_t Address) const {
+    return ProbeDecoder.getCallProbeForAddr(Address);
+  }
+
+  void getInlineContextForProbe(const MCDecodedPseudoProbe *Probe,
+                                SampleContextFrameVector &InlineContextStack,
+                                bool IncludeLeaf = false) const {
+    SmallVector<MCPseduoProbeFrameLocation, 16> ProbeInlineContext;
+    ProbeDecoder.getInlineContextForProbe(Probe, ProbeInlineContext,
+                                          IncludeLeaf);
+    for (uint32_t I = 0; I < ProbeInlineContext.size(); I++) {
+      auto &Callsite = ProbeInlineContext[I];
+      // Clear the current context for an unknown probe.
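+      // A zero line offset in a non-leaf frame means the inline context is
+      // broken at this frame, so everything collected so far is dropped and
+      // collection restarts from the next frame.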
+      if (Callsite.second == 0 && I != ProbeInlineContext.size() - 1) {
+        InlineContextStack.clear();
+        continue;
+      }
+      InlineContextStack.emplace_back(Callsite.first,
+                                      LineLocation(Callsite.second, 0));
+    }
+  }
+  const AddressProbesMap &getAddress2ProbesMap() const {
+    return ProbeDecoder.getAddress2ProbesMap();
+  }
+  const MCPseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) {
+    return ProbeDecoder.getFuncDescForGUID(GUID);
+  }
+
+  const MCPseudoProbeFuncDesc *
+  getInlinerDescForProbe(const MCDecodedPseudoProbe *Probe) {
+    return ProbeDecoder.getInlinerDescForProbe(Probe);
+  }
+
+  bool getTrackFuncContextSize() { return TrackFuncContextSize; }
+
+  bool getIsLoadedByMMap() { return IsLoadedByMMap; }
+
+  void setIsLoadedByMMap(bool Value) { IsLoadedByMMap = Value; }
+
+  bool getMissingMMapWarned() { return MissingMMapWarned; }
+
+  void setMissingMMapWarned(bool Value) { MissingMMapWarned = Value; }
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/contrib/libs/llvm14/tools/llvm-profgen/llvm-profgen.cpp b/contrib/libs/llvm14/tools/llvm-profgen/llvm-profgen.cpp
new file mode 100644
index 0000000000..f092df04d5
--- /dev/null
+++ b/contrib/libs/llvm14/tools/llvm-profgen/llvm-profgen.cpp
@@ -0,0 +1,164 @@
+//===- llvm-profgen.cpp - LLVM SPGO profile generation tool -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-profgen generates SPGO profiles from perf script output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ErrorHandling.h"
+#include "PerfReader.h"
+#include "ProfileGenerator.h"
+#include "ProfiledBinary.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/TargetSelect.h"
+
+static cl::OptionCategory ProfGenCategory("ProfGen Options");
+
+static cl::opt<std::string> PerfScriptFilename(
+    "perfscript", cl::value_desc("perfscript"), cl::ZeroOrMore,
+    llvm::cl::MiscFlags::CommaSeparated,
+    cl::desc("Path of perf-script trace created by Linux perf tool with "
+             "`script` command (the raw perf.data should be profiled with "
+             "-b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PSA("ps", cl::desc("Alias for --perfscript"),
+                     cl::aliasopt(PerfScriptFilename));
+
+static cl::opt<std::string> PerfDataFilename(
+    "perfdata", cl::value_desc("perfdata"), cl::ZeroOrMore,
+    llvm::cl::MiscFlags::CommaSeparated,
+    cl::desc("Path of raw perf data created by Linux perf tool (it should be "
+             "profiled with -b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PDA("pd", cl::desc("Alias for --perfdata"),
+                     cl::aliasopt(PerfDataFilename));
+
+static cl::opt<std::string> UnsymbolizedProfFilename(
+    "unsymbolized-profile", cl::value_desc("unsymbolized profile"),
+    cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
+    cl::desc("Path of the unsymbolized profile created by "
+             "`llvm-profgen` with `--skip-symbolization`"),
+    cl::cat(ProfGenCategory));
+static cl::alias UPA("up", cl::desc("Alias for --unsymbolized-profile"),
+                     cl::aliasopt(UnsymbolizedProfFilename));
+
+static cl::opt<std::string>
+    BinaryPath("binary", cl::value_desc("binary"), cl::Required,
+               cl::desc("Path of profiled executable binary."),
+               cl::cat(ProfGenCategory));
+
+static cl::opt<std::string>
+    DebugBinPath("debug-binary", cl::value_desc("debug-binary"),
+                 cl::ZeroOrMore,
+                 cl::desc("Path of the debug info binary; llvm-profgen will "
+                          "load the DWARF info from it instead of from the "
+                          "executable binary."),
+                 cl::cat(ProfGenCategory));
+
+extern cl::opt<bool> ShowDisassemblyOnly;
+extern cl::opt<bool> ShowSourceLocations;
+extern cl::opt<bool> SkipSymbolization;
+
+using namespace llvm;
+using namespace sampleprof;
+
+// Validate the command line input.
+static void validateCommandLine() {
+  // Allow the perf script to be missing if we are only showing the binary
+  // disassembly.
+  if (!ShowDisassemblyOnly) {
+    // Validate that exactly one input profile is provided.
+    uint16_t HasPerfData = PerfDataFilename.getNumOccurrences();
+    uint16_t HasPerfScript = PerfScriptFilename.getNumOccurrences();
+    uint16_t HasUnsymbolizedProfile =
+        UnsymbolizedProfFilename.getNumOccurrences();
+    uint16_t S = HasPerfData + HasPerfScript + HasUnsymbolizedProfile;
+    if (S != 1) {
+      std::string Msg =
+          S > 1
+              ? "`--perfscript`, `--perfdata` and `--unsymbolized-profile` "
+                "cannot be used together."
+              : "Perf input file is missing, please use one of `--perfscript`, "
+                "`--perfdata` and `--unsymbolized-profile` for the input.";
+      exitWithError(Msg);
+    }
+
+    auto CheckFileExists = [](bool H, StringRef File) {
+      if (H && !llvm::sys::fs::exists(File)) {
+        std::string Msg = "Input perf file (" + File.str() + ") doesn't exist.";
+        exitWithError(Msg);
+      }
+    };
+
+    CheckFileExists(HasPerfData, PerfDataFilename);
+    CheckFileExists(HasPerfScript, PerfScriptFilename);
+    CheckFileExists(HasUnsymbolizedProfile, UnsymbolizedProfFilename);
+  }
+
+  if (!llvm::sys::fs::exists(BinaryPath)) {
+    std::string Msg = "Input binary (" + BinaryPath + ") doesn't exist.";
+    exitWithError(Msg);
+  }
+
+  if (CSProfileGenerator::MaxCompressionSize < -1) {
+    exitWithError("Value of --compress-recursion should be >= -1");
+  }
+  if (ShowSourceLocations && !ShowDisassemblyOnly) {
+    exitWithError("--show-source-locations should work together with "
+                  "--show-disassembly-only!");
+  }
+}
+
+static PerfInputFile getPerfInputFile() {
+  PerfInputFile File;
+  if (PerfDataFilename.getNumOccurrences()) {
+    File.InputFile = PerfDataFilename;
+    File.Format = PerfFormat::PerfData;
+  } else if (PerfScriptFilename.getNumOccurrences()) {
+    File.InputFile = PerfScriptFilename;
+    File.Format = PerfFormat::PerfScript;
+  } else if (UnsymbolizedProfFilename.getNumOccurrences()) {
+    File.InputFile = UnsymbolizedProfFilename;
+    File.Format = PerfFormat::UnsymbolizedProfile;
+  }
+  return File;
+}
+
+int main(int argc, const char *argv[]) {
+  InitLLVM X(argc, argv);
+
+  // Initialize targets and assembly printers/parsers.
+  InitializeAllTargetInfos();
+  InitializeAllTargetMCs();
+  InitializeAllDisassemblers();
+
+  cl::HideUnrelatedOptions({&ProfGenCategory, &getColorCategory()});
+  cl::ParseCommandLineOptions(argc, argv, "llvm SPGO profile generator\n");
+  validateCommandLine();
+
+  // Load symbols and disassemble the code of the given binary.
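+  // Constructing the ProfiledBinary runs the full load pipeline (symbolizer
+  // setup, segment address discovery, disassembly), so with
+  // --show-disassembly-only we can return right after it.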
+  std::unique_ptr<ProfiledBinary> Binary =
+      std::make_unique<ProfiledBinary>(BinaryPath, DebugBinPath);
+  if (ShowDisassemblyOnly)
+    return EXIT_SUCCESS;
+
+  PerfInputFile PerfFile = getPerfInputFile();
+  std::unique_ptr<PerfReaderBase> Reader =
+      PerfReaderBase::create(Binary.get(), PerfFile);
+  // Parse perf events and samples.
+  Reader->parsePerfTraces();
+
+  if (SkipSymbolization)
+    return EXIT_SUCCESS;
+
+  std::unique_ptr<ProfileGeneratorBase> Generator =
+      ProfileGeneratorBase::create(Binary.get(), Reader->getSampleCounters(),
+                                   Reader->profileIsCSFlat());
+  Generator->generateProfile();
+  Generator->write();
+
+  return EXIT_SUCCESS;
+}
diff --git a/contrib/libs/llvm14/tools/llvm-profgen/ya.make b/contrib/libs/llvm14/tools/llvm-profgen/ya.make
new file mode 100644
index 0000000000..c6c45c6e36
--- /dev/null
+++ b/contrib/libs/llvm14/tools/llvm-profgen/ya.make
@@ -0,0 +1,80 @@
+# Generated by devtools/yamaker.
+
+PROGRAM()
+
+LICENSE(Apache-2.0 WITH LLVM-exception)
+
+LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
+
+PEERDIR(
+    contrib/libs/llvm14
+    contrib/libs/llvm14/include
+    contrib/libs/llvm14/lib/Analysis
+    contrib/libs/llvm14/lib/AsmParser
+    contrib/libs/llvm14/lib/BinaryFormat
+    contrib/libs/llvm14/lib/Bitcode/Reader
+    contrib/libs/llvm14/lib/Bitcode/Writer
+    contrib/libs/llvm14/lib/Bitstream/Reader
+    contrib/libs/llvm14/lib/DebugInfo/CodeView
+    contrib/libs/llvm14/lib/DebugInfo/DWARF
+    contrib/libs/llvm14/lib/DebugInfo/MSF
+    contrib/libs/llvm14/lib/DebugInfo/PDB
+    contrib/libs/llvm14/lib/DebugInfo/Symbolize
+    contrib/libs/llvm14/lib/Demangle
+    contrib/libs/llvm14/lib/Frontend/OpenMP
+    contrib/libs/llvm14/lib/IR
+    contrib/libs/llvm14/lib/IRReader
+    contrib/libs/llvm14/lib/Linker
+    contrib/libs/llvm14/lib/MC
+    contrib/libs/llvm14/lib/MC/MCDisassembler
+    contrib/libs/llvm14/lib/MC/MCParser
+    contrib/libs/llvm14/lib/Object
+    contrib/libs/llvm14/lib/ProfileData
+    contrib/libs/llvm14/lib/Remarks
+    contrib/libs/llvm14/lib/Support
+    contrib/libs/llvm14/lib/Target/AArch64/Disassembler
+    contrib/libs/llvm14/lib/Target/AArch64/MCTargetDesc
+    contrib/libs/llvm14/lib/Target/AArch64/TargetInfo
+    contrib/libs/llvm14/lib/Target/AArch64/Utils
+    contrib/libs/llvm14/lib/Target/ARM/Disassembler
+    contrib/libs/llvm14/lib/Target/ARM/MCTargetDesc
+    contrib/libs/llvm14/lib/Target/ARM/TargetInfo
+    contrib/libs/llvm14/lib/Target/ARM/Utils
+    contrib/libs/llvm14/lib/Target/BPF/Disassembler
+    contrib/libs/llvm14/lib/Target/BPF/MCTargetDesc
+    contrib/libs/llvm14/lib/Target/BPF/TargetInfo
+    contrib/libs/llvm14/lib/Target/NVPTX/MCTargetDesc
+    contrib/libs/llvm14/lib/Target/NVPTX/TargetInfo
+    contrib/libs/llvm14/lib/Target/PowerPC/Disassembler
+    contrib/libs/llvm14/lib/Target/PowerPC/MCTargetDesc
+    contrib/libs/llvm14/lib/Target/PowerPC/TargetInfo
+    contrib/libs/llvm14/lib/Target/X86/Disassembler
+    contrib/libs/llvm14/lib/Target/X86/MCTargetDesc
+    contrib/libs/llvm14/lib/Target/X86/TargetInfo
+    contrib/libs/llvm14/lib/TextAPI
+    contrib/libs/llvm14/lib/Transforms/AggressiveInstCombine
+    contrib/libs/llvm14/lib/Transforms/IPO
+    contrib/libs/llvm14/lib/Transforms/InstCombine
+    contrib/libs/llvm14/lib/Transforms/Instrumentation
+    contrib/libs/llvm14/lib/Transforms/Scalar
+    contrib/libs/llvm14/lib/Transforms/Utils
+    contrib/libs/llvm14/lib/Transforms/Vectorize
+)
+
+ADDINCL(
+    contrib/libs/llvm14/tools/llvm-profgen
+)
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+    CSPreInliner.cpp
+    PerfReader.cpp
+    ProfileGenerator.cpp
+    ProfiledBinary.cpp
+    llvm-profgen.cpp
+)
+
+END()