#include <util/system/defaults.h>
#include "profile.h"
#if defined(_unix_) && !defined(_bionic_) && !defined(_cygwin_)
#include <signal.h>
#include <sys/time.h>
#include <sys/resource.h>
#if defined(_darwin_)
#include <sys/ucontext.h>
#else
#include <ucontext.h>
#endif
#include <dlfcn.h>
#include <library/cpp/deprecated/atomic/atomic.h>
#include <util/system/platform.h>
#include <util/generic/hash.h>
#include <util/generic/map.h>
#include <util/generic/noncopyable.h>
#include <util/generic/algorithm.h>
#include <util/generic/vector.h>
#include <util/stream/file.h>
#include <util/string/util.h>
#include <util/system/datetime.h>
// This class sets SIGPROF handler and captures instruction pointer in it.
class TExecutionSampler : TNonCopyable {
public:
typedef TVector<std::pair<void*, size_t>> TSampleVector;
struct TStats {
ui64 SavedSamples;
ui64 DroppedSamples;
ui64 SearchSkipCount;
};
// NOTE: There is no synchronization here as the instance is supposed to be
// created on the main thread.
static TExecutionSampler* Instance() {
if (SInstance == nullptr) {
SInstance = new TExecutionSampler();
}
return SInstance;
}
void Start() {
// Set signal handler
struct sigaction sa;
sa.sa_sigaction = ProfilerSignalHandler;
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO;
if (sigaction(SIGPROF, &sa, &OldSignalHandler) != 0)
return;
// Set interval timer
itimerval tv;
tv.it_interval.tv_sec = tv.it_value.tv_sec = 0;
tv.it_interval.tv_usec = tv.it_value.tv_usec = SAMPLE_INTERVAL;
setitimer(ITIMER_PROF, &tv, &OldTimerValue);
Started = true;
}
void Stop(TSampleVector& sampleVector, TStats& stats) {
// Reset signal handler and timer
if (Started) {
setitimer(ITIMER_PROF, &OldTimerValue, nullptr);
sleep(1);
}
WaitForWriteFlag();
if (Started) {
sigaction(SIGPROF, &OldSignalHandler, nullptr);
Started = false;
}
TExecutionSampler::TSampleVector hits;
hits.reserve(Samples);
for (size_t i = 0; i < SZ; ++i) {
if (Ips[i].first != nullptr) {
hits.push_back(Ips[i]);
}
}
stats.SavedSamples = Samples;
stats.DroppedSamples = AtomicGet(DroppedSamples);
stats.SearchSkipCount = SearchSkipCount;
AtomicUnlock(&WriteFlag);
Sort(hits.begin(), hits.end(), TCompareFirst());
sampleVector.swap(hits);
}
void ResetStats() {
WaitForWriteFlag();
Clear();
AtomicUnlock(&WriteFlag);
}
private:
static const size_t SZ = 2 * 1024 * 1024; // size of the hash table
// inserts work faster if it's a power of 2
static const int SAMPLE_INTERVAL = 1000; // in microseconds
struct TCompareFirst {
bool operator()(const std::pair<void*, size_t>& a, const std::pair<void*, size_t>& b) const {
return a.first < b.first;
}
};
TExecutionSampler()
: Started(false)
, Ips(SZ)
, WriteFlag(0)
, DroppedSamples(0)
, Samples(0)
, UniqueSamples(0)
, SearchSkipCount(0)
{
}
~TExecutionSampler() = default;
// Signal handler is not allowed to do anything that can deadlock with activity
// on the thread to which the signal is delivered or corrupt data structures that
// were in process of update.
// One such thing is memory allocation. That's why a fixed size vector is
// preallocated at start.
static void ProfilerSignalHandler(int signal, siginfo_t* info, void* context) {
(void)info;
if (signal != SIGPROF) {
return;
}
ucontext_t* ucontext = reinterpret_cast<ucontext_t*>(context);
Y_ASSERT(SInstance != nullptr);
SInstance->CaptureIP(GetIp(&ucontext->uc_mcontext));
}
void WaitForWriteFlag() {
// Wait for write flag to be reset
ui32 delay = 100;
while (!AtomicTryLock(&WriteFlag)) {
usleep(delay);
delay += delay;
delay = Min(delay, (ui32)5000);
}
}
void CaptureIP(void* rip) {
// Check if the handler on another thread is in the process of adding a sample
// If this is the case, we just drop the current sample as this should happen
// rarely.
if (AtomicTryLock(&WriteFlag)) {
AddSample(rip);
AtomicUnlock(&WriteFlag);
} else {
AtomicIncrement(DroppedSamples);
}
}
// Hash function applied to the addresses
static inline ui32 Hash(void* key) {
return ((size_t)key + (size_t)key / SZ) % SZ;
}
// Get instruction pointer from the context
static inline void* GetIp(const mcontext_t* mctx) {
#if defined _freebsd_
#if defined _64_
return (void*)mctx->mc_rip;
#else
return (void*)mctx->mc_eip;
#endif
#elif defined _linux_
#if defined _64_
#if defined(_arm_)
return (void*)mctx->pc;
#else
return (void*)mctx->gregs[REG_RIP];
#endif
#else
return (void*)mctx->gregs[REG_EIP];
#endif
#elif defined _darwin_
#if defined _64_
#if defined(_arm_)
return (void*)(*mctx)->__ss.__pc;
#else
return (void*)(*mctx)->__ss.__rip;
#endif
#else
#if defined(__IOS__)
return (void*)(*mctx)->__ss.__pc;
#else
return (void*)(*mctx)->__ss.__eip;
#endif
#endif
#endif
}
inline bool AddSample(void* key) {
ui32 slot = Hash(key);
ui32 prevSlot = (slot - 1) % SZ;
while (key != Ips[slot].first && !IsSlotEmpty(slot) && slot != prevSlot) {
slot = (slot + 1) % SZ;
SearchSkipCount++;
}
if (key == Ips[slot].first) {
// increment the count
Ips[slot].second++;
++Samples;
} else if (InsertsAllowed()) {
// add new sample and set the count to 1
Ips[slot].first = key;
Ips[slot].second = 1;
++UniqueSamples;
++Samples;
} else {
// don't insert new sample if the search is becoming too slow
AtomicIncrement(DroppedSamples);
return false;
}
return true;
}
inline bool IsSlotEmpty(ui32 slot) const {
return Ips[slot].first == nullptr;
}
inline bool InsertsAllowed() const {
return UniqueSamples < SZ / 2;
}
void
Clear() {
Y_ASSERT(WriteFlag == 1);
for (size_t i = 0; i < SZ; ++i) {
Ips[i] = std::make_pair((void*)nullptr, (size_t)0);
}
Samples = 0;
AtomicSet(DroppedSamples, 0);
UniqueSamples = 0;
SearchSkipCount = 0;
}
bool Started;
struct sigaction OldSignalHandler;
itimerval OldTimerValue;
TVector<std::pair<void*, size_t>>
Ips; // The hash table storing addresses and their hitcounts
// TODO: on a big multiproc cache line false sharing by the flag and count might become an issue
TAtomic WriteFlag; // Is used to syncronize access to the hash table
TAtomic DroppedSamples; // "dropped sample" count will show how many times
// a sample was dropped either because of write conflict
// or because of the hash table had become too filled up
ui64 Samples; // Saved samples count
ui64 UniqueSamples; // Number of unique addresses
ui64 SearchSkipCount; // Total number of linear hash table probes due to collisions
static TExecutionSampler* SInstance;
};
// Performs analysis of samples captured by TExecutionSampler
class TSampleAnalyser : TNonCopyable {
public:
TSampleAnalyser(TExecutionSampler::TSampleVector& samples, const TExecutionSampler::TStats& stats, bool putTimeStamps = false)
: Samples()
, Stats(stats)
, PutTimestamps(putTimeStamps)
{
Samples.swap(samples);
}
~TSampleAnalyser() = default;
void Analyze(FILE* out) const;
private:
TExecutionSampler::TSampleVector Samples;
TExecutionSampler::TStats Stats;
bool PutTimestamps;
};
void TSampleAnalyser::Analyze(FILE* out) const {
fprintf(out, "samples: %" PRIu64 " unique: %" PRIu64 " dropped: %" PRIu64 " searchskips: %" PRIu64 "\n",
(ui64)Stats.SavedSamples, (ui64)Samples.size(),
(ui64)Stats.DroppedSamples, (ui64)Stats.SearchSkipCount);
fprintf(out, "\nSamples:\n");
size_t funcCnt = 0;
void* prevModBase = (void*)-1;
void* prevFunc = (void*)-1;
for (size_t i = 0; i < Samples.size(); ++i) {
// print cycle count once in a while to estimate time consumed by
// dumping the samples
if (PutTimestamps && (i % 1000 == 0)) {
ui64 tm = GetCycleCount();
fprintf(out, "TM: %" PRIu64 "\n", tm);
}
Dl_info addrInfo;
if (dladdr(Samples[i].first, &addrInfo)) {
if (addrInfo.dli_fbase != prevModBase || addrInfo.dli_saddr != prevFunc) {
fprintf(out, "Func\t%" PRISZT "\t%p\t%p\t%s\t%s\n",
funcCnt,
addrInfo.dli_fbase,
addrInfo.dli_saddr,
addrInfo.dli_fname,
addrInfo.dli_sname);
prevModBase = addrInfo.dli_fbase;
prevFunc = addrInfo.dli_saddr;
++funcCnt;
}
} else {
fprintf(out, "[dladdr failed]\n");
}
fprintf(out, "%" PRISZT "\t%p\t%lu\n", i, Samples[i].first, Samples[i].second);
}
}
TExecutionSampler* TExecutionSampler::SInstance = nullptr;
// Starts capturing execution samples
void BeginProfiling() {
TExecutionSampler::Instance()->Start();
}
// Resets captured execution samples
void ResetProfile() {
TExecutionSampler::Instance()->ResetStats();
}
void DumpRUsage(FILE* out) {
rusage ru;
int e = getrusage(RUSAGE_SELF, &ru);
if (e != 0)
return;
fprintf(out,
"user time: %lf\n"
"system time: %lf\n"
"max RSS: %ld\n"
"shared text: %ld\n"
"unshared data: %ld\n"
"unshared stack: %ld\n"
"page reclaims: %ld\n"
"page faults: %ld\n"
"swaps: %ld\n"
"block input ops: %ld\n"
"block output ops: %ld\n"
"msg sent: %ld\n"
"msg received: %ld\n"
"signals: %ld\n"
"voluntary ctx switches: %ld\n"
"involuntary ctx switches: %ld\n\n",
ru.ru_utime.tv_sec + (ru.ru_utime.tv_usec * 0.000001),
ru.ru_stime.tv_sec + (ru.ru_stime.tv_usec * 0.000001),
ru.ru_maxrss, ru.ru_ixrss, ru.ru_idrss, ru.ru_isrss,
ru.ru_minflt, ru.ru_majflt, ru.ru_nswap,
ru.ru_inblock, ru.ru_oublock,
ru.ru_msgsnd, ru.ru_msgrcv,
ru.ru_nsignals,
ru.ru_nvcsw, ru.ru_nivcsw);
}
// Pauses capturing execution samples and dumps them to the file
// Samples are not cleared so that profiling can be continued by calling BeginProfiling()
// or it can be started from scratch by calling ResetProfile() and then BeginProfiling()
void EndProfiling(FILE* out) {
DumpRUsage(out);
TExecutionSampler::TSampleVector samples;
TExecutionSampler::TStats stats;
TExecutionSampler::Instance()->Stop(samples, stats);
TSampleAnalyser analyzer(samples, stats);
analyzer.Analyze(out);
}
void EndProfiling() {
static unsigned cnt = 0;
char nameBuf[256];
snprintf(nameBuf, sizeof(nameBuf), "./%s.%d.%u.profile", getprogname(), (int)getpid(), cnt);
FILE* out = fopen(nameBuf, "a");
EndProfiling(out);
fclose(out);
++cnt;
}
#else
// NOTE: not supported on Windows
void BeginProfiling() {
}
void ResetProfile() {
}
void EndProfiling(FILE*) {
}
void EndProfiling() {
}
#endif