diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/lfalloc/alloc_profiler | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/lfalloc/alloc_profiler')
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/align_ut.cpp | 23 | ||||
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/profiler.cpp | 81 | ||||
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/profiler.h | 45 | ||||
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp | 76 | ||||
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/stackcollect.cpp | 332 | ||||
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/stackcollect.h | 107 | ||||
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/ut/ya.make | 22 | ||||
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/ya.make | 17 |
8 files changed, 703 insertions, 0 deletions
diff --git a/library/cpp/lfalloc/alloc_profiler/align_ut.cpp b/library/cpp/lfalloc/alloc_profiler/align_ut.cpp new file mode 100644 index 00000000000..db9b17b95ba --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/align_ut.cpp @@ -0,0 +1,23 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/scope.h> + +Y_UNIT_TEST_SUITE(MemAlign) { + Y_UNIT_TEST(ShouldAlign) + { + for (ui64 size = 8; size <= 32 * 1024; size *= 8) { + for (ui64 align = 8; align <= 4096; align *=2) { + void* ptr = nullptr; + + int res = posix_memalign(&ptr, align, size); + UNIT_ASSERT_C(res == 0 && ptr != nullptr, "memalign failed"); + + Y_DEFER { + free(ptr); + }; + + UNIT_ASSERT_C((uintptr_t)ptr % align == 0, "non aligned memory"); + } + } + } +} diff --git a/library/cpp/lfalloc/alloc_profiler/profiler.cpp b/library/cpp/lfalloc/alloc_profiler/profiler.cpp new file mode 100644 index 00000000000..0e30927a5a2 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/profiler.cpp @@ -0,0 +1,81 @@ +#include "profiler.h" + +#include "stackcollect.h" + +#include <util/generic/algorithm.h> +#include <util/generic/singleton.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/stream/str.h> + +namespace NAllocProfiler { + +namespace { + +static TAllocationStackCollector& AllocationStackCollector() +{ + return *Singleton<TAllocationStackCollector>(); +} + +int AllocationCallback(int tag, size_t size, int sizeIdx) +{ + Y_UNUSED(sizeIdx); + + static const size_t STACK_FRAMES_COUNT = 32; + static const size_t STACK_FRAMES_SKIP = 1; + + void* frames[STACK_FRAMES_COUNT]; + size_t frameCount = BackTrace(frames, Y_ARRAY_SIZE(frames)); + if (frameCount <= STACK_FRAMES_SKIP) { + return -1; + } + + void** stack = &frames[STACK_FRAMES_SKIP]; + frameCount -= STACK_FRAMES_SKIP; + + auto& collector = AllocationStackCollector(); + return collector.Alloc(stack, frameCount, tag, size); +} + +void DeallocationCallback(int stackId, int tag, size_t size, int sizeIdx) +{ + Y_UNUSED(tag); + Y_UNUSED(sizeIdx); + + auto& collector = AllocationStackCollector(); + collector.Free(stackId, size); +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +bool StartAllocationSampling(bool profileAllThreads) +{ + auto& collector = AllocationStackCollector(); + collector.Clear(); + + NAllocDbg::SetProfileAllThreads(profileAllThreads); + NAllocDbg::SetAllocationCallback(AllocationCallback); + NAllocDbg::SetDeallocationCallback(DeallocationCallback); + NAllocDbg::SetAllocationSamplingEnabled(true); + return true; +} + +bool StopAllocationSampling(IAllocationStatsDumper &out, int count) +{ + NAllocDbg::SetAllocationCallback(nullptr); + NAllocDbg::SetDeallocationCallback(nullptr); + NAllocDbg::SetAllocationSamplingEnabled(false); + + auto& collector = AllocationStackCollector(); + collector.Dump(count, out); + return true; +} + +bool StopAllocationSampling(IOutputStream& out, int count) { + TAllocationStatsDumper dumper(out); + return StopAllocationSampling(dumper, count); +} + +} // namespace NProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/profiler.h b/library/cpp/lfalloc/alloc_profiler/profiler.h new file mode 100644 index 00000000000..4ea49b9dcc8 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/profiler.h @@ -0,0 +1,45 @@ +#pragma once + +#include "stackcollect.h" + +#include <library/cpp/lfalloc/dbg_info/dbg_info.h> + +#include <util/generic/noncopyable.h> +#include <util/stream/output.h> + +namespace NAllocProfiler { + +//////////////////////////////////////////////////////////////////////////////// + +inline int SetCurrentScopeTag(int value) +{ + return NAllocDbg::SetThreadAllocTag(value); +} + +inline bool SetProfileCurrentThread(bool value) +{ + return NAllocDbg::SetProfileCurrentThread(value); +} + +bool StartAllocationSampling(bool profileAllThreads = false); +bool StopAllocationSampling(IAllocationStatsDumper& out, int count = 100); +bool StopAllocationSampling(IOutputStream& out, int count = 100); + +//////////////////////////////////////////////////////////////////////////////// + +class TProfilingScope: private TNonCopyable { +private: + const int Prev; + +public: + explicit TProfilingScope(int value) + : Prev(SetCurrentScopeTag(value)) + {} + + ~TProfilingScope() + { + SetCurrentScopeTag(Prev); + } +}; + +} // namespace NAllocProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp b/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp new file mode 100644 index 00000000000..4341dda6ed9 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp @@ -0,0 +1,76 @@ +#include "profiler.h" + +#include <library/cpp/testing/unittest/registar.h> + +namespace NAllocProfiler { + +//////////////////////////////////////////////////////////////////////////////// + +Y_UNIT_TEST_SUITE(Profiler) { + Y_UNIT_TEST(StackCollection) + { + TStringStream str; + + NAllocProfiler::StartAllocationSampling(true); + TVector<TAutoPtr<int>> test; + // Do many allocations and no deallocations + for (int i = 0; i < 10000; ++i) { + test.push_back(new int); + } + NAllocProfiler::StopAllocationSampling(str); + //Cout << str.Str() << Endl; + +#if !defined(ARCH_AARCH64) + /* Check that output resembles this: + + STACK #2: 0 Allocs: 10 Frees: 0 CurrentSize: 40 + 0000000000492353 ?? + 000000000048781F operator new(unsigned long) +1807 + 00000000003733FA NAllocProfiler::NTestSuiteProfiler::TTestCaseStackCollection::Execute_(NUnitTest::TTestContext&) +218 + 00000000004A1938 NUnitTest::TTestBase::Run(std::__y1::function<void ()>, TString, char const*, bool) +120 + 0000000000375656 NAllocProfiler::NTestSuiteProfiler::TCurrentTest::Execute() +342 + 00000000004A20CF NUnitTest::TTestFactory::Execute() +847 + 000000000049922D NUnitTest::RunMain(int, char**) +1965 + 00007FF665778F45 __libc_start_main +245 + */ + + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "StackCollection"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::TTestBase::Run"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NAllocProfiler::NTestSuiteProfiler::TCurrentTest::Execute"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::TTestFactory::Execute"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::RunMain"); +#endif + } + + class TAllocDumper : public NAllocProfiler::TAllocationStatsDumper { + public: + explicit TAllocDumper(IOutputStream& out) : NAllocProfiler::TAllocationStatsDumper(out) {} + + TString FormatTag(int tag) override { + UNIT_ASSERT_VALUES_EQUAL(tag, 42); + return "TAG_NAME_42"; + } + }; + + Y_UNIT_TEST(TagNames) + { + TStringStream str; + + NAllocProfiler::StartAllocationSampling(true); + TVector<TAutoPtr<int>> test; + NAllocProfiler::TProfilingScope scope(42); + // Do many allocations and no deallocations + for (int i = 0; i < 10000; ++i) { + test.push_back(new int); + } + + TAllocDumper dumper(str); + NAllocProfiler::StopAllocationSampling(dumper); + +#if !defined(ARCH_AARCH64) + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "TAG_NAME_42"); +#endif + } +} + +} diff --git a/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp b/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp new file mode 100644 index 00000000000..fded4e2fd1a --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp @@ -0,0 +1,332 @@ +#include "stackcollect.h" + +#include "profiler.h" + +#include <util/generic/algorithm.h> +#include <util/generic/vector.h> +#include <util/stream/format.h> +#include <util/stream/str.h> +#include <util/string/cast.h> +#include <util/string/printf.h> +#include <util/system/backtrace.h> +#include <util/system/spinlock.h> +#include <util/system/yassert.h> + + +namespace NAllocProfiler { + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +class TStackCollector: private TNonCopyable { +public: + struct TFrameInfo { + int PrevInd; + void* Addr; + int Tag; + T Stats; + + void Clear() + { + PrevInd = 0; + Addr = nullptr; + Tag = 0; + Stats.Clear(); + } + }; + +private: + static const size_t STACKS_HASH_MAP_SIZE = 256 * 1024; + TFrameInfo Frames[STACKS_HASH_MAP_SIZE]; + + ui64 Samples; // Saved samples count + ui64 UniqueSamples; // Number of unique addresses + ui64 UsedSlots; // Number of occupied slots in the hashtable + ui64 DroppedSamples; // Number of unsaved addresses + ui64 SearchSkipCount; // Total number of linear hash table probes due to collisions + + TAdaptiveLock Lock; + +public: + TStackCollector() + { + Clear(); + } + + int AddStack(void** stack, size_t frameCount, int tag) + { + Y_ASSERT(frameCount > 0); + + int prevInd = -1; + with_lock (Lock) { + for (int i = frameCount - 1; i >= 0; --i) { + prevInd = AddFrame(stack[i], prevInd, ((i == 0) ? tag : 0), (i == 0)); + if (prevInd == -1) { + break; + } + } + } + return prevInd; + } + + T& GetStats(int stackId) + { + Y_ASSERT(stackId >= 0 && (size_t)stackId < Y_ARRAY_SIZE(Frames)); + Y_ASSERT(!IsSlotEmpty(stackId)); + + return Frames[stackId].Stats; + } + + const TFrameInfo* GetFrames() const + { + return Frames; + } + + size_t GetFramesCount() const + { + return Y_ARRAY_SIZE(Frames); + } + + void BackTrace(const TFrameInfo* stack, TStackVec<void*, 64>& frames) const + { + frames.clear(); + for (size_t i = 0; i < 100; ++i) { + frames.push_back(stack->Addr); + int prevInd = stack->PrevInd; + if (prevInd == -1) { + break; + } + stack = &Frames[prevInd]; + } + } + + void Clear() + { + for (auto& frame: Frames) { + frame.Clear(); + } + + Samples = 0; + DroppedSamples = 0; + UniqueSamples = 0; + UsedSlots = 0; + SearchSkipCount = 0; + } + +private: + // Hash function applied to the addresses + static ui32 Hash(void* addr, int prevInd, int tag) + { + return (((size_t)addr + ((size_t)addr / STACKS_HASH_MAP_SIZE)) + prevInd + tag) % STACKS_HASH_MAP_SIZE; + } + + static bool EqualFrame(const TFrameInfo& frame, void* addr, int prevInd, int tag) + { + return (frame.Addr == addr && frame.PrevInd == prevInd && frame.Tag == tag); + } + + bool IsSlotEmpty(ui32 slot) const + { + return Frames[slot].Addr == 0; + } + + bool InsertsAllowed() const + { + return UsedSlots < STACKS_HASH_MAP_SIZE / 2; + } + + // returns the index in the hashmap + int AddFrame(void* addr, int prevFrameIndex, int tag, bool last) + { + ui32 slot = Hash(addr, prevFrameIndex, tag); + ui32 prevSlot = (slot - 1) % STACKS_HASH_MAP_SIZE; + + while (!EqualFrame(Frames[slot], addr, prevFrameIndex, tag) && !IsSlotEmpty(slot) && slot != prevSlot) { + slot = (slot + 1) % STACKS_HASH_MAP_SIZE; + SearchSkipCount++; + } + + if (EqualFrame(Frames[slot], addr, prevFrameIndex, tag)) { + if (last) { + ++Samples; + } + } else if (InsertsAllowed() && IsSlotEmpty(slot)) { + // add new sample + Frames[slot].Clear(); + Frames[slot].Addr = addr; + Frames[slot].PrevInd = prevFrameIndex; + Frames[slot].Tag = tag; + ++UsedSlots; + if (last) { + ++UniqueSamples; + ++Samples; + } + } else { + // don't insert new sample if the search is becoming too slow + ++DroppedSamples; + return -1; + } + + return slot; + } +}; + + +//////////////////////////////////////////////////////////////////////////////// + +class TAllocationStackCollector::TImpl: public TStackCollector<TStats> { + using TBase = TStackCollector<TStats>; + +private: + TStats Total; + +public: + int Alloc(void** stack, size_t frameCount, int tag, size_t size) + { + int stackId = TBase::AddStack(stack, frameCount, tag); + if (stackId >= 0) { + TBase::GetStats(stackId).Alloc(size); + Total.Alloc(size); + } + return stackId; + } + + void Free(int stackId, size_t size) + { + TBase::GetStats(stackId).Free(size); + Total.Free(size); + } + + void Clear() + { + TBase::Clear(); + Total.Clear(); + } + + void Dump(int count, IAllocationStatsDumper& out) const + { + const TFrameInfo* frames = TBase::GetFrames(); + size_t framesCount = TBase::GetFramesCount(); + + TVector<const TFrameInfo*> stacks; + for (size_t i = 0; i < framesCount; ++i) { + if (frames[i].Stats.Allocs) { + stacks.push_back(&frames[i]); + } + } + + Sort(stacks, [] (const TFrameInfo* l, const TFrameInfo* r) { + const auto& ls = l->Stats; + const auto& rs = r->Stats; + return ls.CurrentSize != rs.CurrentSize + ? ls.CurrentSize > rs.CurrentSize + : ls.Allocs != rs.Allocs + ? ls.Allocs > rs.Allocs + : ls.Frees > rs.Frees; + }); + + out.DumpTotal(Total); + + TAllocationInfo allocInfo; + int printedCount = 0; + for (const TFrameInfo* stack: stacks) { + allocInfo.Clear(); + allocInfo.Tag = stack->Tag; + allocInfo.Stats = stack->Stats; + TBase::BackTrace(stack, allocInfo.Stack); + + out.DumpEntry(allocInfo); + + if (++printedCount >= count) { + break; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +TAllocationStackCollector::TAllocationStackCollector() + : Impl(new TImpl()) +{} + +TAllocationStackCollector::~TAllocationStackCollector() +{} + +int TAllocationStackCollector::Alloc(void** stack, size_t frameCount, int tag, size_t size) +{ + return Impl->Alloc(stack, frameCount, tag, size); +} + +void TAllocationStackCollector::Free(int stackId, size_t size) +{ + Impl->Free(stackId, size); +} + +void TAllocationStackCollector::Clear() +{ + Impl->Clear(); +} + +void TAllocationStackCollector::Dump(int count, IAllocationStatsDumper &out) const +{ + Impl->Dump(count, out); +} + + +TString IAllocationStatsDumper::FormatTag(int tag) { + return ToString(tag); +} + +TString IAllocationStatsDumper::FormatSize(intptr_t sz) { + return ToString(sz); +} + + +TAllocationStatsDumper::TAllocationStatsDumper(IOutputStream& out) + : PrintedCount(0) + , Out(out) + , SymbolCache(2048) +{} + +void TAllocationStatsDumper::DumpTotal(const TStats& total) { + Out << "TOTAL" + << "\tAllocs: " << total.Allocs + << "\tFrees: " << total.Frees + << "\tCurrentSize: " << FormatSize(total.CurrentSize) + << Endl; +} + +void TAllocationStatsDumper::DumpEntry(const TAllocationInfo& allocInfo) { + Out << Endl + << "STACK #" << PrintedCount+1 << ": " << FormatTag(allocInfo.Tag) + << "\tAllocs: " << allocInfo.Stats.Allocs + << "\tFrees: " << allocInfo.Stats.Frees + << "\tCurrentSize: " << FormatSize(allocInfo.Stats.CurrentSize) + << Endl; + FormatBackTrace(allocInfo.Stack.data(), allocInfo.Stack.size()); + PrintedCount++; +} + +void TAllocationStatsDumper::FormatBackTrace(void* const* stack, size_t sz) { + char name[1024]; + for (size_t i = 0; i < sz; ++i) { + TSymbol symbol; + auto it = SymbolCache.Find(stack[i]); + if (it != SymbolCache.End()) { + symbol = it.Value(); + } else { + TResolvedSymbol rs = ResolveSymbol(stack[i], name, sizeof(name)); + symbol = {rs.NearestSymbol, rs.Name}; + SymbolCache.Insert(stack[i], symbol); + } + + Out << Hex((intptr_t)stack[i], HF_FULL) << "\t" << symbol.Name; + intptr_t offset = (intptr_t)stack[i] - (intptr_t)symbol.Address; + if (offset) + Out << " +" << offset; + Out << Endl; + } +} + +} // namespace NAllocProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/stackcollect.h b/library/cpp/lfalloc/alloc_profiler/stackcollect.h new file mode 100644 index 00000000000..80715ed7cb5 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/stackcollect.h @@ -0,0 +1,107 @@ +#pragma once + +#include <library/cpp/containers/stack_vector/stack_vec.h> +#include <library/cpp/cache/cache.h> + +#include <util/generic/noncopyable.h> +#include <util/generic/ptr.h> +#include <util/stream/output.h> + +namespace NAllocProfiler { + +struct TStats { + intptr_t Allocs = 0; + intptr_t Frees = 0; + intptr_t CurrentSize = 0; + + void Clear() + { + Allocs = 0; + Frees = 0; + CurrentSize = 0; + } + + void Alloc(size_t size) + { + AtomicIncrement(Allocs); + AtomicAdd(CurrentSize, size); + } + + void Free(size_t size) + { + AtomicIncrement(Frees); + AtomicSub(CurrentSize, size); + } +}; + +struct TAllocationInfo { + int Tag; + TStats Stats; + TStackVec<void*, 64> Stack; + + void Clear() { + Tag = 0; + Stats.Clear(); + Stack.clear(); + } +}; + + +class IAllocationStatsDumper { +public: + virtual ~IAllocationStatsDumper() = default; + + // Total stats + virtual void DumpTotal(const TStats& total) = 0; + + // Stats for individual stack + virtual void DumpEntry(const TAllocationInfo& allocInfo) = 0; + + // App-specific tag printer + virtual TString FormatTag(int tag); + + // Size printer (e.g. "10KB", "100MB", "over 9000") + virtual TString FormatSize(intptr_t sz); +}; + +// Default implementation +class TAllocationStatsDumper: public IAllocationStatsDumper { +public: + explicit TAllocationStatsDumper(IOutputStream& out); + void DumpTotal(const TStats& total) override; + void DumpEntry(const TAllocationInfo& allocInfo) override; + +private: + void FormatBackTrace(void* const* stack, size_t sz); + +private: + struct TSymbol { + const void* Address; + TString Name; + }; + + size_t PrintedCount; + IOutputStream& Out; + TLFUCache<void*, TSymbol> SymbolCache; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TAllocationStackCollector: private TNonCopyable { +private: + class TImpl; + THolder<TImpl> Impl; + +public: + TAllocationStackCollector(); + ~TAllocationStackCollector(); + + int Alloc(void** stack, size_t frameCount, int tag, size_t size); + void Free(int stackId, size_t size); + + void Clear(); + + void Dump(int count, IAllocationStatsDumper& out) const; +}; + +} // namespace NAllocProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/ut/ya.make b/library/cpp/lfalloc/alloc_profiler/ut/ya.make new file mode 100644 index 00000000000..8a7daa74af6 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/ut/ya.make @@ -0,0 +1,22 @@ +UNITTEST_FOR(library/cpp/lfalloc/alloc_profiler) + +OWNER(g:rtmr g:kikimr) + +PEERDIR( + library/cpp/testing/unittest +) + +IF (ARCH_AARCH64) + PEERDIR( + contrib/libs/jemalloc + ) +ELSE() + ALLOCATOR(LF_DBG) +ENDIF() + +SRCS( + profiler_ut.cpp + align_ut.cpp +) + +END() diff --git a/library/cpp/lfalloc/alloc_profiler/ya.make b/library/cpp/lfalloc/alloc_profiler/ya.make new file mode 100644 index 00000000000..0f58d917678 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/ya.make @@ -0,0 +1,17 @@ +LIBRARY() + +OWNER(g:rtmr g:kikimr) + +SRCS( + profiler.cpp + stackcollect.cpp +) + +PEERDIR( + library/cpp/lfalloc/dbg_info + library/cpp/cache +) + +END() + +RECURSE(ut) |