author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300
---|---|---
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/lfalloc |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/lfalloc')
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/align_ut.cpp | 23
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/profiler.cpp | 81
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/profiler.h | 45
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp | 76
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/stackcollect.cpp | 332
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/stackcollect.h | 107
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/ut/ya.make | 22
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/ya.make | 17
-rw-r--r-- | library/cpp/lfalloc/dbg/ya.make | 32
-rw-r--r-- | library/cpp/lfalloc/dbg_info/dbg_info.cpp | 124
-rw-r--r-- | library/cpp/lfalloc/dbg_info/dbg_info.h | 77
-rw-r--r-- | library/cpp/lfalloc/dbg_info/ya.make | 15
-rw-r--r-- | library/cpp/lfalloc/lf_allocX64.cpp | 144
-rw-r--r-- | library/cpp/lfalloc/lf_allocX64.h | 1926
-rw-r--r-- | library/cpp/lfalloc/ya.make | 25
-rw-r--r-- | library/cpp/lfalloc/yt/ya.make | 29
16 files changed, 3075 insertions, 0 deletions
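Most of the added code is the new allocation profiler under alloc_profiler, plus the LF_DBG hooks it relies on (dbg_info and lf_allocX64). Before the raw diff, here is a minimal usage sketch of the sampling API introduced in profiler.h, modeled on the profiler_ut.cpp tests added below; the driver function name is illustrative, and it assumes the program links library/cpp/lfalloc/alloc_profiler and is built with ALLOCATOR(LF_DBG), as alloc_profiler/ut/ya.make does (on AARCH64 the tests fall back to jemalloc and skip the output checks).

```cpp
#include <library/cpp/lfalloc/alloc_profiler/profiler.h>

#include <util/generic/ptr.h>
#include <util/generic/vector.h>
#include <util/stream/output.h>
#include <util/stream/str.h>

// Hypothetical driver; mirrors the StackCollection and TagNames unit tests.
void ProfileAllocationsOnce() {
    TStringStream report;

    // Sample allocations on all threads until sampling is stopped.
    NAllocProfiler::StartAllocationSampling(/* profileAllThreads = */ true);

    TVector<TAutoPtr<int>> data;
    {
        // Attribute allocations made in this scope to tag 42 (the tag used in the test).
        NAllocProfiler::TProfilingScope scope(42);
        for (int i = 0; i < 10000; ++i) {
            data.push_back(new int); // many small allocations, no frees yet
        }
    }

    // Dump the top 100 allocation stacks (allocs, frees, current size per stack).
    NAllocProfiler::StopAllocationSampling(report, /* count = */ 100);

    Cout << report.Str() << Endl;
}
```

StopAllocationSampling also accepts an IAllocationStatsDumper, so the plain text dump can be replaced; the TagNames test overrides TAllocationStatsDumper::FormatTag to print a symbolic name for tag 42.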
diff --git a/library/cpp/lfalloc/alloc_profiler/align_ut.cpp b/library/cpp/lfalloc/alloc_profiler/align_ut.cpp new file mode 100644 index 0000000000..db9b17b95b --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/align_ut.cpp @@ -0,0 +1,23 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/scope.h> + +Y_UNIT_TEST_SUITE(MemAlign) { + Y_UNIT_TEST(ShouldAlign) + { + for (ui64 size = 8; size <= 32 * 1024; size *= 8) { + for (ui64 align = 8; align <= 4096; align *=2) { + void* ptr = nullptr; + + int res = posix_memalign(&ptr, align, size); + UNIT_ASSERT_C(res == 0 && ptr != nullptr, "memalign failed"); + + Y_DEFER { + free(ptr); + }; + + UNIT_ASSERT_C((uintptr_t)ptr % align == 0, "non aligned memory"); + } + } + } +} diff --git a/library/cpp/lfalloc/alloc_profiler/profiler.cpp b/library/cpp/lfalloc/alloc_profiler/profiler.cpp new file mode 100644 index 0000000000..0e30927a5a --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/profiler.cpp @@ -0,0 +1,81 @@ +#include "profiler.h" + +#include "stackcollect.h" + +#include <util/generic/algorithm.h> +#include <util/generic/singleton.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/stream/str.h> + +namespace NAllocProfiler { + +namespace { + +static TAllocationStackCollector& AllocationStackCollector() +{ + return *Singleton<TAllocationStackCollector>(); +} + +int AllocationCallback(int tag, size_t size, int sizeIdx) +{ + Y_UNUSED(sizeIdx); + + static const size_t STACK_FRAMES_COUNT = 32; + static const size_t STACK_FRAMES_SKIP = 1; + + void* frames[STACK_FRAMES_COUNT]; + size_t frameCount = BackTrace(frames, Y_ARRAY_SIZE(frames)); + if (frameCount <= STACK_FRAMES_SKIP) { + return -1; + } + + void** stack = &frames[STACK_FRAMES_SKIP]; + frameCount -= STACK_FRAMES_SKIP; + + auto& collector = AllocationStackCollector(); + return collector.Alloc(stack, frameCount, tag, size); +} + +void DeallocationCallback(int stackId, int tag, size_t size, int sizeIdx) +{ + Y_UNUSED(tag); + Y_UNUSED(sizeIdx); + + auto& collector = AllocationStackCollector(); + collector.Free(stackId, size); +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +bool StartAllocationSampling(bool profileAllThreads) +{ + auto& collector = AllocationStackCollector(); + collector.Clear(); + + NAllocDbg::SetProfileAllThreads(profileAllThreads); + NAllocDbg::SetAllocationCallback(AllocationCallback); + NAllocDbg::SetDeallocationCallback(DeallocationCallback); + NAllocDbg::SetAllocationSamplingEnabled(true); + return true; +} + +bool StopAllocationSampling(IAllocationStatsDumper &out, int count) +{ + NAllocDbg::SetAllocationCallback(nullptr); + NAllocDbg::SetDeallocationCallback(nullptr); + NAllocDbg::SetAllocationSamplingEnabled(false); + + auto& collector = AllocationStackCollector(); + collector.Dump(count, out); + return true; +} + +bool StopAllocationSampling(IOutputStream& out, int count) { + TAllocationStatsDumper dumper(out); + return StopAllocationSampling(dumper, count); +} + +} // namespace NProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/profiler.h b/library/cpp/lfalloc/alloc_profiler/profiler.h new file mode 100644 index 0000000000..4ea49b9dcc --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/profiler.h @@ -0,0 +1,45 @@ +#pragma once + +#include "stackcollect.h" + +#include <library/cpp/lfalloc/dbg_info/dbg_info.h> + +#include <util/generic/noncopyable.h> +#include <util/stream/output.h> + +namespace NAllocProfiler { + 
+//////////////////////////////////////////////////////////////////////////////// + +inline int SetCurrentScopeTag(int value) +{ + return NAllocDbg::SetThreadAllocTag(value); +} + +inline bool SetProfileCurrentThread(bool value) +{ + return NAllocDbg::SetProfileCurrentThread(value); +} + +bool StartAllocationSampling(bool profileAllThreads = false); +bool StopAllocationSampling(IAllocationStatsDumper& out, int count = 100); +bool StopAllocationSampling(IOutputStream& out, int count = 100); + +//////////////////////////////////////////////////////////////////////////////// + +class TProfilingScope: private TNonCopyable { +private: + const int Prev; + +public: + explicit TProfilingScope(int value) + : Prev(SetCurrentScopeTag(value)) + {} + + ~TProfilingScope() + { + SetCurrentScopeTag(Prev); + } +}; + +} // namespace NAllocProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp b/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp new file mode 100644 index 0000000000..4341dda6ed --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp @@ -0,0 +1,76 @@ +#include "profiler.h" + +#include <library/cpp/testing/unittest/registar.h> + +namespace NAllocProfiler { + +//////////////////////////////////////////////////////////////////////////////// + +Y_UNIT_TEST_SUITE(Profiler) { + Y_UNIT_TEST(StackCollection) + { + TStringStream str; + + NAllocProfiler::StartAllocationSampling(true); + TVector<TAutoPtr<int>> test; + // Do many allocations and no deallocations + for (int i = 0; i < 10000; ++i) { + test.push_back(new int); + } + NAllocProfiler::StopAllocationSampling(str); + //Cout << str.Str() << Endl; + +#if !defined(ARCH_AARCH64) + /* Check that output resembles this: + + STACK #2: 0 Allocs: 10 Frees: 0 CurrentSize: 40 + 0000000000492353 ?? 
+ 000000000048781F operator new(unsigned long) +1807 + 00000000003733FA NAllocProfiler::NTestSuiteProfiler::TTestCaseStackCollection::Execute_(NUnitTest::TTestContext&) +218 + 00000000004A1938 NUnitTest::TTestBase::Run(std::__y1::function<void ()>, TString, char const*, bool) +120 + 0000000000375656 NAllocProfiler::NTestSuiteProfiler::TCurrentTest::Execute() +342 + 00000000004A20CF NUnitTest::TTestFactory::Execute() +847 + 000000000049922D NUnitTest::RunMain(int, char**) +1965 + 00007FF665778F45 __libc_start_main +245 + */ + + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "StackCollection"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::TTestBase::Run"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NAllocProfiler::NTestSuiteProfiler::TCurrentTest::Execute"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::TTestFactory::Execute"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::RunMain"); +#endif + } + + class TAllocDumper : public NAllocProfiler::TAllocationStatsDumper { + public: + explicit TAllocDumper(IOutputStream& out) : NAllocProfiler::TAllocationStatsDumper(out) {} + + TString FormatTag(int tag) override { + UNIT_ASSERT_VALUES_EQUAL(tag, 42); + return "TAG_NAME_42"; + } + }; + + Y_UNIT_TEST(TagNames) + { + TStringStream str; + + NAllocProfiler::StartAllocationSampling(true); + TVector<TAutoPtr<int>> test; + NAllocProfiler::TProfilingScope scope(42); + // Do many allocations and no deallocations + for (int i = 0; i < 10000; ++i) { + test.push_back(new int); + } + + TAllocDumper dumper(str); + NAllocProfiler::StopAllocationSampling(dumper); + +#if !defined(ARCH_AARCH64) + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "TAG_NAME_42"); +#endif + } +} + +} diff --git a/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp b/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp new file mode 100644 index 0000000000..fded4e2fd1 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp @@ -0,0 +1,332 @@ +#include "stackcollect.h" + +#include "profiler.h" + +#include <util/generic/algorithm.h> +#include <util/generic/vector.h> +#include <util/stream/format.h> +#include <util/stream/str.h> +#include <util/string/cast.h> +#include <util/string/printf.h> +#include <util/system/backtrace.h> +#include <util/system/spinlock.h> +#include <util/system/yassert.h> + + +namespace NAllocProfiler { + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +class TStackCollector: private TNonCopyable { +public: + struct TFrameInfo { + int PrevInd; + void* Addr; + int Tag; + T Stats; + + void Clear() + { + PrevInd = 0; + Addr = nullptr; + Tag = 0; + Stats.Clear(); + } + }; + +private: + static const size_t STACKS_HASH_MAP_SIZE = 256 * 1024; + TFrameInfo Frames[STACKS_HASH_MAP_SIZE]; + + ui64 Samples; // Saved samples count + ui64 UniqueSamples; // Number of unique addresses + ui64 UsedSlots; // Number of occupied slots in the hashtable + ui64 DroppedSamples; // Number of unsaved addresses + ui64 SearchSkipCount; // Total number of linear hash table probes due to collisions + + TAdaptiveLock Lock; + +public: + TStackCollector() + { + Clear(); + } + + int AddStack(void** stack, size_t frameCount, int tag) + { + Y_ASSERT(frameCount > 0); + + int prevInd = -1; + with_lock (Lock) { + for (int i = frameCount - 1; i >= 0; --i) { + prevInd = AddFrame(stack[i], prevInd, ((i == 0) ? 
tag : 0), (i == 0)); + if (prevInd == -1) { + break; + } + } + } + return prevInd; + } + + T& GetStats(int stackId) + { + Y_ASSERT(stackId >= 0 && (size_t)stackId < Y_ARRAY_SIZE(Frames)); + Y_ASSERT(!IsSlotEmpty(stackId)); + + return Frames[stackId].Stats; + } + + const TFrameInfo* GetFrames() const + { + return Frames; + } + + size_t GetFramesCount() const + { + return Y_ARRAY_SIZE(Frames); + } + + void BackTrace(const TFrameInfo* stack, TStackVec<void*, 64>& frames) const + { + frames.clear(); + for (size_t i = 0; i < 100; ++i) { + frames.push_back(stack->Addr); + int prevInd = stack->PrevInd; + if (prevInd == -1) { + break; + } + stack = &Frames[prevInd]; + } + } + + void Clear() + { + for (auto& frame: Frames) { + frame.Clear(); + } + + Samples = 0; + DroppedSamples = 0; + UniqueSamples = 0; + UsedSlots = 0; + SearchSkipCount = 0; + } + +private: + // Hash function applied to the addresses + static ui32 Hash(void* addr, int prevInd, int tag) + { + return (((size_t)addr + ((size_t)addr / STACKS_HASH_MAP_SIZE)) + prevInd + tag) % STACKS_HASH_MAP_SIZE; + } + + static bool EqualFrame(const TFrameInfo& frame, void* addr, int prevInd, int tag) + { + return (frame.Addr == addr && frame.PrevInd == prevInd && frame.Tag == tag); + } + + bool IsSlotEmpty(ui32 slot) const + { + return Frames[slot].Addr == 0; + } + + bool InsertsAllowed() const + { + return UsedSlots < STACKS_HASH_MAP_SIZE / 2; + } + + // returns the index in the hashmap + int AddFrame(void* addr, int prevFrameIndex, int tag, bool last) + { + ui32 slot = Hash(addr, prevFrameIndex, tag); + ui32 prevSlot = (slot - 1) % STACKS_HASH_MAP_SIZE; + + while (!EqualFrame(Frames[slot], addr, prevFrameIndex, tag) && !IsSlotEmpty(slot) && slot != prevSlot) { + slot = (slot + 1) % STACKS_HASH_MAP_SIZE; + SearchSkipCount++; + } + + if (EqualFrame(Frames[slot], addr, prevFrameIndex, tag)) { + if (last) { + ++Samples; + } + } else if (InsertsAllowed() && IsSlotEmpty(slot)) { + // add new sample + Frames[slot].Clear(); + Frames[slot].Addr = addr; + Frames[slot].PrevInd = prevFrameIndex; + Frames[slot].Tag = tag; + ++UsedSlots; + if (last) { + ++UniqueSamples; + ++Samples; + } + } else { + // don't insert new sample if the search is becoming too slow + ++DroppedSamples; + return -1; + } + + return slot; + } +}; + + +//////////////////////////////////////////////////////////////////////////////// + +class TAllocationStackCollector::TImpl: public TStackCollector<TStats> { + using TBase = TStackCollector<TStats>; + +private: + TStats Total; + +public: + int Alloc(void** stack, size_t frameCount, int tag, size_t size) + { + int stackId = TBase::AddStack(stack, frameCount, tag); + if (stackId >= 0) { + TBase::GetStats(stackId).Alloc(size); + Total.Alloc(size); + } + return stackId; + } + + void Free(int stackId, size_t size) + { + TBase::GetStats(stackId).Free(size); + Total.Free(size); + } + + void Clear() + { + TBase::Clear(); + Total.Clear(); + } + + void Dump(int count, IAllocationStatsDumper& out) const + { + const TFrameInfo* frames = TBase::GetFrames(); + size_t framesCount = TBase::GetFramesCount(); + + TVector<const TFrameInfo*> stacks; + for (size_t i = 0; i < framesCount; ++i) { + if (frames[i].Stats.Allocs) { + stacks.push_back(&frames[i]); + } + } + + Sort(stacks, [] (const TFrameInfo* l, const TFrameInfo* r) { + const auto& ls = l->Stats; + const auto& rs = r->Stats; + return ls.CurrentSize != rs.CurrentSize + ? ls.CurrentSize > rs.CurrentSize + : ls.Allocs != rs.Allocs + ? 
ls.Allocs > rs.Allocs + : ls.Frees > rs.Frees; + }); + + out.DumpTotal(Total); + + TAllocationInfo allocInfo; + int printedCount = 0; + for (const TFrameInfo* stack: stacks) { + allocInfo.Clear(); + allocInfo.Tag = stack->Tag; + allocInfo.Stats = stack->Stats; + TBase::BackTrace(stack, allocInfo.Stack); + + out.DumpEntry(allocInfo); + + if (++printedCount >= count) { + break; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +TAllocationStackCollector::TAllocationStackCollector() + : Impl(new TImpl()) +{} + +TAllocationStackCollector::~TAllocationStackCollector() +{} + +int TAllocationStackCollector::Alloc(void** stack, size_t frameCount, int tag, size_t size) +{ + return Impl->Alloc(stack, frameCount, tag, size); +} + +void TAllocationStackCollector::Free(int stackId, size_t size) +{ + Impl->Free(stackId, size); +} + +void TAllocationStackCollector::Clear() +{ + Impl->Clear(); +} + +void TAllocationStackCollector::Dump(int count, IAllocationStatsDumper &out) const +{ + Impl->Dump(count, out); +} + + +TString IAllocationStatsDumper::FormatTag(int tag) { + return ToString(tag); +} + +TString IAllocationStatsDumper::FormatSize(intptr_t sz) { + return ToString(sz); +} + + +TAllocationStatsDumper::TAllocationStatsDumper(IOutputStream& out) + : PrintedCount(0) + , Out(out) + , SymbolCache(2048) +{} + +void TAllocationStatsDumper::DumpTotal(const TStats& total) { + Out << "TOTAL" + << "\tAllocs: " << total.Allocs + << "\tFrees: " << total.Frees + << "\tCurrentSize: " << FormatSize(total.CurrentSize) + << Endl; +} + +void TAllocationStatsDumper::DumpEntry(const TAllocationInfo& allocInfo) { + Out << Endl + << "STACK #" << PrintedCount+1 << ": " << FormatTag(allocInfo.Tag) + << "\tAllocs: " << allocInfo.Stats.Allocs + << "\tFrees: " << allocInfo.Stats.Frees + << "\tCurrentSize: " << FormatSize(allocInfo.Stats.CurrentSize) + << Endl; + FormatBackTrace(allocInfo.Stack.data(), allocInfo.Stack.size()); + PrintedCount++; +} + +void TAllocationStatsDumper::FormatBackTrace(void* const* stack, size_t sz) { + char name[1024]; + for (size_t i = 0; i < sz; ++i) { + TSymbol symbol; + auto it = SymbolCache.Find(stack[i]); + if (it != SymbolCache.End()) { + symbol = it.Value(); + } else { + TResolvedSymbol rs = ResolveSymbol(stack[i], name, sizeof(name)); + symbol = {rs.NearestSymbol, rs.Name}; + SymbolCache.Insert(stack[i], symbol); + } + + Out << Hex((intptr_t)stack[i], HF_FULL) << "\t" << symbol.Name; + intptr_t offset = (intptr_t)stack[i] - (intptr_t)symbol.Address; + if (offset) + Out << " +" << offset; + Out << Endl; + } +} + +} // namespace NAllocProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/stackcollect.h b/library/cpp/lfalloc/alloc_profiler/stackcollect.h new file mode 100644 index 0000000000..80715ed7cb --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/stackcollect.h @@ -0,0 +1,107 @@ +#pragma once + +#include <library/cpp/containers/stack_vector/stack_vec.h> +#include <library/cpp/cache/cache.h> + +#include <util/generic/noncopyable.h> +#include <util/generic/ptr.h> +#include <util/stream/output.h> + +namespace NAllocProfiler { + +struct TStats { + intptr_t Allocs = 0; + intptr_t Frees = 0; + intptr_t CurrentSize = 0; + + void Clear() + { + Allocs = 0; + Frees = 0; + CurrentSize = 0; + } + + void Alloc(size_t size) + { + AtomicIncrement(Allocs); + AtomicAdd(CurrentSize, size); + } + + void Free(size_t size) + { + AtomicIncrement(Frees); + AtomicSub(CurrentSize, size); + } +}; + +struct TAllocationInfo { + int Tag; + TStats Stats; + 
TStackVec<void*, 64> Stack; + + void Clear() { + Tag = 0; + Stats.Clear(); + Stack.clear(); + } +}; + + +class IAllocationStatsDumper { +public: + virtual ~IAllocationStatsDumper() = default; + + // Total stats + virtual void DumpTotal(const TStats& total) = 0; + + // Stats for individual stack + virtual void DumpEntry(const TAllocationInfo& allocInfo) = 0; + + // App-specific tag printer + virtual TString FormatTag(int tag); + + // Size printer (e.g. "10KB", "100MB", "over 9000") + virtual TString FormatSize(intptr_t sz); +}; + +// Default implementation +class TAllocationStatsDumper: public IAllocationStatsDumper { +public: + explicit TAllocationStatsDumper(IOutputStream& out); + void DumpTotal(const TStats& total) override; + void DumpEntry(const TAllocationInfo& allocInfo) override; + +private: + void FormatBackTrace(void* const* stack, size_t sz); + +private: + struct TSymbol { + const void* Address; + TString Name; + }; + + size_t PrintedCount; + IOutputStream& Out; + TLFUCache<void*, TSymbol> SymbolCache; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TAllocationStackCollector: private TNonCopyable { +private: + class TImpl; + THolder<TImpl> Impl; + +public: + TAllocationStackCollector(); + ~TAllocationStackCollector(); + + int Alloc(void** stack, size_t frameCount, int tag, size_t size); + void Free(int stackId, size_t size); + + void Clear(); + + void Dump(int count, IAllocationStatsDumper& out) const; +}; + +} // namespace NAllocProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/ut/ya.make b/library/cpp/lfalloc/alloc_profiler/ut/ya.make new file mode 100644 index 0000000000..8a7daa74af --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/ut/ya.make @@ -0,0 +1,22 @@ +UNITTEST_FOR(library/cpp/lfalloc/alloc_profiler) + +OWNER(g:rtmr g:kikimr) + +PEERDIR( + library/cpp/testing/unittest +) + +IF (ARCH_AARCH64) + PEERDIR( + contrib/libs/jemalloc + ) +ELSE() + ALLOCATOR(LF_DBG) +ENDIF() + +SRCS( + profiler_ut.cpp + align_ut.cpp +) + +END() diff --git a/library/cpp/lfalloc/alloc_profiler/ya.make b/library/cpp/lfalloc/alloc_profiler/ya.make new file mode 100644 index 0000000000..0f58d91767 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/ya.make @@ -0,0 +1,17 @@ +LIBRARY() + +OWNER(g:rtmr g:kikimr) + +SRCS( + profiler.cpp + stackcollect.cpp +) + +PEERDIR( + library/cpp/lfalloc/dbg_info + library/cpp/cache +) + +END() + +RECURSE(ut) diff --git a/library/cpp/lfalloc/dbg/ya.make b/library/cpp/lfalloc/dbg/ya.make new file mode 100644 index 0000000000..3dce653a8c --- /dev/null +++ b/library/cpp/lfalloc/dbg/ya.make @@ -0,0 +1,32 @@ +LIBRARY() + +OWNER(vskipin) + +NO_UTIL() + +NO_COMPILER_WARNINGS() + +IF (ARCH_AARCH64) + PEERDIR( + contrib/libs/jemalloc + ) +ELSE() + IF ("${YMAKE}" MATCHES "devtools") + CFLAGS(-DYMAKE=1) + ENDIF() + CXXFLAGS( + -DLFALLOC_DBG + -DLFALLOC_YT + ) + SRCS( + ../lf_allocX64.cpp + ) +ENDIF() + +PEERDIR( + library/cpp/malloc/api +) + +SET(IDE_FOLDER "util") + +END() diff --git a/library/cpp/lfalloc/dbg_info/dbg_info.cpp b/library/cpp/lfalloc/dbg_info/dbg_info.cpp new file mode 100644 index 0000000000..1fb9f7ad93 --- /dev/null +++ b/library/cpp/lfalloc/dbg_info/dbg_info.cpp @@ -0,0 +1,124 @@ +#include "dbg_info.h" + +#include <library/cpp/malloc/api/malloc.h> + +namespace NAllocDbg { + //////////////////////////////////////////////////////////////////////////////// + + using TGetAllocationCounter = i64(int counter); + + using TSetThreadAllocTag = int(int tag); + using TGetPerTagAllocInfo = void( + bool 
flushPerThreadCounters, + TPerTagAllocInfo* info, + int& maxTag, + int& numSizes); + + using TSetProfileCurrentThread = bool(bool newVal); + using TSetProfileAllThreads = bool(bool newVal); + using TSetAllocationSamplingEnabled = bool(bool newVal); + + using TSetAllocationSampleRate = size_t(size_t newVal); + using TSetAllocationSampleMaxSize = size_t(size_t newVal); + + using TSetAllocationCallback = TAllocationCallback*(TAllocationCallback* newVal); + using TSetDeallocationCallback = TDeallocationCallback*(TDeallocationCallback* newVal); + + struct TAllocFn { + TGetAllocationCounter* GetAllocationCounterFast = nullptr; + TGetAllocationCounter* GetAllocationCounterFull = nullptr; + + TSetThreadAllocTag* SetThreadAllocTag = nullptr; + TGetPerTagAllocInfo* GetPerTagAllocInfo = nullptr; + + TSetProfileCurrentThread* SetProfileCurrentThread = nullptr; + TSetProfileAllThreads* SetProfileAllThreads = nullptr; + TSetAllocationSamplingEnabled* SetAllocationSamplingEnabled = nullptr; + + TSetAllocationSampleRate* SetAllocationSampleRate = nullptr; + TSetAllocationSampleMaxSize* SetAllocationSampleMaxSize = nullptr; + + TSetAllocationCallback* SetAllocationCallback = nullptr; + TSetDeallocationCallback* SetDeallocationCallback = nullptr; + + TAllocFn() { + auto mallocInfo = NMalloc::MallocInfo(); + + GetAllocationCounterFast = (TGetAllocationCounter*)mallocInfo.GetParam("GetLFAllocCounterFast"); + GetAllocationCounterFull = (TGetAllocationCounter*)mallocInfo.GetParam("GetLFAllocCounterFull"); + + SetThreadAllocTag = (TSetThreadAllocTag*)mallocInfo.GetParam("SetThreadAllocTag"); + GetPerTagAllocInfo = (TGetPerTagAllocInfo*)mallocInfo.GetParam("GetPerTagAllocInfo"); + + SetProfileCurrentThread = (TSetProfileCurrentThread*)mallocInfo.GetParam("SetProfileCurrentThread"); + SetProfileAllThreads = (TSetProfileAllThreads*)mallocInfo.GetParam("SetProfileAllThreads"); + SetAllocationSamplingEnabled = (TSetAllocationSamplingEnabled*)mallocInfo.GetParam("SetAllocationSamplingEnabled"); + + SetAllocationSampleRate = (TSetAllocationSampleRate*)mallocInfo.GetParam("SetAllocationSampleRate"); + SetAllocationSampleMaxSize = (TSetAllocationSampleMaxSize*)mallocInfo.GetParam("SetAllocationSampleMaxSize"); + + SetAllocationCallback = (TSetAllocationCallback*)mallocInfo.GetParam("SetAllocationCallback"); + SetDeallocationCallback = (TSetDeallocationCallback*)mallocInfo.GetParam("SetDeallocationCallback"); + } + }; + + //////////////////////////////////////////////////////////////////////////////// + + static TAllocFn AllocFn; + + i64 GetAllocationCounterFast(ELFAllocCounter counter) { + return AllocFn.GetAllocationCounterFast ? AllocFn.GetAllocationCounterFast(counter) : 0; + } + + i64 GetAllocationCounterFull(ELFAllocCounter counter) { + return AllocFn.GetAllocationCounterFull ? AllocFn.GetAllocationCounterFull(counter) : 0; + } + + int SetThreadAllocTag(int tag) { + return AllocFn.SetThreadAllocTag ? AllocFn.SetThreadAllocTag(tag) : 0; + } + + TArrayPtr<TPerTagAllocInfo> GetPerTagAllocInfo( + bool flushPerThreadCounters, + int& maxTag, + int& numSizes) { + if (AllocFn.GetPerTagAllocInfo) { + AllocFn.GetPerTagAllocInfo(flushPerThreadCounters, nullptr, maxTag, numSizes); + TArrayPtr<TPerTagAllocInfo> info = new TPerTagAllocInfo[maxTag * numSizes]; + AllocFn.GetPerTagAllocInfo(flushPerThreadCounters, info.Get(), maxTag, numSizes); + return info; + } + maxTag = 0; + numSizes = 0; + return nullptr; + } + + bool SetProfileCurrentThread(bool newVal) { + return AllocFn.SetProfileCurrentThread ? 
AllocFn.SetProfileCurrentThread(newVal) : false; + } + + bool SetProfileAllThreads(bool newVal) { + return AllocFn.SetProfileAllThreads ? AllocFn.SetProfileAllThreads(newVal) : false; + } + + bool SetAllocationSamplingEnabled(bool newVal) { + return AllocFn.SetAllocationSamplingEnabled ? AllocFn.SetAllocationSamplingEnabled(newVal) : false; + } + + size_t SetAllocationSampleRate(size_t newVal) { + return AllocFn.SetAllocationSampleRate ? AllocFn.SetAllocationSampleRate(newVal) : 0; + } + + size_t SetAllocationSampleMaxSize(size_t newVal) { + return AllocFn.SetAllocationSampleMaxSize ? AllocFn.SetAllocationSampleMaxSize(newVal) : 0; + } + + TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) { + return AllocFn.SetAllocationCallback ? AllocFn.SetAllocationCallback(newVal) : nullptr; + } + + TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) { + return AllocFn.SetDeallocationCallback ? AllocFn.SetDeallocationCallback(newVal) : nullptr; + } + +} diff --git a/library/cpp/lfalloc/dbg_info/dbg_info.h b/library/cpp/lfalloc/dbg_info/dbg_info.h new file mode 100644 index 0000000000..071562a81a --- /dev/null +++ b/library/cpp/lfalloc/dbg_info/dbg_info.h @@ -0,0 +1,77 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/system/types.h> + +namespace NAllocDbg { + //////////////////////////////////////////////////////////////////////////////// + // Allocation statistics + + enum ELFAllocCounter { + CT_USER_ALLOC, // accumulated size requested by user code + CT_MMAP, // accumulated mmapped size + CT_MMAP_CNT, // number of mmapped regions + CT_MUNMAP, // accumulated unmmapped size + CT_MUNMAP_CNT, // number of munmaped regions + CT_SYSTEM_ALLOC, // accumulated allocated size for internal lfalloc needs + CT_SYSTEM_FREE, // accumulated deallocated size for internal lfalloc needs + CT_SMALL_ALLOC, // accumulated allocated size for fixed-size blocks + CT_SMALL_FREE, // accumulated deallocated size for fixed-size blocks + CT_LARGE_ALLOC, // accumulated allocated size for large blocks + CT_LARGE_FREE, // accumulated deallocated size for large blocks + CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations + CT_DEGRAGMENT_CNT, // number of memory defragmentations + CT_MAX + }; + + i64 GetAllocationCounterFast(ELFAllocCounter counter); + i64 GetAllocationCounterFull(ELFAllocCounter counter); + + //////////////////////////////////////////////////////////////////////////////// + // Allocation statistics could be tracked on per-tag basis + + int SetThreadAllocTag(int tag); + + class TScopedTag { + private: + int PrevTag; + + public: + explicit TScopedTag(int tag) { + PrevTag = SetThreadAllocTag(tag); + } + + ~TScopedTag() { + SetThreadAllocTag(PrevTag); + } + }; + + struct TPerTagAllocInfo { + ssize_t Count; + ssize_t Size; + }; + + TArrayPtr<TPerTagAllocInfo> GetPerTagAllocInfo( + bool flushPerThreadCounters, + int& maxTag, + int& numSizes); + + //////////////////////////////////////////////////////////////////////////////// + // Allocation sampling could be used to collect detailed information + + bool SetProfileCurrentThread(bool newVal); + bool SetProfileAllThreads(bool newVal); + bool SetAllocationSamplingEnabled(bool newVal); + + size_t SetAllocationSampleRate(size_t newVal); + size_t SetAllocationSampleMaxSize(size_t newVal); + +#define DBG_ALLOC_INVALID_COOKIE (-1) + + using TAllocationCallback = int(int tag, size_t size, int sizeIdx); + using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx); + + TAllocationCallback* 
SetAllocationCallback(TAllocationCallback* newVal); + TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal); + +} diff --git a/library/cpp/lfalloc/dbg_info/ya.make b/library/cpp/lfalloc/dbg_info/ya.make new file mode 100644 index 0000000000..efecba5993 --- /dev/null +++ b/library/cpp/lfalloc/dbg_info/ya.make @@ -0,0 +1,15 @@ +LIBRARY() + +OWNER(vskipin) + +PEERDIR( + library/cpp/malloc/api +) + +SRCS( + dbg_info.cpp +) + +SET(IDE_FOLDER "util") + +END() diff --git a/library/cpp/lfalloc/lf_allocX64.cpp b/library/cpp/lfalloc/lf_allocX64.cpp new file mode 100644 index 0000000000..2eb90761fe --- /dev/null +++ b/library/cpp/lfalloc/lf_allocX64.cpp @@ -0,0 +1,144 @@ +#include "lf_allocX64.h" + +////////////////////////////////////////////////////////////////////////// +// hooks +#if defined(USE_INTELCC) || defined(_darwin_) || defined(_freebsd_) || defined(_STLPORT_VERSION) +#define OP_THROWNOTHING noexcept +#else +#define OP_THROWNOTHING +#endif + +#ifndef _darwin_ +#if !defined(YMAKE) +void* operator new(size_t size) { + return LFAlloc(size); +} + +void* operator new(size_t size, const std::nothrow_t&) OP_THROWNOTHING { + return LFAlloc(size); +} + +void operator delete(void* p)OP_THROWNOTHING { + LFFree(p); +} + +void operator delete(void* p, const std::nothrow_t&)OP_THROWNOTHING { + LFFree(p); +} + +void* operator new[](size_t size) { + return LFAlloc(size); +} + +void* operator new[](size_t size, const std::nothrow_t&) OP_THROWNOTHING { + return LFAlloc(size); +} + +void operator delete[](void* p) OP_THROWNOTHING { + LFFree(p); +} + +void operator delete[](void* p, const std::nothrow_t&) OP_THROWNOTHING { + LFFree(p); +} +#endif + +//#ifndef _MSC_VER + +extern "C" void* malloc(size_t size) { + return LFAlloc(size); +} + +extern "C" void* valloc(size_t size) { + return LFVAlloc(size); +} + +extern "C" int posix_memalign(void** memptr, size_t alignment, size_t size) { + return LFPosixMemalign(memptr, alignment, size); +} + +extern "C" void* memalign(size_t alignment, size_t size) { + void* ptr; + int res = LFPosixMemalign(&ptr, alignment, size); + return res ? nullptr : ptr; +} + +extern "C" void* aligned_alloc(size_t alignment, size_t size) { + return memalign(alignment, size); +} + +#if !defined(_MSC_VER) && !defined(_freebsd_) +// Workaround for pthread_create bug in linux. +extern "C" void* __libc_memalign(size_t alignment, size_t size) { + return memalign(alignment, size); +} +#endif + +extern "C" void free(void* ptr) { + LFFree(ptr); +} + +extern "C" void* calloc(size_t n, size_t elem_size) { + // Overflow check + const size_t size = n * elem_size; + if (elem_size != 0 && size / elem_size != n) + return nullptr; + + void* result = LFAlloc(size); + if (result != nullptr) { + memset(result, 0, size); + } + return result; +} + +extern "C" void cfree(void* ptr) { + LFFree(ptr); +} + +extern "C" void* realloc(void* old_ptr, size_t new_size) { + if (old_ptr == nullptr) { + void* result = LFAlloc(new_size); + return result; + } + if (new_size == 0) { + LFFree(old_ptr); + return nullptr; + } + + void* new_ptr = LFAlloc(new_size); + if (new_ptr == nullptr) { + return nullptr; + } + size_t old_size = LFGetSize(old_ptr); + memcpy(new_ptr, old_ptr, ((old_size < new_size) ? 
old_size : new_size)); + LFFree(old_ptr); + return new_ptr; +} + +extern "C" size_t malloc_usable_size(void* ptr) { + if (ptr == nullptr) { + return 0; + } + return LFGetSize(ptr); +} + +NMalloc::TMallocInfo NMalloc::MallocInfo() { + NMalloc::TMallocInfo r; +#if defined(LFALLOC_DBG) + r.Name = "lfalloc_dbg"; +#elif defined(LFALLOC_YT) + r.Name = "lfalloc_yt"; +#else + r.Name = "lfalloc"; +#endif + r.SetParam = &LFAlloc_SetParam; + r.GetParam = &LFAlloc_GetParam; + return r; +} +#else +NMalloc::TMallocInfo NMalloc::MallocInfo() { + NMalloc::TMallocInfo r; + r.Name = "system-darwin"; + return r; +} +#endif diff --git a/library/cpp/lfalloc/lf_allocX64.h b/library/cpp/lfalloc/lf_allocX64.h new file mode 100644 index 0000000000..fd2a906d6f --- /dev/null +++ b/library/cpp/lfalloc/lf_allocX64.h @@ -0,0 +1,1926 @@ +#pragma once + +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> + +#include <library/cpp/malloc/api/malloc.h> + +#include <util/system/compat.h> +#include <util/system/compiler.h> +#include <util/system/types.h> + +#ifdef _MSC_VER +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#ifdef _M_X64 +#define _64_ +#endif +#include <intrin.h> +#define WIN32_LEAN_AND_MEAN +#include <Windows.h> +#pragma intrinsic(_InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd) + +#include <new> +#include <assert.h> +#include <errno.h> + +#define PERTHREAD __declspec(thread) +#define _win_ +#define Y_FORCE_INLINE __forceinline + +using TAtomic = volatile long; + +static inline long AtomicAdd(TAtomic& a, long b) { + return _InterlockedExchangeAdd(&a, b) + b; +} + +static inline long AtomicSub(TAtomic& a, long b) { + return AtomicAdd(a, -b); +} + +#pragma comment(lib, "synchronization.lib") + +#ifndef NDEBUG +#define Y_ASSERT_NOBT(x) \ + { \ + if (IsDebuggerPresent()) { \ + if (!(x)) \ + __debugbreak(); \ + } else \ + assert(x); \ + } +#else +#define Y_ASSERT_NOBT(x) ((void)0) +#endif + +#else + +#include <util/system/defaults.h> +#include <util/system/atomic.h> +#include <util/system/yassert.h> + +#if !defined(NDEBUG) && !defined(__GCCXML__) +#define Y_ASSERT_NOBT(a) \ + do { \ + try { \ + if (Y_UNLIKELY(!(a))) { \ + if (YaIsDebuggerPresent()) \ + __debugbreak(); \ + else { \ + assert(false && (a)); \ + } \ + } \ + } catch (...) 
{ \ + if (YaIsDebuggerPresent()) \ + __debugbreak(); \ + else { \ + assert(false && "Exception during assert"); \ + } \ + } \ + } while (0) +#else +#define Y_ASSERT_NOBT(a) \ + do { \ + if (false) { \ + bool __xxx = static_cast<bool>(a); \ + Y_UNUSED(__xxx); \ + } \ + } while (0) +#endif + +#include <pthread.h> +#include <sys/mman.h> +#include <stdlib.h> +#include <memory.h> +#include <new> +#include <errno.h> + +#if defined(_linux_) +#include <linux/futex.h> +#include <sys/syscall.h> +#if !defined(MADV_HUGEPAGE) +#define MADV_HUGEPAGE 14 +#endif +#if !defined(MAP_HUGETLB) +#define MAP_HUGETLB 0x40000 +#endif +#endif + +#define PERTHREAD __thread + +#endif + +#ifndef _darwin_ + +#ifndef Y_ARRAY_SIZE +#define Y_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) +#endif + +#ifndef NDEBUG +#define DBG_FILL_MEMORY +static bool FillMemoryOnAllocation = true; +#endif + +static bool TransparentHugePages = false; // force MADV_HUGEPAGE for large allocs +static bool MapHugeTLB = false; // force MAP_HUGETLB for small allocs +static bool EnableDefrag = true; + +// Buffers that are larger than this size will not be filled with 0xcf +#ifndef DBG_FILL_MAX_SIZE +#define DBG_FILL_MAX_SIZE 0x01000000000000ULL +#endif + +template <class T> +inline T* DoCas(T* volatile* target, T* exchange, T* compare) { +#if defined(__has_builtin) && __has_builtin(__sync_val_compare_and_swap) + return __sync_val_compare_and_swap(target, compare, exchange); +#elif defined(_WIN32) +#ifdef _64_ + return (T*)_InterlockedCompareExchange64((__int64*)target, (__int64)exchange, (__int64)compare); +#else + //return (T*)InterlockedCompareExchangePointer(targetVoidP, exchange, compare); + return (T*)_InterlockedCompareExchange((LONG*)target, (LONG)exchange, (LONG)compare); +#endif +#elif defined(__i386) || defined(__x86_64__) + union { + T* volatile* NP; + void* volatile* VoidP; + } gccSucks; + gccSucks.NP = target; + void* volatile* targetVoidP = gccSucks.VoidP; + + __asm__ __volatile__( + "lock\n\t" + "cmpxchg %2,%0\n\t" + : "+m"(*(targetVoidP)), "+a"(compare) + : "r"(exchange) + : "cc", "memory"); + return compare; +#else +#error inline_cas not defined for this platform +#endif +} + +#ifdef _64_ +const uintptr_t N_MAX_WORKSET_SIZE = 0x100000000ll * 200; +const uintptr_t N_HUGE_AREA_FINISH = 0x700000000000ll; +#ifndef _freebsd_ +const uintptr_t LINUX_MMAP_AREA_START = 0x100000000ll; +static uintptr_t volatile linuxAllocPointer = LINUX_MMAP_AREA_START; +static uintptr_t volatile linuxAllocPointerHuge = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE; +#endif +#else +const uintptr_t N_MAX_WORKSET_SIZE = 0xffffffff; +#endif +#define ALLOC_START ((char*)0) + +const size_t N_CHUNK_SIZE = 1024 * 1024; +const size_t N_CHUNKS = N_MAX_WORKSET_SIZE / N_CHUNK_SIZE; +const size_t N_LARGE_ALLOC_SIZE = N_CHUNK_SIZE * 128; + +// map size idx to size in bytes +#ifdef LFALLOC_YT +const int N_SIZES = 27; +#else +const int N_SIZES = 25; +#endif +const int nSizeIdxToSize[N_SIZES] = { + -1, +#if defined(_64_) + 16, 16, 32, 32, 48, 64, 96, 128, +#else + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, +#endif + 192, 256, 384, 512, 768, 1024, 1536, 2048, + 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, +#ifdef LFALLOC_YT + 49152, 65536 +#endif +}; +#ifdef LFALLOC_YT +const size_t N_MAX_FAST_SIZE = 65536; +#else +const size_t N_MAX_FAST_SIZE = 32768; +#endif +const unsigned char size2idxArr1[64 + 1] = { + 1, +#if defined(_64_) + 2, 2, 4, 4, // 16, 16, 32, 32 +#else + 1, 2, 3, 4, // 8, 16, 24, 32 +#endif + 5, 5, 6, 6, // 48, 64 + 7, 7, 7, 7, 8, 8, 8, 8, // 96, 128 + 
9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, // 192, 256 + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // 384 + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12 // 512 +}; +#ifdef LFALLOC_YT +const unsigned char size2idxArr2[256] = { +#else +const unsigned char size2idxArr2[128] = { +#endif + 12, 12, 13, 14, // 512, 512, 768, 1024 + 15, 15, 16, 16, // 1536, 2048 + 17, 17, 17, 17, 18, 18, 18, 18, // 3072, 4096 + 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, // 6144, 8192 + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, // 12288 + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, // 16384 + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, // 24576 + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, // 32768 +#ifdef LFALLOC_YT + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, // 49152 + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, // 65536 +#endif +}; + +// map entry number to size idx +// special size idx's: 0 = not used, -1 = mem locked, but not allocated +static volatile char chunkSizeIdx[N_CHUNKS]; +const int FREE_CHUNK_ARR_BUF = 0x20000; // this is effectively 128G of free memory (with 1M chunks), should not be exhausted actually +static volatile uintptr_t freeChunkArr[FREE_CHUNK_ARR_BUF]; +static volatile int freeChunkCount; + +static void AddFreeChunk(uintptr_t chunkId) { + chunkSizeIdx[chunkId] = -1; + if (Y_UNLIKELY(freeChunkCount == FREE_CHUNK_ARR_BUF)) + NMalloc::AbortFromCorruptedAllocator("free chunks array overflowed"); + freeChunkArr[freeChunkCount++] = chunkId; +} + +static bool GetFreeChunk(uintptr_t* res) { + if (freeChunkCount == 0) { + *res = 0; + return false; + } + *res = freeChunkArr[--freeChunkCount]; + return true; +} + +////////////////////////////////////////////////////////////////////////// +enum ELFAllocCounter { + CT_USER_ALLOC, // accumulated size requested by user code + CT_MMAP, // accumulated mmapped size + CT_MMAP_CNT, // number of mmapped regions + CT_MUNMAP, // accumulated unmmapped size + CT_MUNMAP_CNT, // number of munmaped regions + CT_SYSTEM_ALLOC, // accumulated allocated size for internal lfalloc needs + CT_SYSTEM_FREE, // accumulated deallocated size for internal lfalloc needs + CT_SMALL_ALLOC, // accumulated allocated size for fixed-size blocks + CT_SMALL_FREE, // accumulated deallocated size for fixed-size blocks + CT_LARGE_ALLOC, // accumulated allocated size for large blocks + CT_LARGE_FREE, // accumulated deallocated size for large blocks + CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations + CT_DEGRAGMENT_CNT, // number of memory defragmentations + CT_MAX +}; + +static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value); + +////////////////////////////////////////////////////////////////////////// +enum EMMapMode { + MM_NORMAL, // memory for small allocs + MM_HUGE // memory for large allocs +}; + +#ifndef _MSC_VER +inline void 
VerifyMmapResult(void* result) { + if (Y_UNLIKELY(result == MAP_FAILED)) + NMalloc::AbortFromCorruptedAllocator("negative size requested? or just out of mem"); +} +#endif + +#if !defined(_MSC_VER) && !defined(_freebsd_) && defined(_64_) +static char* AllocWithMMapLinuxImpl(uintptr_t sz, EMMapMode mode) { + char* volatile* areaPtr; + char* areaStart; + uintptr_t areaFinish; + + int mapProt = PROT_READ | PROT_WRITE; + int mapFlags = MAP_PRIVATE | MAP_ANON; + + if (mode == MM_HUGE) { + areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointerHuge); + areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE); + areaFinish = N_HUGE_AREA_FINISH; + } else { + areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointer); + areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START); + areaFinish = N_MAX_WORKSET_SIZE; + + if (MapHugeTLB) { + mapFlags |= MAP_HUGETLB; + } + } + + bool wrapped = false; + for (;;) { + char* prevAllocPtr = *areaPtr; + char* nextAllocPtr = prevAllocPtr + sz; + if (uintptr_t(nextAllocPtr - (char*)nullptr) >= areaFinish) { + if (Y_UNLIKELY(wrapped)) { + NMalloc::AbortFromCorruptedAllocator("virtual memory is over fragmented"); + } + // wrap after all area is used + DoCas(areaPtr, areaStart, prevAllocPtr); + wrapped = true; + continue; + } + + if (DoCas(areaPtr, nextAllocPtr, prevAllocPtr) != prevAllocPtr) + continue; + + char* largeBlock = (char*)mmap(prevAllocPtr, sz, mapProt, mapFlags, -1, 0); + VerifyMmapResult(largeBlock); + if (largeBlock == prevAllocPtr) + return largeBlock; + if (largeBlock) + munmap(largeBlock, sz); + + if (sz < 0x80000) { + // skip utilized area with big steps + DoCas(areaPtr, nextAllocPtr + 0x10 * 0x10000, nextAllocPtr); + } + } +} +#endif + +static char* AllocWithMMap(uintptr_t sz, EMMapMode mode) { + (void)mode; +#ifdef _MSC_VER + char* largeBlock = (char*)VirtualAlloc(0, sz, MEM_RESERVE, PAGE_READWRITE); + if (Y_UNLIKELY(largeBlock == nullptr)) + NMalloc::AbortFromCorruptedAllocator("out of memory"); + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE)) + NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken"); +#else +#if defined(_freebsd_) || !defined(_64_) + char* largeBlock = (char*)mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + VerifyMmapResult(largeBlock); + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE)) + NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken"); +#else + char* largeBlock = AllocWithMMapLinuxImpl(sz, mode); + if (TransparentHugePages) { + madvise(largeBlock, sz, MADV_HUGEPAGE); + } +#endif +#endif + Y_ASSERT_NOBT(largeBlock); + IncrementCounter(CT_MMAP, sz); + IncrementCounter(CT_MMAP_CNT, 1); + return largeBlock; +} + +enum class ELarge : ui8 { + Free = 0, // block in free cache + Alloc = 1, // block is allocated + Gone = 2, // block was unmapped +}; + +struct TLargeBlk { + + static TLargeBlk* As(void *raw) { + return reinterpret_cast<TLargeBlk*>((char*)raw - 4096ll); + } + + static const TLargeBlk* As(const void *raw) { + return reinterpret_cast<const TLargeBlk*>((const char*)raw - 4096ll); + } + + void SetSize(size_t bytes, size_t pages) { + Pages = pages; + Bytes = bytes; + } + + void Mark(ELarge state) { + const ui64 marks[] = { + 0x8b38aa5ca4953c98, // ELarge::Free + 0xf916d33584eb5087, // ELarge::Alloc + 0xd33b0eca7651bc3f // ELarge::Gone + }; + + Token = size_t(marks[ui8(state)]); + } + + size_t Pages; // Total pages allocated with mmap 
like call + size_t Bytes; // Actually requested bytes by user + size_t Token; // Block state token, see ELarge enum. +}; + + +static void LargeBlockUnmap(void* p, size_t pages) { + const auto bytes = (pages + 1) * uintptr_t(4096); + + IncrementCounter(CT_MUNMAP, bytes); + IncrementCounter(CT_MUNMAP_CNT, 1); +#ifdef _MSC_VER + Y_ASSERT_NOBT(0); +#else + TLargeBlk::As(p)->Mark(ELarge::Gone); + munmap((char*)p - 4096ll, bytes); +#endif +} + +////////////////////////////////////////////////////////////////////////// +const size_t LB_BUF_SIZE = 250; +const size_t LB_BUF_HASH = 977; +static int LB_LIMIT_TOTAL_SIZE = 500 * 1024 * 1024 / 4096; // do not keep more then this mem total in lbFreePtrs[] +static void* volatile lbFreePtrs[LB_BUF_HASH][LB_BUF_SIZE]; +static TAtomic lbFreePageCount; + + +static void* LargeBlockAlloc(size_t _nSize, ELFAllocCounter counter) { + size_t pgCount = (_nSize + 4095) / 4096; +#ifdef _MSC_VER + char* pRes = (char*)VirtualAlloc(0, (pgCount + 1) * 4096ll, MEM_COMMIT, PAGE_READWRITE); + if (Y_UNLIKELY(pRes == 0)) { + NMalloc::AbortFromCorruptedAllocator("out of memory"); + } +#else + + IncrementCounter(counter, pgCount * 4096ll); + IncrementCounter(CT_SYSTEM_ALLOC, 4096ll); + + int lbHash = pgCount % LB_BUF_HASH; + for (int i = 0; i < LB_BUF_SIZE; ++i) { + void* p = lbFreePtrs[lbHash][i]; + if (p == nullptr) + continue; + if (DoCas(&lbFreePtrs[lbHash][i], (void*)nullptr, p) == p) { + size_t realPageCount = TLargeBlk::As(p)->Pages; + if (realPageCount == pgCount) { + AtomicAdd(lbFreePageCount, -pgCount); + TLargeBlk::As(p)->Mark(ELarge::Alloc); + return p; + } else { + if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) != (void*)nullptr) { + // block was freed while we were busy + AtomicAdd(lbFreePageCount, -realPageCount); + LargeBlockUnmap(p, realPageCount); + --i; + } + } + } + } + char* pRes = AllocWithMMap((pgCount + 1) * 4096ll, MM_HUGE); +#endif + pRes += 4096ll; + TLargeBlk::As(pRes)->SetSize(_nSize, pgCount); + TLargeBlk::As(pRes)->Mark(ELarge::Alloc); + + return pRes; +} + +#ifndef _MSC_VER +static void FreeAllLargeBlockMem() { + for (auto& lbFreePtr : lbFreePtrs) { + for (int i = 0; i < LB_BUF_SIZE; ++i) { + void* p = lbFreePtr[i]; + if (p == nullptr) + continue; + if (DoCas(&lbFreePtr[i], (void*)nullptr, p) == p) { + int pgCount = TLargeBlk::As(p)->Pages; + AtomicAdd(lbFreePageCount, -pgCount); + LargeBlockUnmap(p, pgCount); + } + } + } +} +#endif + +static void LargeBlockFree(void* p, ELFAllocCounter counter) { + if (p == nullptr) + return; +#ifdef _MSC_VER + VirtualFree((char*)p - 4096ll, 0, MEM_RELEASE); +#else + size_t pgCount = TLargeBlk::As(p)->Pages; + + TLargeBlk::As(p)->Mark(ELarge::Free); + IncrementCounter(counter, pgCount * 4096ll); + IncrementCounter(CT_SYSTEM_FREE, 4096ll); + + if (lbFreePageCount > LB_LIMIT_TOTAL_SIZE) + FreeAllLargeBlockMem(); + int lbHash = pgCount % LB_BUF_HASH; + for (int i = 0; i < LB_BUF_SIZE; ++i) { + if (lbFreePtrs[lbHash][i] == nullptr) { + if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) == nullptr) { + AtomicAdd(lbFreePageCount, pgCount); + return; + } + } + } + + LargeBlockUnmap(p, pgCount); +#endif +} + +static void* SystemAlloc(size_t _nSize) { + //HeapAlloc(GetProcessHeap(), HEAP_GENERATE_EXCEPTIONS, _nSize); + return LargeBlockAlloc(_nSize, CT_SYSTEM_ALLOC); +} +static void SystemFree(void* p) { + //HeapFree(GetProcessHeap(), 0, p); + LargeBlockFree(p, CT_SYSTEM_FREE); +} + + +////////////////////////////////////////////////////////////////////////// +char* const LF_LOCK_FREE = ((char*)0) + 0; +char* 
const LF_LOCK_LOCKED = ((char*)0) + 1; +char* const LF_LOCK_FUTEX_WAIT = ((char*)0) + 2; +static bool LFHasFutex = true; +static bool LFCheckedWinVersion = false; + +// TLFLockData has to be zero-initialized explicitly https://en.cppreference.com/w/cpp/language/zero_initialization +// otherwise constructor TLFLockData() for global var might be called after first use +struct TLFLockData +{ + char* Pad1[15]; + char* volatile LockVar; // = LF_LOCK_FREE; // no constructor, zero-initialize manually + char* Pad2[15]; + + bool TryLock() + { + return (LockVar == LF_LOCK_FREE && DoCas(&LockVar, LF_LOCK_LOCKED, LF_LOCK_FREE) == LF_LOCK_FREE); + } + + void FutexWait() + { +#ifdef _win_ + if (!LFCheckedWinVersion) { + OSVERSIONINFOA ver; + memset(&ver, 0, sizeof(ver)); + ver.dwOSVersionInfoSize = sizeof(OSVERSIONINFOA); + GetVersionExA(&ver); + LFHasFutex = (ver.dwMajorVersion > 6) || (ver.dwMajorVersion == 6 && ver.dwMinorVersion >= 2); + LFCheckedWinVersion = true; + } + if (LFHasFutex) { + if (LockVar == LF_LOCK_LOCKED) { + DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED); + } + if (LockVar == LF_LOCK_FUTEX_WAIT) { + char* lockedValue = LF_LOCK_FUTEX_WAIT; + WaitOnAddress(&LockVar, &lockedValue, sizeof(LockVar), INFINITE); + } + } else { + SwitchToThread(); + } +#elif defined(_linux_) + if (LFHasFutex) { + if (LockVar == LF_LOCK_LOCKED) { + DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED); + } + if (LockVar == LF_LOCK_FUTEX_WAIT) { + // linux allow only int variables checks, here we pretend low bits of LockVar are int + syscall(SYS_futex, &LockVar, FUTEX_WAIT_PRIVATE, *(int*)&LF_LOCK_FUTEX_WAIT, 0, 0, 0); + } + } else { + sched_yield(); + } +#else + sched_yield(); +#endif + } + + void Unlock() + { + Y_ASSERT_NOBT(LockVar != LF_LOCK_FREE); + if (DoCas(&LockVar, LF_LOCK_FREE, LF_LOCK_LOCKED) != LF_LOCK_LOCKED) { + Y_ASSERT_NOBT(LockVar == LF_LOCK_FUTEX_WAIT && LFHasFutex); + LockVar = LF_LOCK_FREE; +#ifdef _win_ + WakeByAddressAll((PVOID)&LockVar); +#elif defined(_linux_) + syscall(SYS_futex, &LockVar, FUTEX_WAKE_PRIVATE, INT_MAX, 0, 0, 0); +#endif + } + } +}; + +static TLFLockData LFGlobalLock; + + +class TLFLockHolder { + TLFLockData *LockData = nullptr; + int Attempt = 0; + int SleepMask = 0x7f; + +public: + TLFLockHolder() {} + TLFLockHolder(TLFLockData *lk) { + while (!TryLock(lk)); + } + bool TryLock(TLFLockData *lk) + { + Y_ASSERT_NOBT(LockData == nullptr); + if (lk->TryLock()) { + LockData = lk; + return true; + } + if ((++Attempt & SleepMask) == 0) { + lk->FutexWait(); + SleepMask = (SleepMask * 2 + 1) & 0x7fff; + } else { +#ifdef _MSC_VER + _mm_pause(); +#elif defined(__i386) || defined(__x86_64__) + __asm__ __volatile__("pause"); +#endif + } + return false; + } + ~TLFLockHolder() { + if (LockData) { + LockData->Unlock(); + } + } +}; + +////////////////////////////////////////////////////////////////////////// +class TLFAllocFreeList { + struct TNode { + TNode* Next; + }; + + TNode* volatile Head; + TNode* volatile Pending; + TAtomic PendingToFreeListCounter; + TAtomic AllocCount; + void* Padding; + + static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) { + for (;;) { + TNode* volatile prevHead = *headPtr; + n->Next = prevHead; + if (DoCas(headPtr, n, prevHead) == prevHead) + break; + } + } + Y_FORCE_INLINE void* DoAlloc() { + TNode* res; + for (res = Head; res; res = Head) { + TNode* keepNext = res->Next; + if (DoCas(&Head, keepNext, res) == res) { + //Y_VERIFY(keepNext == res->Next); + break; + } + } + return res; + } + void FreeList(TNode* fl) { + if (!fl) + 
return; + TNode* flTail = fl; + while (flTail->Next) + flTail = flTail->Next; + for (;;) { + TNode* volatile prevHead = Head; + flTail->Next = prevHead; + if (DoCas(&Head, fl, prevHead) == prevHead) + break; + } + } + +public: + Y_FORCE_INLINE void Free(void* ptr) { + TNode* newFree = (TNode*)ptr; + if (AtomicAdd(AllocCount, 0) == 0) + Enqueue(&Head, newFree); + else + Enqueue(&Pending, newFree); + } + Y_FORCE_INLINE void* Alloc() { + TAtomic keepCounter = AtomicAdd(PendingToFreeListCounter, 0); + TNode* fl = Pending; + if (AtomicAdd(AllocCount, 1) == 1) { + // No other allocs in progress. + // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads. + // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList + if (fl && keepCounter == AtomicAdd(PendingToFreeListCounter, 0) && DoCas(&Pending, (TNode*)nullptr, fl) == fl) { + // pick first element from Pending and return it + void* res = fl; + fl = fl->Next; + // if there are other elements in Pending list, add them to main free list + FreeList(fl); + AtomicAdd(PendingToFreeListCounter, 1); + AtomicAdd(AllocCount, -1); + return res; + } + } + void* res = DoAlloc(); + AtomicAdd(AllocCount, -1); + return res; + } + void* GetWholeList() { + TNode* res; + for (res = Head; res; res = Head) { + if (DoCas(&Head, (TNode*)nullptr, res) == res) + break; + } + return res; + } + void ReturnWholeList(void* ptr) { + while (AtomicAdd(AllocCount, 0) != 0) // theoretically can run into problems with parallel DoAlloc() + ; //ThreadYield(); + for (;;) { + TNode* prevHead = Head; + if (DoCas(&Head, (TNode*)ptr, prevHead) == prevHead) { + FreeList(prevHead); + break; + } + } + } +}; + +///////////////////////////////////////////////////////////////////////// +static TLFAllocFreeList globalFreeLists[N_SIZES]; +static char* volatile globalCurrentPtr[N_SIZES]; +static TLFAllocFreeList blockFreeList; + +// globalFreeLists[] contains TFreeListGroup, each of them points up to 15 free blocks +const int FL_GROUP_SIZE = 15; +struct TFreeListGroup { + TFreeListGroup* Next; + char* Ptrs[FL_GROUP_SIZE]; +}; +#ifdef _64_ +const int FREE_LIST_GROUP_SIZEIDX = 8; +#else +const int FREE_LIST_GROUP_SIZEIDX = 6; +#endif + +////////////////////////////////////////////////////////////////////////// +// find free chunks and reset chunk size so they can be reused by different sized allocations +// do not look at blockFreeList (TFreeListGroup has same size for any allocations) +static bool DefragmentMem() { + if (!EnableDefrag) { + return false; + } + + IncrementCounter(CT_DEGRAGMENT_CNT, 1); + + int* nFreeCount = (int*)SystemAlloc(N_CHUNKS * sizeof(int)); + if (Y_UNLIKELY(!nFreeCount)) { + //__debugbreak(); + NMalloc::AbortFromCorruptedAllocator("debugbreak"); + } + memset(nFreeCount, 0, N_CHUNKS * sizeof(int)); + + TFreeListGroup* wholeLists[N_SIZES]; + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) { + wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList(); + for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) { + for (auto pData : g->Ptrs) { + if (pData) { + uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE; + ++nFreeCount[nChunk]; + Y_ASSERT_NOBT(chunkSizeIdx[nChunk] == nSizeIdx); + } + } + } + } + + bool bRes = false; + for (size_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) { + int fc = nFreeCount[nChunk]; + nFreeCount[nChunk] = 0; + if (chunkSizeIdx[nChunk] <= 0) + continue; + int nEntries = N_CHUNK_SIZE / 
nSizeIdxToSize[static_cast<int>(chunkSizeIdx[nChunk])]; + Y_ASSERT_NOBT(fc <= nEntries); // can not have more free blocks then total count + if (fc == nEntries) { + bRes = true; + nFreeCount[nChunk] = 1; + } + } + if (bRes) { + for (auto& wholeList : wholeLists) { + TFreeListGroup** ppPtr = &wholeList; + while (*ppPtr) { + TFreeListGroup* g = *ppPtr; + int dst = 0; + for (auto pData : g->Ptrs) { + if (pData) { + uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE; + if (nFreeCount[nChunk] == 0) + g->Ptrs[dst++] = pData; // block is not freed, keep pointer + } + } + if (dst == 0) { + // no valid pointers in group, free it + *ppPtr = g->Next; + blockFreeList.Free(g); + } else { + // reset invalid pointers to 0 + for (int i = dst; i < FL_GROUP_SIZE; ++i) + g->Ptrs[i] = nullptr; + ppPtr = &g->Next; + } + } + } + for (uintptr_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) { + if (!nFreeCount[nChunk]) + continue; + char* pStart = ALLOC_START + nChunk * N_CHUNK_SIZE; +#ifdef _win_ + VirtualFree(pStart, N_CHUNK_SIZE, MEM_DECOMMIT); +#elif defined(_freebsd_) + madvise(pStart, N_CHUNK_SIZE, MADV_FREE); +#else + madvise(pStart, N_CHUNK_SIZE, MADV_DONTNEED); +#endif + AddFreeChunk(nChunk); + } + } + + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) + globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]); + + SystemFree(nFreeCount); + return bRes; +} + +static Y_FORCE_INLINE void* LFAllocFromCurrentChunk(int nSizeIdx, int blockSize, int count) { + char* volatile* pFreeArray = &globalCurrentPtr[nSizeIdx]; + while (char* newBlock = *pFreeArray) { + char* nextFree = newBlock + blockSize * count; + + // check if there is space in chunk + char* globalEndPtr = ALLOC_START + ((newBlock - ALLOC_START) & ~((uintptr_t)N_CHUNK_SIZE - 1)) + N_CHUNK_SIZE; + if (nextFree >= globalEndPtr) { + if (nextFree > globalEndPtr) + break; + nextFree = nullptr; // it was last block in chunk + } + if (DoCas(pFreeArray, nextFree, newBlock) == newBlock) + return newBlock; + } + return nullptr; +} + +enum EDefrag { + MEM_DEFRAG, + NO_MEM_DEFRAG, +}; + +static void* SlowLFAlloc(int nSizeIdx, int blockSize, EDefrag defrag) { + IncrementCounter(CT_SLOW_ALLOC_CNT, 1); + + TLFLockHolder ls; + for (;;) { + bool locked = ls.TryLock(&LFGlobalLock); + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1); + if (res) { + return res; // might happen when other thread allocated new current chunk + } + if (locked) { + break; + } + } + for (;;) { + uintptr_t nChunk; + if (GetFreeChunk(&nChunk)) { + char* newPlace = ALLOC_START + nChunk * N_CHUNK_SIZE; +#ifdef _MSC_VER + void* pTest = VirtualAlloc(newPlace, N_CHUNK_SIZE, MEM_COMMIT, PAGE_READWRITE); + Y_ASSERT_NOBT(pTest == newPlace); +#endif + chunkSizeIdx[nChunk] = (char)nSizeIdx; + globalCurrentPtr[nSizeIdx] = newPlace + blockSize; + return newPlace; + } + + // out of luck, try to defrag + if (defrag == MEM_DEFRAG && DefragmentMem()) { + continue; + } + + char* largeBlock = AllocWithMMap(N_LARGE_ALLOC_SIZE, MM_NORMAL); + uintptr_t addr = ((largeBlock - ALLOC_START) + N_CHUNK_SIZE - 1) & (~(N_CHUNK_SIZE - 1)); + uintptr_t endAddr = ((largeBlock - ALLOC_START) + N_LARGE_ALLOC_SIZE) & (~(N_CHUNK_SIZE - 1)); + for (uintptr_t p = addr; p < endAddr; p += N_CHUNK_SIZE) { + uintptr_t chunk = p / N_CHUNK_SIZE; + Y_ASSERT_NOBT(chunk * N_CHUNK_SIZE == p); + Y_ASSERT_NOBT(chunkSizeIdx[chunk] == 0); + AddFreeChunk(chunk); + } + } + return nullptr; +} + +// allocate single block +static Y_FORCE_INLINE void* LFAllocNoCache(int nSizeIdx, EDefrag defrag) { + int blockSize = 
nSizeIdxToSize[nSizeIdx]; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1); + if (res) + return res; + + return SlowLFAlloc(nSizeIdx, blockSize, defrag); +} + +// allocate multiple blocks, returns number of blocks allocated (max FL_GROUP_SIZE) +// buf should have space for at least FL_GROUP_SIZE elems +static Y_FORCE_INLINE int LFAllocNoCacheMultiple(int nSizeIdx, char** buf) { + int blockSize = nSizeIdxToSize[nSizeIdx]; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, FL_GROUP_SIZE); + if (res) { + char* resPtr = (char*)res; + for (int k = 0; k < FL_GROUP_SIZE; ++k) { + buf[k] = resPtr; + resPtr += blockSize; + } + return FL_GROUP_SIZE; + } + buf[0] = (char*)SlowLFAlloc(nSizeIdx, blockSize, MEM_DEFRAG); + return 1; +} + +// take several blocks from global free list (max FL_GROUP_SIZE blocks), returns number of blocks taken +// buf should have space for at least FL_GROUP_SIZE elems +static Y_FORCE_INLINE int TakeBlocksFromGlobalFreeList(int nSizeIdx, char** buf) { + TLFAllocFreeList& fl = globalFreeLists[nSizeIdx]; + TFreeListGroup* g = (TFreeListGroup*)fl.Alloc(); + if (g) { + int resCount = 0; + for (auto& ptr : g->Ptrs) { + if (ptr) + buf[resCount++] = ptr; + else + break; + } + blockFreeList.Free(g); + return resCount; + } + return 0; +} + +// add several blocks to global free list +static Y_FORCE_INLINE void PutBlocksToGlobalFreeList(ptrdiff_t nSizeIdx, char** buf, int count) { + for (int startIdx = 0; startIdx < count;) { + TFreeListGroup* g = (TFreeListGroup*)blockFreeList.Alloc(); + Y_ASSERT_NOBT(sizeof(TFreeListGroup) == nSizeIdxToSize[FREE_LIST_GROUP_SIZEIDX]); + if (!g) { + g = (TFreeListGroup*)LFAllocNoCache(FREE_LIST_GROUP_SIZEIDX, NO_MEM_DEFRAG); + } + + int groupSize = count - startIdx; + if (groupSize > FL_GROUP_SIZE) + groupSize = FL_GROUP_SIZE; + for (int i = 0; i < groupSize; ++i) + g->Ptrs[i] = buf[startIdx + i]; + for (int i = groupSize; i < FL_GROUP_SIZE; ++i) + g->Ptrs[i] = nullptr; + + // add free group to the global list + TLFAllocFreeList& fl = globalFreeLists[nSizeIdx]; + fl.Free(g); + + startIdx += groupSize; + } +} + +////////////////////////////////////////////////////////////////////////// +static TAtomic GlobalCounters[CT_MAX]; +const int MAX_LOCAL_UPDATES = 100; +const intptr_t MAX_LOCAL_DELTA = 1*1024*1024; + +struct TLocalCounter { + intptr_t Value; + int Updates; + TAtomic* Parent; + + Y_FORCE_INLINE void Init(TAtomic* parent) { + Parent = parent; + Value = 0; + Updates = 0; + } + + Y_FORCE_INLINE void Increment(size_t value) { + Value += value; + if (++Updates > MAX_LOCAL_UPDATES || Value > MAX_LOCAL_DELTA) { + Flush(); + } + } + + Y_FORCE_INLINE void Flush() { + AtomicAdd(*Parent, Value); + Value = 0; + Updates = 0; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// DBG stuff +//////////////////////////////////////////////////////////////////////////////// + +#if defined(LFALLOC_DBG) + +struct TPerTagAllocCounter { + TAtomic Size; + TAtomic Count; + + Y_FORCE_INLINE void Alloc(size_t size) { + AtomicAdd(Size, size); + AtomicAdd(Count, 1); + } + + Y_FORCE_INLINE void Free(size_t size) { + AtomicSub(Size, size); + AtomicSub(Count, 1); + } +}; + +struct TLocalPerTagAllocCounter { + intptr_t Size; + int Count; + int Updates; + + Y_FORCE_INLINE void Init() { + Size = 0; + Count = 0; + Updates = 0; + } + + Y_FORCE_INLINE void Alloc(TPerTagAllocCounter& parent, size_t size) { + Size += size; + ++Count; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(parent); + } + } + + Y_FORCE_INLINE void 
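+/*
+   TLocalCounter above (and the per-tag counters in this debug section) follow
+   the same pattern: each thread accumulates statistics privately and touches
+   the shared atomic only once per MAX_LOCAL_UPDATES increments or once
+   MAX_LOCAL_DELTA bytes have accumulated, which keeps the hot allocation path
+   free of contended read-modify-write traffic. A self-contained sketch of the
+   idea, with std::atomic standing in for TAtomic and invented names
+   (TBatchedCounter; kMaxUpdates and kMaxDelta mirror the constants above):
+
+       #include <atomic>
+       #include <cstdint>
+
+       struct TBatchedCounter {
+           static constexpr int kMaxUpdates = 100;          // MAX_LOCAL_UPDATES
+           static constexpr intptr_t kMaxDelta = 1 << 20;   // MAX_LOCAL_DELTA (1 MiB)
+
+           std::atomic<intptr_t> Global{0};
+
+           struct TLocal {
+               intptr_t Value = 0;
+               int Updates = 0;
+           };
+
+           void Increment(TLocal& local, size_t value) {
+               local.Value += static_cast<intptr_t>(value);
+               if (++local.Updates > kMaxUpdates || local.Value > kMaxDelta) {
+                   // One shared RMW per batch instead of one per allocation.
+                   Global.fetch_add(local.Value, std::memory_order_relaxed);
+                   local.Value = 0;
+                   local.Updates = 0;
+               }
+           }
+       };
+*/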
Free(TPerTagAllocCounter& parent, size_t size) { + Size -= size; + --Count; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(parent); + } + } + + Y_FORCE_INLINE void Flush(TPerTagAllocCounter& parent) { + AtomicAdd(parent.Size, Size); + Size = 0; + AtomicAdd(parent.Count, Count); + Count = 0; + Updates = 0; + } +}; + +static const int DBG_ALLOC_MAX_TAG = 1000; +static const int DBG_ALLOC_ALIGNED_TAG = 0xF0000000; +static const int DBG_ALLOC_NUM_SIZES = 30; +static TPerTagAllocCounter GlobalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES]; + +#endif // LFALLOC_DBG + +////////////////////////////////////////////////////////////////////////// +const int THREAD_BUF = 256; +static int borderSizes[N_SIZES]; +const int MAX_MEM_PER_SIZE_PER_THREAD = 512 * 1024; +struct TThreadAllocInfo { + // FreePtrs - pointers to first free blocks in per thread block list + // LastFreePtrs - pointers to last blocks in lists, may be invalid if FreePtr is zero + char* FreePtrs[N_SIZES][THREAD_BUF]; + int FreePtrIndex[N_SIZES]; + TThreadAllocInfo* pNextInfo; + TLocalCounter LocalCounters[CT_MAX]; + +#if defined(LFALLOC_DBG) + TLocalPerTagAllocCounter LocalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES]; +#endif +#ifdef _win_ + HANDLE hThread; +#endif + + void Init(TThreadAllocInfo** pHead) { + memset(this, 0, sizeof(*this)); + for (auto& i : FreePtrIndex) + i = THREAD_BUF; +#ifdef _win_ + BOOL b = DuplicateHandle( + GetCurrentProcess(), GetCurrentThread(), + GetCurrentProcess(), &hThread, + 0, FALSE, DUPLICATE_SAME_ACCESS); + Y_ASSERT_NOBT(b); +#endif + pNextInfo = *pHead; + *pHead = this; + for (int k = 0; k < N_SIZES; ++k) { + int maxCount = MAX_MEM_PER_SIZE_PER_THREAD / nSizeIdxToSize[k]; + if (maxCount > THREAD_BUF) + maxCount = THREAD_BUF; + borderSizes[k] = THREAD_BUF - maxCount; + } + for (int i = 0; i < CT_MAX; ++i) { + LocalCounters[i].Init(&GlobalCounters[i]); + } +#if defined(LFALLOC_DBG) + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = LocalPerTagAllocCounters[tag][sizeIdx]; + local.Init(); + } + } +#endif + } + void Done() { + for (auto sizeIdx : FreePtrIndex) { + Y_ASSERT_NOBT(sizeIdx == THREAD_BUF); + } + for (auto& localCounter : LocalCounters) { + localCounter.Flush(); + } +#if defined(LFALLOC_DBG) + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = LocalPerTagAllocCounters[tag][sizeIdx]; + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + local.Flush(global); + } + } +#endif +#ifdef _win_ + if (hThread) + CloseHandle(hThread); +#endif + } +}; +PERTHREAD TThreadAllocInfo* pThreadInfo; +static TThreadAllocInfo* pThreadInfoList; + +static TLFLockData LFLockThreadInfo; + +static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value) { +#ifdef LFALLOC_YT + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + thr->LocalCounters[counter].Increment(value); + } else { + AtomicAdd(GlobalCounters[counter], value); + } +#endif +} + +extern "C" i64 GetLFAllocCounterFast(int counter) { +#ifdef LFALLOC_YT + return GlobalCounters[counter]; +#else + return 0; +#endif +} + +extern "C" i64 GetLFAllocCounterFull(int counter) { +#ifdef LFALLOC_YT + i64 ret = GlobalCounters[counter]; + { + TLFLockHolder ll(&LFLockThreadInfo); + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + ret += pInfo->LocalCounters[counter].Value; + p = &pInfo->pNextInfo; + } + } + return 
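+/*
+   TThreadAllocInfo::Init() above caps the per-thread cache for every size
+   class at MAX_MEM_PER_SIZE_PER_THREAD bytes and at THREAD_BUF pointers:
+   borderSizes[k] is the lowest index the downward-growing FreePtrs stack may
+   reach, so at most THREAD_BUF - borderSizes[k] blocks stay cached per thread.
+   A small worked version of that arithmetic (CacheBorder is a name invented
+   for the sketch; the defaults mirror THREAD_BUF = 256 and
+   MAX_MEM_PER_SIZE_PER_THREAD = 512 KiB):
+
+       constexpr int CacheBorder(int blockSize, int threadBuf = 256, int maxBytes = 512 * 1024) {
+           int maxCount = maxBytes / blockSize;
+           if (maxCount > threadBuf)
+               maxCount = threadBuf;
+           return threadBuf - maxCount;
+       }
+
+       // 4 KiB blocks: 512 KiB / 4 KiB = 128 cached blocks, border index 128.
+       static_assert(CacheBorder(4096) == 128, "");
+       // 16-byte blocks: the byte limit never binds, the whole 256-entry buffer may be used.
+       static_assert(CacheBorder(16) == 0, "");
+*/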
ret; +#else + return 0; +#endif +} + +static void MoveSingleThreadFreeToGlobal(TThreadAllocInfo* pInfo) { + for (int sizeIdx = 0; sizeIdx < N_SIZES; ++sizeIdx) { + int& freePtrIdx = pInfo->FreePtrIndex[sizeIdx]; + char** freePtrs = pInfo->FreePtrs[sizeIdx]; + PutBlocksToGlobalFreeList(sizeIdx, freePtrs + freePtrIdx, THREAD_BUF - freePtrIdx); + freePtrIdx = THREAD_BUF; + } +} + +#ifdef _win_ +static bool IsDeadThread(TThreadAllocInfo* pInfo) { + DWORD dwExit; + bool isDead = !GetExitCodeThread(pInfo->hThread, &dwExit) || dwExit != STILL_ACTIVE; + return isDead; +} + +static void CleanupAfterDeadThreads() { + TLFLockHolder ll(&LFLockThreadInfo); + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + if (IsDeadThread(pInfo)) { + MoveSingleThreadFreeToGlobal(pInfo); + pInfo->Done(); + *p = pInfo->pNextInfo; + SystemFree(pInfo); + } else + p = &pInfo->pNextInfo; + } +} +#endif + +#ifndef _win_ +static pthread_key_t ThreadCacheCleaner; +static void* volatile ThreadCacheCleanerStarted; // 0 = not started, -1 = started, -2 = is starting +static PERTHREAD bool IsStoppingThread; + +static void FreeThreadCache(void*) { + TThreadAllocInfo* pToDelete = nullptr; + { + TLFLockHolder ll(&LFLockThreadInfo); + pToDelete = pThreadInfo; + if (pToDelete == nullptr) + return; + + // remove from the list + for (TThreadAllocInfo** p = &pThreadInfoList; *p; p = &(*p)->pNextInfo) { + if (*p == pToDelete) { + *p = pToDelete->pNextInfo; + break; + } + } + IsStoppingThread = true; + pThreadInfo = nullptr; + } + + // free per thread buf + MoveSingleThreadFreeToGlobal(pToDelete); + pToDelete->Done(); + SystemFree(pToDelete); +} +#endif + +static void AllocThreadInfo() { +#ifndef _win_ + if (DoCas(&ThreadCacheCleanerStarted, (void*)-2, (void*)nullptr) == (void*)nullptr) { + pthread_key_create(&ThreadCacheCleaner, FreeThreadCache); + ThreadCacheCleanerStarted = (void*)-1; + } + if (ThreadCacheCleanerStarted != (void*)-1) + return; // do not use ThreadCacheCleaner until it is constructed + + { + if (IsStoppingThread) + return; + TLFLockHolder ll(&LFLockThreadInfo); + if (IsStoppingThread) // better safe than sorry + return; + + pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo)); + pThreadInfo->Init(&pThreadInfoList); + } + pthread_setspecific(ThreadCacheCleaner, (void*)-1); // without value destructor will not be called +#else + CleanupAfterDeadThreads(); + { + pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo)); + TLFLockHolder ll(&LFLockThreadInfo); + pThreadInfo->Init(&pThreadInfoList); + } +#endif +} + + ////////////////////////////////////////////////////////////////////////// + // DBG stuff + ////////////////////////////////////////////////////////////////////////// + +#if defined(LFALLOC_DBG) + +struct TAllocHeader { + uint64_t Size; + int Tag; + int Cookie; +}; + +// should be power of 2 +static_assert(sizeof(TAllocHeader) == 16); + +static inline void* GetAllocPtr(TAllocHeader* p) { + return p + 1; +} + +static inline TAllocHeader* GetAllocHeader(void* p) { + auto* header = ((TAllocHeader*)p) - 1; + if (header->Tag == DBG_ALLOC_ALIGNED_TAG) { + return (TAllocHeader*)header->Size; + } + + return header; +} + +PERTHREAD int AllocationTag; +extern "C" int SetThreadAllocTag(int tag) { + int prevTag = AllocationTag; + if (tag < DBG_ALLOC_MAX_TAG && tag >= 0) { + AllocationTag = tag; + } + return prevTag; +} + +PERTHREAD bool ProfileCurrentThread; +extern "C" bool SetProfileCurrentThread(bool newVal) { + bool prevVal = ProfileCurrentThread; + 
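+/*
+   AllocThreadInfo()/FreeThreadCache() above rely on the POSIX TLS destructor
+   mechanism: a pthread key is created once, every thread stores a non-null
+   value in it, and the key's destructor runs at thread exit to drain that
+   thread's cache back into the global free lists. The allocator rolls its own
+   one-time initialization with DoCas, presumably to stay self-contained while
+   bootstrapping; the sketch below uses plain pthread_once instead and invents
+   the names cleanupKey, OnThreadExit and EnsureThreadRegistered.
+
+       #include <pthread.h>
+
+       static pthread_key_t cleanupKey;
+
+       static void OnThreadExit(void*) {
+           // Return this thread's cached blocks to the global lists here.
+       }
+
+       static void CreateKey() {
+           pthread_key_create(&cleanupKey, OnThreadExit);
+       }
+
+       static void EnsureThreadRegistered() {
+           static pthread_once_t once = PTHREAD_ONCE_INIT;
+           pthread_once(&once, CreateKey);
+           // The destructor only fires for threads that stored a non-null value.
+           pthread_setspecific(cleanupKey, (void*)1);
+       }
+*/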
ProfileCurrentThread = newVal; + return prevVal; +} + +static volatile bool ProfileAllThreads; +extern "C" bool SetProfileAllThreads(bool newVal) { + bool prevVal = ProfileAllThreads; + ProfileAllThreads = newVal; + return prevVal; +} + +static volatile bool AllocationSamplingEnabled; +extern "C" bool SetAllocationSamplingEnabled(bool newVal) { + bool prevVal = AllocationSamplingEnabled; + AllocationSamplingEnabled = newVal; + return prevVal; +} + +static size_t AllocationSampleRate = 1000; +extern "C" size_t SetAllocationSampleRate(size_t newVal) { + size_t prevVal = AllocationSampleRate; + AllocationSampleRate = newVal; + return prevVal; +} + +static size_t AllocationSampleMaxSize = N_MAX_FAST_SIZE; +extern "C" size_t SetAllocationSampleMaxSize(size_t newVal) { + size_t prevVal = AllocationSampleMaxSize; + AllocationSampleMaxSize = newVal; + return prevVal; +} + +using TAllocationCallback = int(int tag, size_t size, int sizeIdx); +static TAllocationCallback* AllocationCallback; +extern "C" TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) { + TAllocationCallback* prevVal = AllocationCallback; + AllocationCallback = newVal; + return prevVal; +} + +using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx); +static TDeallocationCallback* DeallocationCallback; +extern "C" TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) { + TDeallocationCallback* prevVal = DeallocationCallback; + DeallocationCallback = newVal; + return prevVal; +} + +PERTHREAD TAtomic AllocationsCount; +PERTHREAD bool InAllocationCallback; + +static const int DBG_ALLOC_INVALID_COOKIE = -1; +static inline int SampleAllocation(TAllocHeader* p, int sizeIdx) { + int cookie = DBG_ALLOC_INVALID_COOKIE; + if (AllocationSamplingEnabled && (ProfileCurrentThread || ProfileAllThreads) && !InAllocationCallback) { + if (p->Size > AllocationSampleMaxSize || ++AllocationsCount % AllocationSampleRate == 0) { + if (AllocationCallback) { + InAllocationCallback = true; + cookie = AllocationCallback(p->Tag, p->Size, sizeIdx); + InAllocationCallback = false; + } + } + } + return cookie; +} + +static inline void SampleDeallocation(TAllocHeader* p, int sizeIdx) { + if (p->Cookie != DBG_ALLOC_INVALID_COOKIE && !InAllocationCallback) { + if (DeallocationCallback) { + InAllocationCallback = true; + DeallocationCallback(p->Cookie, p->Tag, p->Size, sizeIdx); + InAllocationCallback = false; + } + } +} + +static inline void TrackPerTagAllocation(TAllocHeader* p, int sizeIdx) { + if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) { + Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES); + auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx]; + + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx]; + local.Alloc(global, p->Size); + } else { + global.Alloc(p->Size); + } + } +} + +static inline void TrackPerTagDeallocation(TAllocHeader* p, int sizeIdx) { + if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) { + Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES); + auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx]; + + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx]; + local.Free(global, p->Size); + } else { + global.Free(p->Size); + } + } +} + +static void* TrackAllocation(void* ptr, size_t size, int sizeIdx) { + TAllocHeader* p = (TAllocHeader*)ptr; + p->Size = size; + p->Tag = AllocationTag; + p->Cookie = SampleAllocation(p, sizeIdx); + TrackPerTagAllocation(p, 
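+/*
+   SampleAllocation() above reports an allocation to the registered callback
+   when it is larger than AllocationSampleMaxSize or when it is the N-th
+   allocation of the current thread (N = AllocationSampleRate); the
+   InAllocationCallback flag suppresses recursion while the callback itself
+   allocates, and the returned cookie is stored in the header so the matching
+   deallocation can be reported later. The predicate boils down to the sketch
+   below (ShouldSample is an invented name, the thread_local counter stands in
+   for the PERTHREAD AllocationsCount, and sampleRate is assumed non-zero):
+
+       #include <cstddef>
+
+       static bool ShouldSample(size_t size, size_t sampleMaxSize, size_t sampleRate) {
+           thread_local size_t allocationsCount = 0;
+           return size > sampleMaxSize || ++allocationsCount % sampleRate == 0;
+       }
+*/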
sizeIdx); + return GetAllocPtr(p); +} + +static void TrackDeallocation(void* ptr, int sizeIdx) { + TAllocHeader* p = (TAllocHeader*)ptr; + SampleDeallocation(p, sizeIdx); + TrackPerTagDeallocation(p, sizeIdx); +} + +struct TPerTagAllocInfo { + ssize_t Count; + ssize_t Size; +}; + +extern "C" void GetPerTagAllocInfo( + bool flushPerThreadCounters, + TPerTagAllocInfo* info, + int& maxTag, + int& numSizes) { + maxTag = DBG_ALLOC_MAX_TAG; + numSizes = DBG_ALLOC_NUM_SIZES; + + if (info) { + if (flushPerThreadCounters) { + TLFLockHolder ll(&LFLockThreadInfo); + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = pInfo->LocalPerTagAllocCounters[tag][sizeIdx]; + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + local.Flush(global); + } + } + p = &pInfo->pNextInfo; + } + } + + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + auto& res = info[tag * DBG_ALLOC_NUM_SIZES + sizeIdx]; + res.Count = global.Count; + res.Size = global.Size; + } + } + } +} + +#endif // LFALLOC_DBG + +////////////////////////////////////////////////////////////////////////// +static Y_FORCE_INLINE void* LFAllocImpl(size_t _nSize) { +#if defined(LFALLOC_DBG) + size_t size = _nSize; + _nSize += sizeof(TAllocHeader); +#endif + + IncrementCounter(CT_USER_ALLOC, _nSize); + + int nSizeIdx; + if (_nSize > 512) { + if (_nSize > N_MAX_FAST_SIZE) { + void* ptr = LargeBlockAlloc(_nSize, CT_LARGE_ALLOC); +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, N_SIZES); +#endif + return ptr; + } + nSizeIdx = size2idxArr2[(_nSize - 1) >> 8]; + } else + nSizeIdx = size2idxArr1[1 + (((int)_nSize - 1) >> 3)]; + + IncrementCounter(CT_SMALL_ALLOC, nSizeIdxToSize[nSizeIdx]); + + // check per thread buffer + TThreadAllocInfo* thr = pThreadInfo; + if (!thr) { + AllocThreadInfo(); + thr = pThreadInfo; + if (!thr) { + void* ptr = LFAllocNoCache(nSizeIdx, MEM_DEFRAG); +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } + } + { + int& freePtrIdx = thr->FreePtrIndex[nSizeIdx]; + if (freePtrIdx < THREAD_BUF) { + void* ptr = thr->FreePtrs[nSizeIdx][freePtrIdx++]; +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } + + // try to alloc from global free list + char* buf[FL_GROUP_SIZE]; + int count = TakeBlocksFromGlobalFreeList(nSizeIdx, buf); + if (count == 0) { + count = LFAllocNoCacheMultiple(nSizeIdx, buf); + if (count == 0) { + NMalloc::AbortFromCorruptedAllocator("no way LFAllocNoCacheMultiple() can fail"); + } + } + char** dstBuf = thr->FreePtrs[nSizeIdx] + freePtrIdx - 1; + for (int i = 0; i < count - 1; ++i) + dstBuf[-i] = buf[i]; + freePtrIdx -= count - 1; + void* ptr = buf[count - 1]; +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } +} + +static Y_FORCE_INLINE void* LFAlloc(size_t _nSize) { + void* res = LFAllocImpl(_nSize); +#ifdef DBG_FILL_MEMORY + if (FillMemoryOnAllocation && res && (_nSize <= DBG_FILL_MAX_SIZE)) { + memset(res, 0xcf, _nSize); + } +#endif + return res; +} + +static Y_FORCE_INLINE void LFFree(void* p) { +#if defined(LFALLOC_DBG) + if (p == nullptr) + return; + p = GetAllocHeader(p); +#endif + + uintptr_t chkOffset = ((char*)p - ALLOC_START) - 1ll; + if (chkOffset >= 
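+/*
+   LFAllocImpl() above maps the requested size onto a size class with two
+   lookup tables: requests up to 512 bytes are bucketed at 8-byte granularity
+   through size2idxArr1, larger fast requests at 256-byte granularity through
+   size2idxArr2, and anything above N_MAX_FAST_SIZE goes straight to the
+   mmap-backed large-block path. Only the indexing arithmetic is shown below
+   (the table contents are defined elsewhere in this file); the function names
+   are invented for the sketch.
+
+       #include <cstddef>
+
+       // Index into size2idxArr1 for sizes 1..512 (8-byte steps).
+       static size_t SmallTableIndex(size_t size) {
+           return 1 + ((size - 1) >> 3);
+       }
+
+       // Index into size2idxArr2 for sizes 513..N_MAX_FAST_SIZE (256-byte steps).
+       static size_t BigTableIndex(size_t size) {
+           return (size - 1) >> 8;
+       }
+
+       // Examples: size 100  -> SmallTableIndex == 1 + (99 >> 3) == 13
+       //           size 2000 -> BigTableIndex  == 1999 >> 8      == 7
+*/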
N_MAX_WORKSET_SIZE) { + if (p == nullptr) + return; +#if defined(LFALLOC_DBG) + TrackDeallocation(p, N_SIZES); +#endif + LargeBlockFree(p, CT_LARGE_FREE); + return; + } + + uintptr_t chunk = ((char*)p - ALLOC_START) / N_CHUNK_SIZE; + ptrdiff_t nSizeIdx = chunkSizeIdx[chunk]; + if (nSizeIdx <= 0) { +#if defined(LFALLOC_DBG) + TrackDeallocation(p, N_SIZES); +#endif + LargeBlockFree(p, CT_LARGE_FREE); + return; + } + +#if defined(LFALLOC_DBG) + TrackDeallocation(p, nSizeIdx); +#endif + +#ifdef DBG_FILL_MEMORY + memset(p, 0xfe, nSizeIdxToSize[nSizeIdx]); +#endif + + IncrementCounter(CT_SMALL_FREE, nSizeIdxToSize[nSizeIdx]); + + // try to store info to per thread buf + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + int& freePtrIdx = thr->FreePtrIndex[nSizeIdx]; + if (freePtrIdx > borderSizes[nSizeIdx]) { + thr->FreePtrs[nSizeIdx][--freePtrIdx] = (char*)p; + return; + } + + // move several pointers to global free list + int freeCount = FL_GROUP_SIZE; + if (freeCount > THREAD_BUF - freePtrIdx) + freeCount = THREAD_BUF - freePtrIdx; + char** freePtrs = thr->FreePtrs[nSizeIdx]; + PutBlocksToGlobalFreeList(nSizeIdx, freePtrs + freePtrIdx, freeCount); + freePtrIdx += freeCount; + + freePtrs[--freePtrIdx] = (char*)p; + + } else { + AllocThreadInfo(); + PutBlocksToGlobalFreeList(nSizeIdx, (char**)&p, 1); + } +} + +static size_t LFGetSize(const void* p) { +#if defined(LFALLOC_DBG) + if (p == nullptr) + return 0; + return GetAllocHeader(const_cast<void*>(p))->Size; +#endif + + uintptr_t chkOffset = ((const char*)p - ALLOC_START); + if (chkOffset >= N_MAX_WORKSET_SIZE) { + if (p == nullptr) + return 0; + return TLargeBlk::As(p)->Pages * 4096ll; + } + uintptr_t chunk = ((const char*)p - ALLOC_START) / N_CHUNK_SIZE; + ptrdiff_t nSizeIdx = chunkSizeIdx[chunk]; + if (nSizeIdx <= 0) + return TLargeBlk::As(p)->Pages * 4096ll; + return nSizeIdxToSize[nSizeIdx]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Output mem alloc stats +const int N_PAGE_SIZE = 4096; +static void DebugTraceMMgr(const char* pszFormat, ...) 
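+/*
+   LFFree()/LFGetSize() above classify a pointer with one unsigned range
+   check: the offset from ALLOC_START wraps around for pointers below the
+   arena (including nullptr), so anything outside
+   [ALLOC_START, ALLOC_START + N_MAX_WORKSET_SIZE) takes the large/foreign
+   path, and inside the arena chunkSizeIdx[offset / N_CHUNK_SIZE] separates
+   small blocks from large ones. A sketch of that decision with invented names
+   (ClassifyPointer, EBlockKind) and the constants passed in as parameters:
+
+       #include <cstdint>
+
+       enum class EBlockKind { Outside, Large, Small };
+
+       static EBlockKind ClassifyPointer(const char* p, const char* allocStart,
+                                         uintptr_t worksetSize, uintptr_t chunkSize,
+                                         const char* chunkSizeIdx) {
+           uintptr_t offset = static_cast<uintptr_t>(p - allocStart);
+           if (offset >= worksetSize)
+               return EBlockKind::Outside;  // nullptr or memory not carved from the chunk arena
+           uintptr_t chunk = offset / chunkSize;
+           return chunkSizeIdx[chunk] <= 0 ? EBlockKind::Large : EBlockKind::Small;
+       }
+*/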
// __cdecl +{ + static char buff[20000]; + va_list va; + // + va_start(va, pszFormat); + vsprintf(buff, pszFormat, va); + va_end(va); +// +#ifdef _win_ + OutputDebugStringA(buff); +#else + fputs(buff, stderr); +#endif +} + +struct TChunkStats { + char *Start, *Finish; + i64 Size; + char* Entries; + i64 FreeCount; + + TChunkStats(size_t chunk, i64 size, char* entries) + : Size(size) + , Entries(entries) + , FreeCount(0) + { + Start = ALLOC_START + chunk * N_CHUNK_SIZE; + Finish = Start + N_CHUNK_SIZE; + } + void CheckBlock(char* pBlock) { + if (pBlock && pBlock >= Start && pBlock < Finish) { + ++FreeCount; + i64 nShift = pBlock - Start; + i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1); + Entries[nOffsetInStep / Size] = 1; + } + } + void SetGlobalFree(char* ptr) { + i64 nShift = ptr - Start; + i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1); + while (nOffsetInStep + Size <= N_CHUNK_SIZE) { + ++FreeCount; + Entries[nOffsetInStep / Size] = 1; + nOffsetInStep += Size; + } + } +}; + +static void DumpMemoryBlockUtilizationLocked() { + TFreeListGroup* wholeLists[N_SIZES]; + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) { + wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList(); + } + char* bfList = (char*)blockFreeList.GetWholeList(); + + DebugTraceMMgr("memory blocks utilisation stats:\n"); + i64 nTotalAllocated = 0, nTotalFree = 0, nTotalBadPages = 0, nTotalPages = 0, nTotalUsed = 0, nTotalLocked = 0; + i64 nTotalGroupBlocks = 0; + char* entries; + entries = (char*)SystemAlloc((N_CHUNK_SIZE / 4)); + for (size_t k = 0; k < N_CHUNKS; ++k) { + if (chunkSizeIdx[k] <= 0) { + if (chunkSizeIdx[k] == -1) + nTotalLocked += N_CHUNK_SIZE; + continue; + } + i64 nSizeIdx = chunkSizeIdx[k]; + i64 nSize = nSizeIdxToSize[nSizeIdx]; + TChunkStats cs(k, nSize, entries); + int nEntriesTotal = N_CHUNK_SIZE / nSize; + memset(entries, 0, nEntriesTotal); + for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) { + for (auto& ptr : g->Ptrs) + cs.CheckBlock(ptr); + } + TChunkStats csGB(k, nSize, entries); + if (nSizeIdx == FREE_LIST_GROUP_SIZEIDX) { + for (auto g : wholeLists) { + for (; g; g = g->Next) + csGB.CheckBlock((char*)g); + } + for (char* blk = bfList; blk; blk = *(char**)blk) + csGB.CheckBlock(blk); + nTotalGroupBlocks += csGB.FreeCount * nSize; + } + if (((globalCurrentPtr[nSizeIdx] - ALLOC_START) / N_CHUNK_SIZE) == k) + cs.SetGlobalFree(globalCurrentPtr[nSizeIdx]); + nTotalUsed += (nEntriesTotal - cs.FreeCount - csGB.FreeCount) * nSize; + + char pages[N_CHUNK_SIZE / N_PAGE_SIZE]; + memset(pages, 0, sizeof(pages)); + for (int i = 0, nShift = 0; i < nEntriesTotal; ++i, nShift += nSize) { + int nBit = 0; + if (entries[i]) + nBit = 1; // free entry + else + nBit = 2; // used entry + for (i64 nDelta = nSize - 1; nDelta >= 0; nDelta -= N_PAGE_SIZE) + pages[(nShift + nDelta) / N_PAGE_SIZE] |= nBit; + } + i64 nBadPages = 0; + for (auto page : pages) { + nBadPages += page == 3; + nTotalPages += page != 1; + } + DebugTraceMMgr("entry = %lld; size = %lld; free = %lld; system %lld; utilisation = %g%%, fragmentation = %g%%\n", + k, nSize, cs.FreeCount * nSize, csGB.FreeCount * nSize, + (N_CHUNK_SIZE - cs.FreeCount * nSize) * 100.0f / N_CHUNK_SIZE, 100.0f * nBadPages / Y_ARRAY_SIZE(pages)); + nTotalAllocated += N_CHUNK_SIZE; + nTotalFree += cs.FreeCount * nSize; + nTotalBadPages += nBadPages; + } + SystemFree(entries); + DebugTraceMMgr("Total allocated = %llu, free = %lld, system = %lld, locked for future use %lld, utilisation = %g, fragmentation = %g\n", + nTotalAllocated, 
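+/*
+   The two percentages printed per chunk above are: utilisation, the share of
+   the chunk still occupied by live blocks, and fragmentation, the share of
+   4 KiB pages that contain both free and used blocks (such pages cannot be
+   returned to the OS even though part of them is free). A compact restatement
+   of that arithmetic with invented names (ComputeChunkStats,
+   TChunkUtilization) and purely illustrative numbers:
+
+       #include <cstddef>
+
+       struct TChunkUtilization {
+           double UtilizationPct;   // share of the chunk occupied by live blocks
+           double FragmentationPct; // share of pages mixing free and used blocks
+       };
+
+       static TChunkUtilization ComputeChunkStats(size_t chunkSize, size_t blockSize,
+                                                  size_t freeBlocks, size_t badPages, size_t pages) {
+           TChunkUtilization res;
+           res.UtilizationPct = 100.0 * (chunkSize - freeBlocks * blockSize) / chunkSize;
+           res.FragmentationPct = 100.0 * badPages / pages;
+           return res;
+       }
+
+       // Example: a 64 KiB chunk of 1 KiB blocks with 24 blocks free and 5 of
+       // its 16 pages holding a mix of free and used blocks:
+       //   utilisation   = 100 * (65536 - 24576) / 65536 = 62.5%
+       //   fragmentation = 100 * 5 / 16                  = 31.25%
+*/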
nTotalFree, nTotalGroupBlocks, nTotalLocked, + 100.0f * (nTotalAllocated - nTotalFree) / nTotalAllocated, 100.0f * nTotalBadPages / nTotalPages); + DebugTraceMMgr("Total %lld bytes used, %lld bytes in used pages\n", nTotalUsed, nTotalPages * N_PAGE_SIZE); + + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) + globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]); + blockFreeList.ReturnWholeList(bfList); +} + +void FlushThreadFreeList() { + if (pThreadInfo) + MoveSingleThreadFreeToGlobal(pThreadInfo); +} + +void DumpMemoryBlockUtilization() { + // move current thread free to global lists to get better statistics + FlushThreadFreeList(); + { + TLFLockHolder ls(&LFGlobalLock); + DumpMemoryBlockUtilizationLocked(); + } +} + +////////////////////////////////////////////////////////////////////////// +// malloc api + +static bool LFAlloc_SetParam(const char* param, const char* value) { + if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE")) { + LB_LIMIT_TOTAL_SIZE = atoi(value); + return true; + } + if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE_BYTES")) { + LB_LIMIT_TOTAL_SIZE = (atoi(value) + N_PAGE_SIZE - 1) / N_PAGE_SIZE; + return true; + } +#ifdef DBG_FILL_MEMORY + if (!strcmp(param, "FillMemoryOnAllocation")) { + FillMemoryOnAllocation = !strcmp(value, "true"); + return true; + } +#endif + if (!strcmp(param, "TransparentHugePages")) { + TransparentHugePages = !strcmp(value, "true"); + return true; + } + if (!strcmp(param, "MapHugeTLB")) { + MapHugeTLB = !strcmp(value, "true"); + return true; + } + if (!strcmp(param, "EnableDefrag")) { + EnableDefrag = !strcmp(value, "true"); + return true; + } + return false; +}; + +static const char* LFAlloc_GetParam(const char* param) { + struct TParam { + const char* Name; + const char* Value; + }; + + static const TParam Params[] = { + {"GetLFAllocCounterFast", (const char*)&GetLFAllocCounterFast}, + {"GetLFAllocCounterFull", (const char*)&GetLFAllocCounterFull}, +#if defined(LFALLOC_DBG) + {"SetThreadAllocTag", (const char*)&SetThreadAllocTag}, + {"SetProfileCurrentThread", (const char*)&SetProfileCurrentThread}, + {"SetProfileAllThreads", (const char*)&SetProfileAllThreads}, + {"SetAllocationSamplingEnabled", (const char*)&SetAllocationSamplingEnabled}, + {"SetAllocationSampleRate", (const char*)&SetAllocationSampleRate}, + {"SetAllocationSampleMaxSize", (const char*)&SetAllocationSampleMaxSize}, + {"SetAllocationCallback", (const char*)&SetAllocationCallback}, + {"SetDeallocationCallback", (const char*)&SetDeallocationCallback}, + {"GetPerTagAllocInfo", (const char*)&GetPerTagAllocInfo}, +#endif // LFALLOC_DBG + }; + + for (int i = 0; i < Y_ARRAY_SIZE(Params); ++i) { + if (strcmp(param, Params[i].Name) == 0) { + return Params[i].Value; + } + } + return nullptr; +} + +static Y_FORCE_INLINE int LFPosixMemalign(void** memptr, size_t alignment, size_t size) { + if (Y_UNLIKELY(alignment > 4096)) { + const char* error = "Larger alignment are not guaranteed with this implementation\n"; +#ifdef _win_ + OutputDebugStringA(error); +#endif + NMalloc::AbortFromCorruptedAllocator(error); + } + size_t bigsize = size; + if (bigsize <= alignment) { + bigsize = alignment; + } else if (bigsize < 2 * alignment) { + bigsize = 2 * alignment; + } +#if defined(LFALLOC_DBG) + if (alignment > sizeof(TAllocHeader)) { + bigsize += alignment; + } +#endif + + *memptr = LFAlloc(bigsize); + +#if defined(LFALLOC_DBG) + if (alignment > sizeof(TAllocHeader)) { + // memptr isn't aligned due to alloc header + const auto* header = GetAllocHeader(*memptr); + *memptr = (void*)((const 
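+/*
+   LFAlloc_SetParam()/LFAlloc_GetParam() above are reached through the generic
+   malloc API rather than being linked against directly: SetParam takes plain
+   string pairs such as "EnableDefrag"/"true", while GetParam hands the debug
+   hooks back as const char* values that the caller casts to the right
+   function type. The sketch below assumes the NMalloc::MallocInfo() accessor
+   and the TMallocInfo::GetParam member from library/cpp/malloc/api behave as
+   described; SwitchAllocTag is a name invented for the sketch.
+
+       #include <library/cpp/malloc/api/malloc.h>
+
+       using TSetThreadAllocTag = int(int tag);
+
+       static int SwitchAllocTag(int tag) {
+           NMalloc::TMallocInfo info = NMalloc::MallocInfo();
+           if (!info.GetParam)
+               return 0;
+           const char* raw = info.GetParam("SetThreadAllocTag"); // function pointer disguised as a string
+           if (!raw)
+               return 0;                                         // allocator built without LFALLOC_DBG
+           auto* fn = reinterpret_cast<TSetThreadAllocTag*>(const_cast<char*>(raw));
+           return fn(tag);                                       // returns the previous tag
+       }
+*/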
char*) (*memptr) + alignment - sizeof(TAllocHeader)); + + // make fake header to retrieve original header ptr on dealloc + auto* next = GetAllocHeader(*memptr); + next->Tag = DBG_ALLOC_ALIGNED_TAG; + next->Size = (uint64_t)header; + next->Cookie = 0; + } +#endif + + Y_ASSERT_NOBT((intptr_t)*memptr % alignment == 0); + return 0; +} + +static Y_FORCE_INLINE void* LFVAlloc(size_t size) { + const size_t pg = N_PAGE_SIZE; + void* p = nullptr; + +#if defined(LFALLOC_DBG) + LFPosixMemalign(&p, pg, size); +#else + size_t bigsize = (size + pg - 1) & (~(pg - 1)); + p = LFAlloc(bigsize); +#endif + + Y_ASSERT_NOBT((intptr_t)p % N_PAGE_SIZE == 0); + return p; +} + +#endif diff --git a/library/cpp/lfalloc/ya.make b/library/cpp/lfalloc/ya.make new file mode 100644 index 0000000000..cace05f9d8 --- /dev/null +++ b/library/cpp/lfalloc/ya.make @@ -0,0 +1,25 @@ +LIBRARY() + +OWNER(gulin) + +NO_UTIL() + +NO_COMPILER_WARNINGS() + +IF (ARCH_AARCH64) + PEERDIR( + contrib/libs/jemalloc + ) +ELSE() + SRCS( + lf_allocX64.cpp + ) +ENDIF() + +PEERDIR( + library/cpp/malloc/api +) + +SET(IDE_FOLDER "util") + +END() diff --git a/library/cpp/lfalloc/yt/ya.make b/library/cpp/lfalloc/yt/ya.make new file mode 100644 index 0000000000..8c1a4f8a72 --- /dev/null +++ b/library/cpp/lfalloc/yt/ya.make @@ -0,0 +1,29 @@ +LIBRARY() + +OWNER(a-romanov) + +NO_UTIL() + +NO_COMPILER_WARNINGS() + +IF (ARCH_AARCH64) + PEERDIR( + contrib/libs/jemalloc + ) +ELSE() + IF ("${YMAKE}" MATCHES "devtools") + CFLAGS(-DYMAKE=1) + ENDIF() + CXXFLAGS(-DLFALLOC_YT) + SRCS( + ../lf_allocX64.cpp + ) +ENDIF() + +PEERDIR( + library/cpp/malloc/api +) + +SET(IDE_FOLDER "util") + +END() |
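Consumers normally do not PEERDIR these libraries by hand: in Arcadia-style builds the allocator for a binary is usually selected with the ALLOCATOR() macro, which is assumed here (from Arcadia build conventions, not from this change) to resolve LF to library/cpp/lfalloc, LF_YT to the LFALLOC_YT build under yt/ and LF_DBG to the debug build under dbg/. A hypothetical consumer would look like:

    PROGRAM()

    SRCS(
        main.cpp
    )

    ALLOCATOR(LF)

    END()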