path: root/library/cpp/lfalloc
author: Devtools Arcadia <arcadia-devtools@yandex-team.ru> 2022-02-07 18:08:42 +0300
committer: Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> 2022-02-07 18:08:42 +0300
commit: 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree: e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/lfalloc
download: ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/lfalloc')
-rw-r--r--  library/cpp/lfalloc/alloc_profiler/align_ut.cpp       23
-rw-r--r--  library/cpp/lfalloc/alloc_profiler/profiler.cpp       81
-rw-r--r--  library/cpp/lfalloc/alloc_profiler/profiler.h         45
-rw-r--r--  library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp    76
-rw-r--r--  library/cpp/lfalloc/alloc_profiler/stackcollect.cpp  332
-rw-r--r--  library/cpp/lfalloc/alloc_profiler/stackcollect.h    107
-rw-r--r--  library/cpp/lfalloc/alloc_profiler/ut/ya.make         22
-rw-r--r--  library/cpp/lfalloc/alloc_profiler/ya.make            17
-rw-r--r--  library/cpp/lfalloc/dbg/ya.make                       32
-rw-r--r--  library/cpp/lfalloc/dbg_info/dbg_info.cpp            124
-rw-r--r--  library/cpp/lfalloc/dbg_info/dbg_info.h               77
-rw-r--r--  library/cpp/lfalloc/dbg_info/ya.make                  15
-rw-r--r--  library/cpp/lfalloc/lf_allocX64.cpp                  144
-rw-r--r--  library/cpp/lfalloc/lf_allocX64.h                   1926
-rw-r--r--  library/cpp/lfalloc/ya.make                           25
-rw-r--r--  library/cpp/lfalloc/yt/ya.make                        29
16 files changed, 3075 insertions, 0 deletions
diff --git a/library/cpp/lfalloc/alloc_profiler/align_ut.cpp b/library/cpp/lfalloc/alloc_profiler/align_ut.cpp
new file mode 100644
index 0000000000..db9b17b95b
--- /dev/null
+++ b/library/cpp/lfalloc/alloc_profiler/align_ut.cpp
@@ -0,0 +1,23 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/scope.h>
+
+Y_UNIT_TEST_SUITE(MemAlign) {
+ Y_UNIT_TEST(ShouldAlign)
+ {
+ for (ui64 size = 8; size <= 32 * 1024; size *= 8) {
+ for (ui64 align = 8; align <= 4096; align *= 2) {
+ void* ptr = nullptr;
+
+ int res = posix_memalign(&ptr, align, size);
+ UNIT_ASSERT_C(res == 0 && ptr != nullptr, "memalign failed");
+
+ Y_DEFER {
+ free(ptr);
+ };
+
+ UNIT_ASSERT_C((uintptr_t)ptr % align == 0, "non aligned memory");
+ }
+ }
+ }
+}
diff --git a/library/cpp/lfalloc/alloc_profiler/profiler.cpp b/library/cpp/lfalloc/alloc_profiler/profiler.cpp
new file mode 100644
index 0000000000..0e30927a5a
--- /dev/null
+++ b/library/cpp/lfalloc/alloc_profiler/profiler.cpp
@@ -0,0 +1,81 @@
+#include "profiler.h"
+
+#include "stackcollect.h"
+
+#include <util/generic/algorithm.h>
+#include <util/generic/singleton.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/stream/str.h>
+
+namespace NAllocProfiler {
+
+namespace {
+
+static TAllocationStackCollector& AllocationStackCollector()
+{
+ return *Singleton<TAllocationStackCollector>();
+}
+
+int AllocationCallback(int tag, size_t size, int sizeIdx)
+{
+ Y_UNUSED(sizeIdx);
+
+ static const size_t STACK_FRAMES_COUNT = 32;
+ static const size_t STACK_FRAMES_SKIP = 1;
+
+ void* frames[STACK_FRAMES_COUNT];
+ size_t frameCount = BackTrace(frames, Y_ARRAY_SIZE(frames));
+ if (frameCount <= STACK_FRAMES_SKIP) {
+ return -1;
+ }
+
+ void** stack = &frames[STACK_FRAMES_SKIP];
+ frameCount -= STACK_FRAMES_SKIP;
+
+ auto& collector = AllocationStackCollector();
+ return collector.Alloc(stack, frameCount, tag, size);
+}
+
+void DeallocationCallback(int stackId, int tag, size_t size, int sizeIdx)
+{
+ Y_UNUSED(tag);
+ Y_UNUSED(sizeIdx);
+
+ auto& collector = AllocationStackCollector();
+ collector.Free(stackId, size);
+}
+
+} // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+
+bool StartAllocationSampling(bool profileAllThreads)
+{
+ auto& collector = AllocationStackCollector();
+ collector.Clear();
+
+ NAllocDbg::SetProfileAllThreads(profileAllThreads);
+ NAllocDbg::SetAllocationCallback(AllocationCallback);
+ NAllocDbg::SetDeallocationCallback(DeallocationCallback);
+ NAllocDbg::SetAllocationSamplingEnabled(true);
+ return true;
+}
+
+bool StopAllocationSampling(IAllocationStatsDumper &out, int count)
+{
+ NAllocDbg::SetAllocationCallback(nullptr);
+ NAllocDbg::SetDeallocationCallback(nullptr);
+ NAllocDbg::SetAllocationSamplingEnabled(false);
+
+ auto& collector = AllocationStackCollector();
+ collector.Dump(count, out);
+ return true;
+}
+
+bool StopAllocationSampling(IOutputStream& out, int count) {
+ TAllocationStatsDumper dumper(out);
+ return StopAllocationSampling(dumper, count);
+}
+
+} // namespace NAllocProfiler
diff --git a/library/cpp/lfalloc/alloc_profiler/profiler.h b/library/cpp/lfalloc/alloc_profiler/profiler.h
new file mode 100644
index 0000000000..4ea49b9dcc
--- /dev/null
+++ b/library/cpp/lfalloc/alloc_profiler/profiler.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "stackcollect.h"
+
+#include <library/cpp/lfalloc/dbg_info/dbg_info.h>
+
+#include <util/generic/noncopyable.h>
+#include <util/stream/output.h>
+
+namespace NAllocProfiler {
+
+////////////////////////////////////////////////////////////////////////////////
+
+inline int SetCurrentScopeTag(int value)
+{
+ return NAllocDbg::SetThreadAllocTag(value);
+}
+
+inline bool SetProfileCurrentThread(bool value)
+{
+ return NAllocDbg::SetProfileCurrentThread(value);
+}
+
+bool StartAllocationSampling(bool profileAllThreads = false);
+bool StopAllocationSampling(IAllocationStatsDumper& out, int count = 100);
+bool StopAllocationSampling(IOutputStream& out, int count = 100);
+
+////////////////////////////////////////////////////////////////////////////////
+
+class TProfilingScope: private TNonCopyable {
+private:
+ const int Prev;
+
+public:
+ explicit TProfilingScope(int value)
+ : Prev(SetCurrentScopeTag(value))
+ {}
+
+ ~TProfilingScope()
+ {
+ SetCurrentScopeTag(Prev);
+ }
+};
+
+} // namespace NAllocProfiler
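The intended call pattern for this header is: start sampling, run the code of interest (optionally under a TProfilingScope tag), then stop sampling into a stream or a custom dumper; the unit tests in the next file exercise exactly this flow. Below is a minimal standalone sketch, not part of the commit; the tag value 42 and the workload are arbitrary illustrations.

#include <library/cpp/lfalloc/alloc_profiler/profiler.h>

#include <util/generic/vector.h>
#include <util/stream/output.h>
#include <util/stream/str.h>

void ProfileWorkload() {
    TStringStream report;

    // Sample allocations on all threads until sampling is stopped.
    NAllocProfiler::StartAllocationSampling(/* profileAllThreads = */ true);
    {
        // Attribute allocations in this scope to an application-defined tag.
        NAllocProfiler::TProfilingScope scope(42);
        TVector<int> data(1 << 20, 0);
    }

    // Dump the top stacks (default count = 100) as text.
    NAllocProfiler::StopAllocationSampling(report);
    Cout << report.Str();
}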
diff --git a/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp b/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp
new file mode 100644
index 0000000000..4341dda6ed
--- /dev/null
+++ b/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp
@@ -0,0 +1,76 @@
+#include "profiler.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+namespace NAllocProfiler {
+
+////////////////////////////////////////////////////////////////////////////////
+
+Y_UNIT_TEST_SUITE(Profiler) {
+ Y_UNIT_TEST(StackCollection)
+ {
+ TStringStream str;
+
+ NAllocProfiler::StartAllocationSampling(true);
+ TVector<TAutoPtr<int>> test;
+ // Do many allocations and no deallocations
+ for (int i = 0; i < 10000; ++i) {
+ test.push_back(new int);
+ }
+ NAllocProfiler::StopAllocationSampling(str);
+ //Cout << str.Str() << Endl;
+
+#if !defined(ARCH_AARCH64)
+ /* Check that output resembles this:
+
+ STACK #2: 0 Allocs: 10 Frees: 0 CurrentSize: 40
+ 0000000000492353 ??
+ 000000000048781F operator new(unsigned long) +1807
+ 00000000003733FA NAllocProfiler::NTestSuiteProfiler::TTestCaseStackCollection::Execute_(NUnitTest::TTestContext&) +218
+ 00000000004A1938 NUnitTest::TTestBase::Run(std::__y1::function<void ()>, TString, char const*, bool) +120
+ 0000000000375656 NAllocProfiler::NTestSuiteProfiler::TCurrentTest::Execute() +342
+ 00000000004A20CF NUnitTest::TTestFactory::Execute() +847
+ 000000000049922D NUnitTest::RunMain(int, char**) +1965
+ 00007FF665778F45 __libc_start_main +245
+ */
+
+ UNIT_ASSERT_STRING_CONTAINS(str.Str(), "StackCollection");
+ UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::TTestBase::Run");
+ UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NAllocProfiler::NTestSuiteProfiler::TCurrentTest::Execute");
+ UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::TTestFactory::Execute");
+ UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::RunMain");
+#endif
+ }
+
+ class TAllocDumper : public NAllocProfiler::TAllocationStatsDumper {
+ public:
+ explicit TAllocDumper(IOutputStream& out) : NAllocProfiler::TAllocationStatsDumper(out) {}
+
+ TString FormatTag(int tag) override {
+ UNIT_ASSERT_VALUES_EQUAL(tag, 42);
+ return "TAG_NAME_42";
+ }
+ };
+
+ Y_UNIT_TEST(TagNames)
+ {
+ TStringStream str;
+
+ NAllocProfiler::StartAllocationSampling(true);
+ TVector<TAutoPtr<int>> test;
+ NAllocProfiler::TProfilingScope scope(42);
+ // Do many allocations and no deallocations
+ for (int i = 0; i < 10000; ++i) {
+ test.push_back(new int);
+ }
+
+ TAllocDumper dumper(str);
+ NAllocProfiler::StopAllocationSampling(dumper);
+
+#if !defined(ARCH_AARCH64)
+ UNIT_ASSERT_STRING_CONTAINS(str.Str(), "TAG_NAME_42");
+#endif
+ }
+}
+
+} // namespace NAllocProfiler
diff --git a/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp b/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp
new file mode 100644
index 0000000000..fded4e2fd1
--- /dev/null
+++ b/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp
@@ -0,0 +1,332 @@
+#include "stackcollect.h"
+
+#include "profiler.h"
+
+#include <util/generic/algorithm.h>
+#include <util/generic/vector.h>
+#include <util/stream/format.h>
+#include <util/stream/str.h>
+#include <util/string/cast.h>
+#include <util/string/printf.h>
+#include <util/system/backtrace.h>
+#include <util/system/spinlock.h>
+#include <util/system/yassert.h>
+
+
+namespace NAllocProfiler {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+class TStackCollector: private TNonCopyable {
+public:
+ struct TFrameInfo {
+ int PrevInd;
+ void* Addr;
+ int Tag;
+ T Stats;
+
+ void Clear()
+ {
+ PrevInd = 0;
+ Addr = nullptr;
+ Tag = 0;
+ Stats.Clear();
+ }
+ };
+
+private:
+ static const size_t STACKS_HASH_MAP_SIZE = 256 * 1024;
+ TFrameInfo Frames[STACKS_HASH_MAP_SIZE];
+
+ ui64 Samples; // Saved samples count
+ ui64 UniqueSamples; // Number of unique addresses
+ ui64 UsedSlots; // Number of occupied slots in the hashtable
+ ui64 DroppedSamples; // Number of unsaved addresses
+ ui64 SearchSkipCount; // Total number of linear hash table probes due to collisions
+
+ TAdaptiveLock Lock;
+
+public:
+ TStackCollector()
+ {
+ Clear();
+ }
+
+ int AddStack(void** stack, size_t frameCount, int tag)
+ {
+ Y_ASSERT(frameCount > 0);
+
+ int prevInd = -1;
+ with_lock (Lock) {
+ for (int i = frameCount - 1; i >= 0; --i) {
+ prevInd = AddFrame(stack[i], prevInd, ((i == 0) ? tag : 0), (i == 0));
+ if (prevInd == -1) {
+ break;
+ }
+ }
+ }
+ return prevInd;
+ }
+
+ T& GetStats(int stackId)
+ {
+ Y_ASSERT(stackId >= 0 && (size_t)stackId < Y_ARRAY_SIZE(Frames));
+ Y_ASSERT(!IsSlotEmpty(stackId));
+
+ return Frames[stackId].Stats;
+ }
+
+ const TFrameInfo* GetFrames() const
+ {
+ return Frames;
+ }
+
+ size_t GetFramesCount() const
+ {
+ return Y_ARRAY_SIZE(Frames);
+ }
+
+ void BackTrace(const TFrameInfo* stack, TStackVec<void*, 64>& frames) const
+ {
+ frames.clear();
+ for (size_t i = 0; i < 100; ++i) {
+ frames.push_back(stack->Addr);
+ int prevInd = stack->PrevInd;
+ if (prevInd == -1) {
+ break;
+ }
+ stack = &Frames[prevInd];
+ }
+ }
+
+ void Clear()
+ {
+ for (auto& frame: Frames) {
+ frame.Clear();
+ }
+
+ Samples = 0;
+ DroppedSamples = 0;
+ UniqueSamples = 0;
+ UsedSlots = 0;
+ SearchSkipCount = 0;
+ }
+
+private:
+ // Hash function applied to the addresses
+ static ui32 Hash(void* addr, int prevInd, int tag)
+ {
+ return (((size_t)addr + ((size_t)addr / STACKS_HASH_MAP_SIZE)) + prevInd + tag) % STACKS_HASH_MAP_SIZE;
+ }
+
+ static bool EqualFrame(const TFrameInfo& frame, void* addr, int prevInd, int tag)
+ {
+ return (frame.Addr == addr && frame.PrevInd == prevInd && frame.Tag == tag);
+ }
+
+ bool IsSlotEmpty(ui32 slot) const
+ {
+ return Frames[slot].Addr == 0;
+ }
+
+ bool InsertsAllowed() const
+ {
+ return UsedSlots < STACKS_HASH_MAP_SIZE / 2;
+ }
+
+ // returns the index in the hashmap
+ int AddFrame(void* addr, int prevFrameIndex, int tag, bool last)
+ {
+ ui32 slot = Hash(addr, prevFrameIndex, tag);
+ ui32 prevSlot = (slot - 1) % STACKS_HASH_MAP_SIZE;
+
+ while (!EqualFrame(Frames[slot], addr, prevFrameIndex, tag) && !IsSlotEmpty(slot) && slot != prevSlot) {
+ slot = (slot + 1) % STACKS_HASH_MAP_SIZE;
+ SearchSkipCount++;
+ }
+
+ if (EqualFrame(Frames[slot], addr, prevFrameIndex, tag)) {
+ if (last) {
+ ++Samples;
+ }
+ } else if (InsertsAllowed() && IsSlotEmpty(slot)) {
+ // add new sample
+ Frames[slot].Clear();
+ Frames[slot].Addr = addr;
+ Frames[slot].PrevInd = prevFrameIndex;
+ Frames[slot].Tag = tag;
+ ++UsedSlots;
+ if (last) {
+ ++UniqueSamples;
+ ++Samples;
+ }
+ } else {
+ // don't insert new sample if the search is becoming too slow
+ ++DroppedSamples;
+ return -1;
+ }
+
+ return slot;
+ }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+class TAllocationStackCollector::TImpl: public TStackCollector<TStats> {
+ using TBase = TStackCollector<TStats>;
+
+private:
+ TStats Total;
+
+public:
+ int Alloc(void** stack, size_t frameCount, int tag, size_t size)
+ {
+ int stackId = TBase::AddStack(stack, frameCount, tag);
+ if (stackId >= 0) {
+ TBase::GetStats(stackId).Alloc(size);
+ Total.Alloc(size);
+ }
+ return stackId;
+ }
+
+ void Free(int stackId, size_t size)
+ {
+ TBase::GetStats(stackId).Free(size);
+ Total.Free(size);
+ }
+
+ void Clear()
+ {
+ TBase::Clear();
+ Total.Clear();
+ }
+
+ void Dump(int count, IAllocationStatsDumper& out) const
+ {
+ const TFrameInfo* frames = TBase::GetFrames();
+ size_t framesCount = TBase::GetFramesCount();
+
+ TVector<const TFrameInfo*> stacks;
+ for (size_t i = 0; i < framesCount; ++i) {
+ if (frames[i].Stats.Allocs) {
+ stacks.push_back(&frames[i]);
+ }
+ }
+
+ Sort(stacks, [] (const TFrameInfo* l, const TFrameInfo* r) {
+ const auto& ls = l->Stats;
+ const auto& rs = r->Stats;
+ return ls.CurrentSize != rs.CurrentSize
+ ? ls.CurrentSize > rs.CurrentSize
+ : ls.Allocs != rs.Allocs
+ ? ls.Allocs > rs.Allocs
+ : ls.Frees > rs.Frees;
+ });
+
+ out.DumpTotal(Total);
+
+ TAllocationInfo allocInfo;
+ int printedCount = 0;
+ for (const TFrameInfo* stack: stacks) {
+ allocInfo.Clear();
+ allocInfo.Tag = stack->Tag;
+ allocInfo.Stats = stack->Stats;
+ TBase::BackTrace(stack, allocInfo.Stack);
+
+ out.DumpEntry(allocInfo);
+
+ if (++printedCount >= count) {
+ break;
+ }
+ }
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+TAllocationStackCollector::TAllocationStackCollector()
+ : Impl(new TImpl())
+{}
+
+TAllocationStackCollector::~TAllocationStackCollector()
+{}
+
+int TAllocationStackCollector::Alloc(void** stack, size_t frameCount, int tag, size_t size)
+{
+ return Impl->Alloc(stack, frameCount, tag, size);
+}
+
+void TAllocationStackCollector::Free(int stackId, size_t size)
+{
+ Impl->Free(stackId, size);
+}
+
+void TAllocationStackCollector::Clear()
+{
+ Impl->Clear();
+}
+
+void TAllocationStackCollector::Dump(int count, IAllocationStatsDumper &out) const
+{
+ Impl->Dump(count, out);
+}
+
+
+TString IAllocationStatsDumper::FormatTag(int tag) {
+ return ToString(tag);
+}
+
+TString IAllocationStatsDumper::FormatSize(intptr_t sz) {
+ return ToString(sz);
+}
+
+
+TAllocationStatsDumper::TAllocationStatsDumper(IOutputStream& out)
+ : PrintedCount(0)
+ , Out(out)
+ , SymbolCache(2048)
+{}
+
+void TAllocationStatsDumper::DumpTotal(const TStats& total) {
+ Out << "TOTAL"
+ << "\tAllocs: " << total.Allocs
+ << "\tFrees: " << total.Frees
+ << "\tCurrentSize: " << FormatSize(total.CurrentSize)
+ << Endl;
+}
+
+void TAllocationStatsDumper::DumpEntry(const TAllocationInfo& allocInfo) {
+ Out << Endl
+ << "STACK #" << PrintedCount+1 << ": " << FormatTag(allocInfo.Tag)
+ << "\tAllocs: " << allocInfo.Stats.Allocs
+ << "\tFrees: " << allocInfo.Stats.Frees
+ << "\tCurrentSize: " << FormatSize(allocInfo.Stats.CurrentSize)
+ << Endl;
+ FormatBackTrace(allocInfo.Stack.data(), allocInfo.Stack.size());
+ PrintedCount++;
+}
+
+void TAllocationStatsDumper::FormatBackTrace(void* const* stack, size_t sz) {
+ char name[1024];
+ for (size_t i = 0; i < sz; ++i) {
+ TSymbol symbol;
+ auto it = SymbolCache.Find(stack[i]);
+ if (it != SymbolCache.End()) {
+ symbol = it.Value();
+ } else {
+ TResolvedSymbol rs = ResolveSymbol(stack[i], name, sizeof(name));
+ symbol = {rs.NearestSymbol, rs.Name};
+ SymbolCache.Insert(stack[i], symbol);
+ }
+
+ Out << Hex((intptr_t)stack[i], HF_FULL) << "\t" << symbol.Name;
+ intptr_t offset = (intptr_t)stack[i] - (intptr_t)symbol.Address;
+ if (offset)
+ Out << " +" << offset;
+ Out << Endl;
+ }
+}
+
+} // namespace NAllocProfiler
diff --git a/library/cpp/lfalloc/alloc_profiler/stackcollect.h b/library/cpp/lfalloc/alloc_profiler/stackcollect.h
new file mode 100644
index 0000000000..80715ed7cb
--- /dev/null
+++ b/library/cpp/lfalloc/alloc_profiler/stackcollect.h
@@ -0,0 +1,107 @@
+#pragma once
+
+#include <library/cpp/containers/stack_vector/stack_vec.h>
+#include <library/cpp/cache/cache.h>
+
+#include <util/generic/noncopyable.h>
+#include <util/generic/ptr.h>
+#include <util/stream/output.h>
+
+namespace NAllocProfiler {
+
+struct TStats {
+ intptr_t Allocs = 0;
+ intptr_t Frees = 0;
+ intptr_t CurrentSize = 0;
+
+ void Clear()
+ {
+ Allocs = 0;
+ Frees = 0;
+ CurrentSize = 0;
+ }
+
+ void Alloc(size_t size)
+ {
+ AtomicIncrement(Allocs);
+ AtomicAdd(CurrentSize, size);
+ }
+
+ void Free(size_t size)
+ {
+ AtomicIncrement(Frees);
+ AtomicSub(CurrentSize, size);
+ }
+};
+
+struct TAllocationInfo {
+ int Tag;
+ TStats Stats;
+ TStackVec<void*, 64> Stack;
+
+ void Clear() {
+ Tag = 0;
+ Stats.Clear();
+ Stack.clear();
+ }
+};
+
+
+class IAllocationStatsDumper {
+public:
+ virtual ~IAllocationStatsDumper() = default;
+
+ // Total stats
+ virtual void DumpTotal(const TStats& total) = 0;
+
+ // Stats for individual stack
+ virtual void DumpEntry(const TAllocationInfo& allocInfo) = 0;
+
+ // App-specific tag printer
+ virtual TString FormatTag(int tag);
+
+ // Size printer (e.g. "10KB", "100MB", "over 9000")
+ virtual TString FormatSize(intptr_t sz);
+};
+
+// Default implementation
+class TAllocationStatsDumper: public IAllocationStatsDumper {
+public:
+ explicit TAllocationStatsDumper(IOutputStream& out);
+ void DumpTotal(const TStats& total) override;
+ void DumpEntry(const TAllocationInfo& allocInfo) override;
+
+private:
+ void FormatBackTrace(void* const* stack, size_t sz);
+
+private:
+ struct TSymbol {
+ const void* Address;
+ TString Name;
+ };
+
+ size_t PrintedCount;
+ IOutputStream& Out;
+ TLFUCache<void*, TSymbol> SymbolCache;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+class TAllocationStackCollector: private TNonCopyable {
+private:
+ class TImpl;
+ THolder<TImpl> Impl;
+
+public:
+ TAllocationStackCollector();
+ ~TAllocationStackCollector();
+
+ int Alloc(void** stack, size_t frameCount, int tag, size_t size);
+ void Free(int stackId, size_t size);
+
+ void Clear();
+
+ void Dump(int count, IAllocationStatsDumper& out) const;
+};
+
+} // namespace NAllocProfiler
diff --git a/library/cpp/lfalloc/alloc_profiler/ut/ya.make b/library/cpp/lfalloc/alloc_profiler/ut/ya.make
new file mode 100644
index 0000000000..8a7daa74af
--- /dev/null
+++ b/library/cpp/lfalloc/alloc_profiler/ut/ya.make
@@ -0,0 +1,22 @@
+UNITTEST_FOR(library/cpp/lfalloc/alloc_profiler)
+
+OWNER(g:rtmr g:kikimr)
+
+PEERDIR(
+ library/cpp/testing/unittest
+)
+
+IF (ARCH_AARCH64)
+ PEERDIR(
+ contrib/libs/jemalloc
+ )
+ELSE()
+ ALLOCATOR(LF_DBG)
+ENDIF()
+
+SRCS(
+ profiler_ut.cpp
+ align_ut.cpp
+)
+
+END()
diff --git a/library/cpp/lfalloc/alloc_profiler/ya.make b/library/cpp/lfalloc/alloc_profiler/ya.make
new file mode 100644
index 0000000000..0f58d91767
--- /dev/null
+++ b/library/cpp/lfalloc/alloc_profiler/ya.make
@@ -0,0 +1,17 @@
+LIBRARY()
+
+OWNER(g:rtmr g:kikimr)
+
+SRCS(
+ profiler.cpp
+ stackcollect.cpp
+)
+
+PEERDIR(
+ library/cpp/lfalloc/dbg_info
+ library/cpp/cache
+)
+
+END()
+
+RECURSE(ut)
diff --git a/library/cpp/lfalloc/dbg/ya.make b/library/cpp/lfalloc/dbg/ya.make
new file mode 100644
index 0000000000..3dce653a8c
--- /dev/null
+++ b/library/cpp/lfalloc/dbg/ya.make
@@ -0,0 +1,32 @@
+LIBRARY()
+
+OWNER(vskipin)
+
+NO_UTIL()
+
+NO_COMPILER_WARNINGS()
+
+IF (ARCH_AARCH64)
+ PEERDIR(
+ contrib/libs/jemalloc
+ )
+ELSE()
+ IF ("${YMAKE}" MATCHES "devtools")
+ CFLAGS(-DYMAKE=1)
+ ENDIF()
+ CXXFLAGS(
+ -DLFALLOC_DBG
+ -DLFALLOC_YT
+ )
+ SRCS(
+ ../lf_allocX64.cpp
+ )
+ENDIF()
+
+PEERDIR(
+ library/cpp/malloc/api
+)
+
+SET(IDE_FOLDER "util")
+
+END()
diff --git a/library/cpp/lfalloc/dbg_info/dbg_info.cpp b/library/cpp/lfalloc/dbg_info/dbg_info.cpp
new file mode 100644
index 0000000000..1fb9f7ad93
--- /dev/null
+++ b/library/cpp/lfalloc/dbg_info/dbg_info.cpp
@@ -0,0 +1,124 @@
+#include "dbg_info.h"
+
+#include <library/cpp/malloc/api/malloc.h>
+
+namespace NAllocDbg {
+ ////////////////////////////////////////////////////////////////////////////////
+
+ using TGetAllocationCounter = i64(int counter);
+
+ using TSetThreadAllocTag = int(int tag);
+ using TGetPerTagAllocInfo = void(
+ bool flushPerThreadCounters,
+ TPerTagAllocInfo* info,
+ int& maxTag,
+ int& numSizes);
+
+ using TSetProfileCurrentThread = bool(bool newVal);
+ using TSetProfileAllThreads = bool(bool newVal);
+ using TSetAllocationSamplingEnabled = bool(bool newVal);
+
+ using TSetAllocationSampleRate = size_t(size_t newVal);
+ using TSetAllocationSampleMaxSize = size_t(size_t newVal);
+
+ using TSetAllocationCallback = TAllocationCallback*(TAllocationCallback* newVal);
+ using TSetDeallocationCallback = TDeallocationCallback*(TDeallocationCallback* newVal);
+
+ struct TAllocFn {
+ TGetAllocationCounter* GetAllocationCounterFast = nullptr;
+ TGetAllocationCounter* GetAllocationCounterFull = nullptr;
+
+ TSetThreadAllocTag* SetThreadAllocTag = nullptr;
+ TGetPerTagAllocInfo* GetPerTagAllocInfo = nullptr;
+
+ TSetProfileCurrentThread* SetProfileCurrentThread = nullptr;
+ TSetProfileAllThreads* SetProfileAllThreads = nullptr;
+ TSetAllocationSamplingEnabled* SetAllocationSamplingEnabled = nullptr;
+
+ TSetAllocationSampleRate* SetAllocationSampleRate = nullptr;
+ TSetAllocationSampleMaxSize* SetAllocationSampleMaxSize = nullptr;
+
+ TSetAllocationCallback* SetAllocationCallback = nullptr;
+ TSetDeallocationCallback* SetDeallocationCallback = nullptr;
+
+ TAllocFn() {
+ auto mallocInfo = NMalloc::MallocInfo();
+
+ GetAllocationCounterFast = (TGetAllocationCounter*)mallocInfo.GetParam("GetLFAllocCounterFast");
+ GetAllocationCounterFull = (TGetAllocationCounter*)mallocInfo.GetParam("GetLFAllocCounterFull");
+
+ SetThreadAllocTag = (TSetThreadAllocTag*)mallocInfo.GetParam("SetThreadAllocTag");
+ GetPerTagAllocInfo = (TGetPerTagAllocInfo*)mallocInfo.GetParam("GetPerTagAllocInfo");
+
+ SetProfileCurrentThread = (TSetProfileCurrentThread*)mallocInfo.GetParam("SetProfileCurrentThread");
+ SetProfileAllThreads = (TSetProfileAllThreads*)mallocInfo.GetParam("SetProfileAllThreads");
+ SetAllocationSamplingEnabled = (TSetAllocationSamplingEnabled*)mallocInfo.GetParam("SetAllocationSamplingEnabled");
+
+ SetAllocationSampleRate = (TSetAllocationSampleRate*)mallocInfo.GetParam("SetAllocationSampleRate");
+ SetAllocationSampleMaxSize = (TSetAllocationSampleMaxSize*)mallocInfo.GetParam("SetAllocationSampleMaxSize");
+
+ SetAllocationCallback = (TSetAllocationCallback*)mallocInfo.GetParam("SetAllocationCallback");
+ SetDeallocationCallback = (TSetDeallocationCallback*)mallocInfo.GetParam("SetDeallocationCallback");
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static TAllocFn AllocFn;
+
+ i64 GetAllocationCounterFast(ELFAllocCounter counter) {
+ return AllocFn.GetAllocationCounterFast ? AllocFn.GetAllocationCounterFast(counter) : 0;
+ }
+
+ i64 GetAllocationCounterFull(ELFAllocCounter counter) {
+ return AllocFn.GetAllocationCounterFull ? AllocFn.GetAllocationCounterFull(counter) : 0;
+ }
+
+ int SetThreadAllocTag(int tag) {
+ return AllocFn.SetThreadAllocTag ? AllocFn.SetThreadAllocTag(tag) : 0;
+ }
+
+ TArrayPtr<TPerTagAllocInfo> GetPerTagAllocInfo(
+ bool flushPerThreadCounters,
+ int& maxTag,
+ int& numSizes) {
+ if (AllocFn.GetPerTagAllocInfo) {
+ AllocFn.GetPerTagAllocInfo(flushPerThreadCounters, nullptr, maxTag, numSizes);
+ TArrayPtr<TPerTagAllocInfo> info = new TPerTagAllocInfo[maxTag * numSizes];
+ AllocFn.GetPerTagAllocInfo(flushPerThreadCounters, info.Get(), maxTag, numSizes);
+ return info;
+ }
+ maxTag = 0;
+ numSizes = 0;
+ return nullptr;
+ }
+
+ bool SetProfileCurrentThread(bool newVal) {
+ return AllocFn.SetProfileCurrentThread ? AllocFn.SetProfileCurrentThread(newVal) : false;
+ }
+
+ bool SetProfileAllThreads(bool newVal) {
+ return AllocFn.SetProfileAllThreads ? AllocFn.SetProfileAllThreads(newVal) : false;
+ }
+
+ bool SetAllocationSamplingEnabled(bool newVal) {
+ return AllocFn.SetAllocationSamplingEnabled ? AllocFn.SetAllocationSamplingEnabled(newVal) : false;
+ }
+
+ size_t SetAllocationSampleRate(size_t newVal) {
+ return AllocFn.SetAllocationSampleRate ? AllocFn.SetAllocationSampleRate(newVal) : 0;
+ }
+
+ size_t SetAllocationSampleMaxSize(size_t newVal) {
+ return AllocFn.SetAllocationSampleMaxSize ? AllocFn.SetAllocationSampleMaxSize(newVal) : 0;
+ }
+
+ TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) {
+ return AllocFn.SetAllocationCallback ? AllocFn.SetAllocationCallback(newVal) : nullptr;
+ }
+
+ TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) {
+ return AllocFn.SetDeallocationCallback ? AllocFn.SetDeallocationCallback(newVal) : nullptr;
+ }
+
+} // namespace NAllocDbg
diff --git a/library/cpp/lfalloc/dbg_info/dbg_info.h b/library/cpp/lfalloc/dbg_info/dbg_info.h
new file mode 100644
index 0000000000..071562a81a
--- /dev/null
+++ b/library/cpp/lfalloc/dbg_info/dbg_info.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include <util/generic/ptr.h>
+#include <util/system/types.h>
+
+namespace NAllocDbg {
+ ////////////////////////////////////////////////////////////////////////////////
+ // Allocation statistics
+
+ enum ELFAllocCounter {
+ CT_USER_ALLOC, // accumulated size requested by user code
+ CT_MMAP, // accumulated mmapped size
+ CT_MMAP_CNT, // number of mmapped regions
+ CT_MUNMAP, // accumulated unmmapped size
+ CT_MUNMAP_CNT, // number of munmaped regions
+ CT_SYSTEM_ALLOC, // accumulated allocated size for internal lfalloc needs
+ CT_SYSTEM_FREE, // accumulated deallocated size for internal lfalloc needs
+ CT_SMALL_ALLOC, // accumulated allocated size for fixed-size blocks
+ CT_SMALL_FREE, // accumulated deallocated size for fixed-size blocks
+ CT_LARGE_ALLOC, // accumulated allocated size for large blocks
+ CT_LARGE_FREE, // accumulated deallocated size for large blocks
+ CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations
+ CT_DEGRAGMENT_CNT, // number of memory defragmentations
+ CT_MAX
+ };
+
+ i64 GetAllocationCounterFast(ELFAllocCounter counter);
+ i64 GetAllocationCounterFull(ELFAllocCounter counter);
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Allocation statistics could be tracked on per-tag basis
+
+ int SetThreadAllocTag(int tag);
+
+ class TScopedTag {
+ private:
+ int PrevTag;
+
+ public:
+ explicit TScopedTag(int tag) {
+ PrevTag = SetThreadAllocTag(tag);
+ }
+
+ ~TScopedTag() {
+ SetThreadAllocTag(PrevTag);
+ }
+ };
+
+ struct TPerTagAllocInfo {
+ ssize_t Count;
+ ssize_t Size;
+ };
+
+ TArrayPtr<TPerTagAllocInfo> GetPerTagAllocInfo(
+ bool flushPerThreadCounters,
+ int& maxTag,
+ int& numSizes);
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Allocation sampling could be used to collect detailed information
+
+ bool SetProfileCurrentThread(bool newVal);
+ bool SetProfileAllThreads(bool newVal);
+ bool SetAllocationSamplingEnabled(bool newVal);
+
+ size_t SetAllocationSampleRate(size_t newVal);
+ size_t SetAllocationSampleMaxSize(size_t newVal);
+
+#define DBG_ALLOC_INVALID_COOKIE (-1)
+
+ using TAllocationCallback = int(int tag, size_t size, int sizeIdx);
+ using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx);
+
+ TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal);
+ TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal);
+
+} // namespace NAllocDbg
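A minimal sketch of consuming the per-tag statistics declared above, assuming the binary is linked against the lfalloc debug allocator (with the system allocator, the stubs in dbg_info.cpp return zeros/nullptr). This is illustrative and not part of the commit; the tag value and the tag-major layout of the returned matrix are assumptions.

#include <library/cpp/lfalloc/dbg_info/dbg_info.h>

#include <util/stream/output.h>

void DumpPerTagAllocStats() {
    {
        // Attribute allocations in this scope to tag 7 (arbitrary example value).
        NAllocDbg::TScopedTag tagGuard(7);
        char* buf = new char[4096];
        delete[] buf;
    }

    int maxTag = 0;
    int numSizes = 0;
    auto info = NAllocDbg::GetPerTagAllocInfo(/* flushPerThreadCounters = */ true, maxTag, numSizes);
    if (!info.Get()) {
        return; // allocator does not expose per-tag info
    }

    // The array holds maxTag * numSizes entries; tag-major indexing is assumed here.
    for (int tag = 0; tag < maxTag; ++tag) {
        for (int sizeIdx = 0; sizeIdx < numSizes; ++sizeIdx) {
            const auto& entry = info[tag * numSizes + sizeIdx];
            if (entry.Count != 0) {
                Cout << "tag=" << tag << " sizeIdx=" << sizeIdx
                     << " count=" << entry.Count << " size=" << entry.Size << Endl;
            }
        }
    }
}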
diff --git a/library/cpp/lfalloc/dbg_info/ya.make b/library/cpp/lfalloc/dbg_info/ya.make
new file mode 100644
index 0000000000..efecba5993
--- /dev/null
+++ b/library/cpp/lfalloc/dbg_info/ya.make
@@ -0,0 +1,15 @@
+LIBRARY()
+
+OWNER(vskipin)
+
+PEERDIR(
+ library/cpp/malloc/api
+)
+
+SRCS(
+ dbg_info.cpp
+)
+
+SET(IDE_FOLDER "util")
+
+END()
diff --git a/library/cpp/lfalloc/lf_allocX64.cpp b/library/cpp/lfalloc/lf_allocX64.cpp
new file mode 100644
index 0000000000..2eb90761fe
--- /dev/null
+++ b/library/cpp/lfalloc/lf_allocX64.cpp
@@ -0,0 +1,144 @@
+#include "lf_allocX64.h"
+
+//////////////////////////////////////////////////////////////////////////
+// hooks
+#if defined(USE_INTELCC) || defined(_darwin_) || defined(_freebsd_) || defined(_STLPORT_VERSION)
+#define OP_THROWNOTHING noexcept
+#else
+#define OP_THROWNOTHING
+#endif
+
+#ifndef _darwin_
+#if !defined(YMAKE)
+void* operator new(size_t size) {
+ return LFAlloc(size);
+}
+
+void* operator new(size_t size, const std::nothrow_t&) OP_THROWNOTHING {
+ return LFAlloc(size);
+}
+
+void operator delete(void* p) OP_THROWNOTHING {
+ LFFree(p);
+}
+
+void operator delete(void* p, const std::nothrow_t&) OP_THROWNOTHING {
+ LFFree(p);
+}
+
+void* operator new[](size_t size) {
+ return LFAlloc(size);
+}
+
+void* operator new[](size_t size, const std::nothrow_t&) OP_THROWNOTHING {
+ return LFAlloc(size);
+}
+
+void operator delete[](void* p) OP_THROWNOTHING {
+ LFFree(p);
+}
+
+void operator delete[](void* p, const std::nothrow_t&) OP_THROWNOTHING {
+ LFFree(p);
+}
+#endif
+
+//#ifndef _MSC_VER
+
+extern "C" void* malloc(size_t size) {
+ return LFAlloc(size);
+}
+
+extern "C" void* valloc(size_t size) {
+ return LFVAlloc(size);
+}
+
+extern "C" int posix_memalign(void** memptr, size_t alignment, size_t size) {
+ return LFPosixMemalign(memptr, alignment, size);
+}
+
+extern "C" void* memalign(size_t alignment, size_t size) {
+ void* ptr;
+ int res = LFPosixMemalign(&ptr, alignment, size);
+ return res ? nullptr : ptr;
+}
+
+extern "C" void* aligned_alloc(size_t alignment, size_t size) {
+ return memalign(alignment, size);
+}
+
+#if !defined(_MSC_VER) && !defined(_freebsd_)
+// Workaround for pthread_create bug in linux.
+extern "C" void* __libc_memalign(size_t alignment, size_t size) {
+ return memalign(alignment, size);
+}
+#endif
+
+extern "C" void free(void* ptr) {
+ LFFree(ptr);
+}
+
+extern "C" void* calloc(size_t n, size_t elem_size) {
+ // Overflow check
+ const size_t size = n * elem_size;
+ if (elem_size != 0 && size / elem_size != n)
+ return nullptr;
+
+ void* result = LFAlloc(size);
+ if (result != nullptr) {
+ memset(result, 0, size);
+ }
+ return result;
+}
+
+extern "C" void cfree(void* ptr) {
+ LFFree(ptr);
+}
+
+extern "C" void* realloc(void* old_ptr, size_t new_size) {
+ if (old_ptr == nullptr) {
+ void* result = LFAlloc(new_size);
+ return result;
+ }
+ if (new_size == 0) {
+ LFFree(old_ptr);
+ return nullptr;
+ }
+
+ void* new_ptr = LFAlloc(new_size);
+ if (new_ptr == nullptr) {
+ return nullptr;
+ }
+ size_t old_size = LFGetSize(old_ptr);
+ memcpy(new_ptr, old_ptr, ((old_size < new_size) ? old_size : new_size));
+ LFFree(old_ptr);
+ return new_ptr;
+}
+
+extern "C" size_t malloc_usable_size(void* ptr) {
+ if (ptr == nullptr) {
+ return 0;
+ }
+ return LFGetSize(ptr);
+}
+
+NMalloc::TMallocInfo NMalloc::MallocInfo() {
+ NMalloc::TMallocInfo r;
+#if defined(LFALLOC_DBG)
+ r.Name = "lfalloc_dbg";
+#elif defined(LFALLOC_YT)
+ r.Name = "lfalloc_yt";
+#else
+ r.Name = "lfalloc";
+#endif
+ r.SetParam = &LFAlloc_SetParam;
+ r.GetParam = &LFAlloc_GetParam;
+ return r;
+}
+#else
+NMalloc::TMallocInfo NMalloc::MallocInfo() {
+ NMalloc::TMallocInfo r;
+ r.Name = "system-darwin";
+ return r;
+}
+#endif
diff --git a/library/cpp/lfalloc/lf_allocX64.h b/library/cpp/lfalloc/lf_allocX64.h
new file mode 100644
index 0000000000..fd2a906d6f
--- /dev/null
+++ b/library/cpp/lfalloc/lf_allocX64.h
@@ -0,0 +1,1926 @@
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+
+#include <library/cpp/malloc/api/malloc.h>
+
+#include <util/system/compat.h>
+#include <util/system/compiler.h>
+#include <util/system/types.h>
+
+#ifdef _MSC_VER
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+#ifdef _M_X64
+#define _64_
+#endif
+#include <intrin.h>
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#pragma intrinsic(_InterlockedCompareExchange)
+#pragma intrinsic(_InterlockedExchangeAdd)
+
+#include <new>
+#include <assert.h>
+#include <errno.h>
+
+#define PERTHREAD __declspec(thread)
+#define _win_
+#define Y_FORCE_INLINE __forceinline
+
+using TAtomic = volatile long;
+
+static inline long AtomicAdd(TAtomic& a, long b) {
+ return _InterlockedExchangeAdd(&a, b) + b;
+}
+
+static inline long AtomicSub(TAtomic& a, long b) {
+ return AtomicAdd(a, -b);
+}
+
+#pragma comment(lib, "synchronization.lib")
+
+#ifndef NDEBUG
+#define Y_ASSERT_NOBT(x) \
+ { \
+ if (IsDebuggerPresent()) { \
+ if (!(x)) \
+ __debugbreak(); \
+ } else \
+ assert(x); \
+ }
+#else
+#define Y_ASSERT_NOBT(x) ((void)0)
+#endif
+
+#else
+
+#include <util/system/defaults.h>
+#include <util/system/atomic.h>
+#include <util/system/yassert.h>
+
+#if !defined(NDEBUG) && !defined(__GCCXML__)
+#define Y_ASSERT_NOBT(a) \
+ do { \
+ try { \
+ if (Y_UNLIKELY(!(a))) { \
+ if (YaIsDebuggerPresent()) \
+ __debugbreak(); \
+ else { \
+ assert(false && (a)); \
+ } \
+ } \
+ } catch (...) { \
+ if (YaIsDebuggerPresent()) \
+ __debugbreak(); \
+ else { \
+ assert(false && "Exception during assert"); \
+ } \
+ } \
+ } while (0)
+#else
+#define Y_ASSERT_NOBT(a) \
+ do { \
+ if (false) { \
+ bool __xxx = static_cast<bool>(a); \
+ Y_UNUSED(__xxx); \
+ } \
+ } while (0)
+#endif
+
+#include <pthread.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <new>
+#include <errno.h>
+
+#if defined(_linux_)
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#if !defined(MADV_HUGEPAGE)
+#define MADV_HUGEPAGE 14
+#endif
+#if !defined(MAP_HUGETLB)
+#define MAP_HUGETLB 0x40000
+#endif
+#endif
+
+#define PERTHREAD __thread
+
+#endif
+
+#ifndef _darwin_
+
+#ifndef Y_ARRAY_SIZE
+#define Y_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+#endif
+
+#ifndef NDEBUG
+#define DBG_FILL_MEMORY
+static bool FillMemoryOnAllocation = true;
+#endif
+
+static bool TransparentHugePages = false; // force MADV_HUGEPAGE for large allocs
+static bool MapHugeTLB = false; // force MAP_HUGETLB for small allocs
+static bool EnableDefrag = true;
+
+// Buffers that are larger than this size will not be filled with 0xcf
+#ifndef DBG_FILL_MAX_SIZE
+#define DBG_FILL_MAX_SIZE 0x01000000000000ULL
+#endif
+
+template <class T>
+inline T* DoCas(T* volatile* target, T* exchange, T* compare) {
+#if defined(__has_builtin) && __has_builtin(__sync_val_compare_and_swap)
+ return __sync_val_compare_and_swap(target, compare, exchange);
+#elif defined(_WIN32)
+#ifdef _64_
+ return (T*)_InterlockedCompareExchange64((__int64*)target, (__int64)exchange, (__int64)compare);
+#else
+ //return (T*)InterlockedCompareExchangePointer(targetVoidP, exchange, compare);
+ return (T*)_InterlockedCompareExchange((LONG*)target, (LONG)exchange, (LONG)compare);
+#endif
+#elif defined(__i386) || defined(__x86_64__)
+ union {
+ T* volatile* NP;
+ void* volatile* VoidP;
+ } gccSucks;
+ gccSucks.NP = target;
+ void* volatile* targetVoidP = gccSucks.VoidP;
+
+ __asm__ __volatile__(
+ "lock\n\t"
+ "cmpxchg %2,%0\n\t"
+ : "+m"(*(targetVoidP)), "+a"(compare)
+ : "r"(exchange)
+ : "cc", "memory");
+ return compare;
+#else
+#error inline_cas not defined for this platform
+#endif
+}
+
+#ifdef _64_
+const uintptr_t N_MAX_WORKSET_SIZE = 0x100000000ll * 200;
+const uintptr_t N_HUGE_AREA_FINISH = 0x700000000000ll;
+#ifndef _freebsd_
+const uintptr_t LINUX_MMAP_AREA_START = 0x100000000ll;
+static uintptr_t volatile linuxAllocPointer = LINUX_MMAP_AREA_START;
+static uintptr_t volatile linuxAllocPointerHuge = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE;
+#endif
+#else
+const uintptr_t N_MAX_WORKSET_SIZE = 0xffffffff;
+#endif
+#define ALLOC_START ((char*)0)
+
+const size_t N_CHUNK_SIZE = 1024 * 1024;
+const size_t N_CHUNKS = N_MAX_WORKSET_SIZE / N_CHUNK_SIZE;
+const size_t N_LARGE_ALLOC_SIZE = N_CHUNK_SIZE * 128;
+
+// map size idx to size in bytes
+#ifdef LFALLOC_YT
+const int N_SIZES = 27;
+#else
+const int N_SIZES = 25;
+#endif
+const int nSizeIdxToSize[N_SIZES] = {
+ -1,
+#if defined(_64_)
+ 16, 16, 32, 32, 48, 64, 96, 128,
+#else
+ 8,
+ 16,
+ 24,
+ 32,
+ 48,
+ 64,
+ 96,
+ 128,
+#endif
+ 192, 256, 384, 512, 768, 1024, 1536, 2048,
+ 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768,
+#ifdef LFALLOC_YT
+ 49152, 65536
+#endif
+};
+#ifdef LFALLOC_YT
+const size_t N_MAX_FAST_SIZE = 65536;
+#else
+const size_t N_MAX_FAST_SIZE = 32768;
+#endif
+const unsigned char size2idxArr1[64 + 1] = {
+ 1,
+#if defined(_64_)
+ 2, 2, 4, 4, // 16, 16, 32, 32
+#else
+ 1, 2, 3, 4, // 8, 16, 24, 32
+#endif
+ 5, 5, 6, 6, // 48, 64
+ 7, 7, 7, 7, 8, 8, 8, 8, // 96, 128
+ 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, // 192, 256
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // 384
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12 // 512
+};
+#ifdef LFALLOC_YT
+const unsigned char size2idxArr2[256] = {
+#else
+const unsigned char size2idxArr2[128] = {
+#endif
+ 12, 12, 13, 14, // 512, 512, 768, 1024
+ 15, 15, 16, 16, // 1536, 2048
+ 17, 17, 17, 17, 18, 18, 18, 18, // 3072, 4096
+ 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, // 6144, 8192
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, // 12288
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, // 16384
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, // 24576
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, // 32768
+#ifdef LFALLOC_YT
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, // 49152
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, // 65536
+#endif
+};
+
+// map entry number to size idx
+// special size idx's: 0 = not used, -1 = mem locked, but not allocated
+static volatile char chunkSizeIdx[N_CHUNKS];
+const int FREE_CHUNK_ARR_BUF = 0x20000; // this is effectively 128G of free memory (with 1M chunks), it should never be exhausted in practice
+static volatile uintptr_t freeChunkArr[FREE_CHUNK_ARR_BUF];
+static volatile int freeChunkCount;
+
+static void AddFreeChunk(uintptr_t chunkId) {
+ chunkSizeIdx[chunkId] = -1;
+ if (Y_UNLIKELY(freeChunkCount == FREE_CHUNK_ARR_BUF))
+ NMalloc::AbortFromCorruptedAllocator("free chunks array overflowed");
+ freeChunkArr[freeChunkCount++] = chunkId;
+}
+
+static bool GetFreeChunk(uintptr_t* res) {
+ if (freeChunkCount == 0) {
+ *res = 0;
+ return false;
+ }
+ *res = freeChunkArr[--freeChunkCount];
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+enum ELFAllocCounter {
+ CT_USER_ALLOC, // accumulated size requested by user code
+ CT_MMAP, // accumulated mmapped size
+ CT_MMAP_CNT, // number of mmapped regions
+ CT_MUNMAP, // accumulated unmmapped size
+ CT_MUNMAP_CNT, // number of munmaped regions
+ CT_SYSTEM_ALLOC, // accumulated allocated size for internal lfalloc needs
+ CT_SYSTEM_FREE, // accumulated deallocated size for internal lfalloc needs
+ CT_SMALL_ALLOC, // accumulated allocated size for fixed-size blocks
+ CT_SMALL_FREE, // accumulated deallocated size for fixed-size blocks
+ CT_LARGE_ALLOC, // accumulated allocated size for large blocks
+ CT_LARGE_FREE, // accumulated deallocated size for large blocks
+ CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations
+ CT_DEGRAGMENT_CNT, // number of memory defragmentations
+ CT_MAX
+};
+
+static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value);
+
+//////////////////////////////////////////////////////////////////////////
+enum EMMapMode {
+ MM_NORMAL, // memory for small allocs
+ MM_HUGE // memory for large allocs
+};
+
+#ifndef _MSC_VER
+inline void VerifyMmapResult(void* result) {
+ if (Y_UNLIKELY(result == MAP_FAILED))
+ NMalloc::AbortFromCorruptedAllocator("negative size requested? or just out of mem");
+}
+#endif
+
+#if !defined(_MSC_VER) && !defined(_freebsd_) && defined(_64_)
+static char* AllocWithMMapLinuxImpl(uintptr_t sz, EMMapMode mode) {
+ char* volatile* areaPtr;
+ char* areaStart;
+ uintptr_t areaFinish;
+
+ int mapProt = PROT_READ | PROT_WRITE;
+ int mapFlags = MAP_PRIVATE | MAP_ANON;
+
+ if (mode == MM_HUGE) {
+ areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointerHuge);
+ areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE);
+ areaFinish = N_HUGE_AREA_FINISH;
+ } else {
+ areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointer);
+ areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START);
+ areaFinish = N_MAX_WORKSET_SIZE;
+
+ if (MapHugeTLB) {
+ mapFlags |= MAP_HUGETLB;
+ }
+ }
+
+ bool wrapped = false;
+ for (;;) {
+ char* prevAllocPtr = *areaPtr;
+ char* nextAllocPtr = prevAllocPtr + sz;
+ if (uintptr_t(nextAllocPtr - (char*)nullptr) >= areaFinish) {
+ if (Y_UNLIKELY(wrapped)) {
+ NMalloc::AbortFromCorruptedAllocator("virtual memory is over fragmented");
+ }
+ // wrap after all area is used
+ DoCas(areaPtr, areaStart, prevAllocPtr);
+ wrapped = true;
+ continue;
+ }
+
+ if (DoCas(areaPtr, nextAllocPtr, prevAllocPtr) != prevAllocPtr)
+ continue;
+
+ char* largeBlock = (char*)mmap(prevAllocPtr, sz, mapProt, mapFlags, -1, 0);
+ VerifyMmapResult(largeBlock);
+ if (largeBlock == prevAllocPtr)
+ return largeBlock;
+ if (largeBlock)
+ munmap(largeBlock, sz);
+
+ if (sz < 0x80000) {
+ // skip utilized area with big steps
+ DoCas(areaPtr, nextAllocPtr + 0x10 * 0x10000, nextAllocPtr);
+ }
+ }
+}
+#endif
+
+static char* AllocWithMMap(uintptr_t sz, EMMapMode mode) {
+ (void)mode;
+#ifdef _MSC_VER
+ char* largeBlock = (char*)VirtualAlloc(0, sz, MEM_RESERVE, PAGE_READWRITE);
+ if (Y_UNLIKELY(largeBlock == nullptr))
+ NMalloc::AbortFromCorruptedAllocator("out of memory");
+ if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE))
+ NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken");
+#else
+#if defined(_freebsd_) || !defined(_64_)
+ char* largeBlock = (char*)mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ VerifyMmapResult(largeBlock);
+ if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE))
+ NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken");
+#else
+ char* largeBlock = AllocWithMMapLinuxImpl(sz, mode);
+ if (TransparentHugePages) {
+ madvise(largeBlock, sz, MADV_HUGEPAGE);
+ }
+#endif
+#endif
+ Y_ASSERT_NOBT(largeBlock);
+ IncrementCounter(CT_MMAP, sz);
+ IncrementCounter(CT_MMAP_CNT, 1);
+ return largeBlock;
+}
+
+enum class ELarge : ui8 {
+ Free = 0, // block in free cache
+ Alloc = 1, // block is allocated
+ Gone = 2, // block was unmapped
+};
+
+struct TLargeBlk {
+
+ static TLargeBlk* As(void *raw) {
+ return reinterpret_cast<TLargeBlk*>((char*)raw - 4096ll);
+ }
+
+ static const TLargeBlk* As(const void *raw) {
+ return reinterpret_cast<const TLargeBlk*>((const char*)raw - 4096ll);
+ }
+
+ void SetSize(size_t bytes, size_t pages) {
+ Pages = pages;
+ Bytes = bytes;
+ }
+
+ void Mark(ELarge state) {
+ const ui64 marks[] = {
+ 0x8b38aa5ca4953c98, // ELarge::Free
+ 0xf916d33584eb5087, // ELarge::Alloc
+ 0xd33b0eca7651bc3f // ELarge::Gone
+ };
+
+ Token = size_t(marks[ui8(state)]);
+ }
+
+ size_t Pages; // Total pages allocated with mmap like call
+ size_t Bytes; // Actually requested bytes by user
+ size_t Token; // Block state token, see ELarge enum.
+};
+
+
+static void LargeBlockUnmap(void* p, size_t pages) {
+ const auto bytes = (pages + 1) * uintptr_t(4096);
+
+ IncrementCounter(CT_MUNMAP, bytes);
+ IncrementCounter(CT_MUNMAP_CNT, 1);
+#ifdef _MSC_VER
+ Y_ASSERT_NOBT(0);
+#else
+ TLargeBlk::As(p)->Mark(ELarge::Gone);
+ munmap((char*)p - 4096ll, bytes);
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+const size_t LB_BUF_SIZE = 250;
+const size_t LB_BUF_HASH = 977;
+static int LB_LIMIT_TOTAL_SIZE = 500 * 1024 * 1024 / 4096; // do not keep more than this much memory total in lbFreePtrs[]
+static void* volatile lbFreePtrs[LB_BUF_HASH][LB_BUF_SIZE];
+static TAtomic lbFreePageCount;
+
+
+static void* LargeBlockAlloc(size_t _nSize, ELFAllocCounter counter) {
+ size_t pgCount = (_nSize + 4095) / 4096;
+#ifdef _MSC_VER
+ char* pRes = (char*)VirtualAlloc(0, (pgCount + 1) * 4096ll, MEM_COMMIT, PAGE_READWRITE);
+ if (Y_UNLIKELY(pRes == 0)) {
+ NMalloc::AbortFromCorruptedAllocator("out of memory");
+ }
+#else
+
+ IncrementCounter(counter, pgCount * 4096ll);
+ IncrementCounter(CT_SYSTEM_ALLOC, 4096ll);
+
+ int lbHash = pgCount % LB_BUF_HASH;
+ for (int i = 0; i < LB_BUF_SIZE; ++i) {
+ void* p = lbFreePtrs[lbHash][i];
+ if (p == nullptr)
+ continue;
+ if (DoCas(&lbFreePtrs[lbHash][i], (void*)nullptr, p) == p) {
+ size_t realPageCount = TLargeBlk::As(p)->Pages;
+ if (realPageCount == pgCount) {
+ AtomicAdd(lbFreePageCount, -pgCount);
+ TLargeBlk::As(p)->Mark(ELarge::Alloc);
+ return p;
+ } else {
+ if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) != (void*)nullptr) {
+ // block was freed while we were busy
+ AtomicAdd(lbFreePageCount, -realPageCount);
+ LargeBlockUnmap(p, realPageCount);
+ --i;
+ }
+ }
+ }
+ }
+ char* pRes = AllocWithMMap((pgCount + 1) * 4096ll, MM_HUGE);
+#endif
+ pRes += 4096ll;
+ TLargeBlk::As(pRes)->SetSize(_nSize, pgCount);
+ TLargeBlk::As(pRes)->Mark(ELarge::Alloc);
+
+ return pRes;
+}
+
+#ifndef _MSC_VER
+static void FreeAllLargeBlockMem() {
+ for (auto& lbFreePtr : lbFreePtrs) {
+ for (int i = 0; i < LB_BUF_SIZE; ++i) {
+ void* p = lbFreePtr[i];
+ if (p == nullptr)
+ continue;
+ if (DoCas(&lbFreePtr[i], (void*)nullptr, p) == p) {
+ int pgCount = TLargeBlk::As(p)->Pages;
+ AtomicAdd(lbFreePageCount, -pgCount);
+ LargeBlockUnmap(p, pgCount);
+ }
+ }
+ }
+}
+#endif
+
+static void LargeBlockFree(void* p, ELFAllocCounter counter) {
+ if (p == nullptr)
+ return;
+#ifdef _MSC_VER
+ VirtualFree((char*)p - 4096ll, 0, MEM_RELEASE);
+#else
+ size_t pgCount = TLargeBlk::As(p)->Pages;
+
+ TLargeBlk::As(p)->Mark(ELarge::Free);
+ IncrementCounter(counter, pgCount * 4096ll);
+ IncrementCounter(CT_SYSTEM_FREE, 4096ll);
+
+ if (lbFreePageCount > LB_LIMIT_TOTAL_SIZE)
+ FreeAllLargeBlockMem();
+ int lbHash = pgCount % LB_BUF_HASH;
+ for (int i = 0; i < LB_BUF_SIZE; ++i) {
+ if (lbFreePtrs[lbHash][i] == nullptr) {
+ if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) == nullptr) {
+ AtomicAdd(lbFreePageCount, pgCount);
+ return;
+ }
+ }
+ }
+
+ LargeBlockUnmap(p, pgCount);
+#endif
+}
+
+static void* SystemAlloc(size_t _nSize) {
+ //HeapAlloc(GetProcessHeap(), HEAP_GENERATE_EXCEPTIONS, _nSize);
+ return LargeBlockAlloc(_nSize, CT_SYSTEM_ALLOC);
+}
+static void SystemFree(void* p) {
+ //HeapFree(GetProcessHeap(), 0, p);
+ LargeBlockFree(p, CT_SYSTEM_FREE);
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+char* const LF_LOCK_FREE = ((char*)0) + 0;
+char* const LF_LOCK_LOCKED = ((char*)0) + 1;
+char* const LF_LOCK_FUTEX_WAIT = ((char*)0) + 2;
+static bool LFHasFutex = true;
+static bool LFCheckedWinVersion = false;
+
+// TLFLockData has to be zero-initialized explicitly https://en.cppreference.com/w/cpp/language/zero_initialization
+// otherwise the TLFLockData() constructor for the global variable might run after its first use
+struct TLFLockData
+{
+ char* Pad1[15];
+ char* volatile LockVar; // = LF_LOCK_FREE; // no constructor, zero-initialize manually
+ char* Pad2[15];
+
+ bool TryLock()
+ {
+ return (LockVar == LF_LOCK_FREE && DoCas(&LockVar, LF_LOCK_LOCKED, LF_LOCK_FREE) == LF_LOCK_FREE);
+ }
+
+ void FutexWait()
+ {
+#ifdef _win_
+ if (!LFCheckedWinVersion) {
+ OSVERSIONINFOA ver;
+ memset(&ver, 0, sizeof(ver));
+ ver.dwOSVersionInfoSize = sizeof(OSVERSIONINFOA);
+ GetVersionExA(&ver);
+ LFHasFutex = (ver.dwMajorVersion > 6) || (ver.dwMajorVersion == 6 && ver.dwMinorVersion >= 2);
+ LFCheckedWinVersion = true;
+ }
+ if (LFHasFutex) {
+ if (LockVar == LF_LOCK_LOCKED) {
+ DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED);
+ }
+ if (LockVar == LF_LOCK_FUTEX_WAIT) {
+ char* lockedValue = LF_LOCK_FUTEX_WAIT;
+ WaitOnAddress(&LockVar, &lockedValue, sizeof(LockVar), INFINITE);
+ }
+ } else {
+ SwitchToThread();
+ }
+#elif defined(_linux_)
+ if (LFHasFutex) {
+ if (LockVar == LF_LOCK_LOCKED) {
+ DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED);
+ }
+ if (LockVar == LF_LOCK_FUTEX_WAIT) {
+ // Linux futexes allow waiting only on int-sized variables, so here we pretend the low bits of LockVar are an int
+ syscall(SYS_futex, &LockVar, FUTEX_WAIT_PRIVATE, *(int*)&LF_LOCK_FUTEX_WAIT, 0, 0, 0);
+ }
+ } else {
+ sched_yield();
+ }
+#else
+ sched_yield();
+#endif
+ }
+
+ void Unlock()
+ {
+ Y_ASSERT_NOBT(LockVar != LF_LOCK_FREE);
+ if (DoCas(&LockVar, LF_LOCK_FREE, LF_LOCK_LOCKED) != LF_LOCK_LOCKED) {
+ Y_ASSERT_NOBT(LockVar == LF_LOCK_FUTEX_WAIT && LFHasFutex);
+ LockVar = LF_LOCK_FREE;
+#ifdef _win_
+ WakeByAddressAll((PVOID)&LockVar);
+#elif defined(_linux_)
+ syscall(SYS_futex, &LockVar, FUTEX_WAKE_PRIVATE, INT_MAX, 0, 0, 0);
+#endif
+ }
+ }
+};
+
+static TLFLockData LFGlobalLock;
+
+
+class TLFLockHolder {
+ TLFLockData *LockData = nullptr;
+ int Attempt = 0;
+ int SleepMask = 0x7f;
+
+public:
+ TLFLockHolder() {}
+ TLFLockHolder(TLFLockData *lk) {
+ while (!TryLock(lk));
+ }
+ bool TryLock(TLFLockData *lk)
+ {
+ Y_ASSERT_NOBT(LockData == nullptr);
+ if (lk->TryLock()) {
+ LockData = lk;
+ return true;
+ }
+ if ((++Attempt & SleepMask) == 0) {
+ lk->FutexWait();
+ SleepMask = (SleepMask * 2 + 1) & 0x7fff;
+ } else {
+#ifdef _MSC_VER
+ _mm_pause();
+#elif defined(__i386) || defined(__x86_64__)
+ __asm__ __volatile__("pause");
+#endif
+ }
+ return false;
+ }
+ ~TLFLockHolder() {
+ if (LockData) {
+ LockData->Unlock();
+ }
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+class TLFAllocFreeList {
+ struct TNode {
+ TNode* Next;
+ };
+
+ TNode* volatile Head;
+ TNode* volatile Pending;
+ TAtomic PendingToFreeListCounter;
+ TAtomic AllocCount;
+ void* Padding;
+
+ static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) {
+ for (;;) {
+ TNode* volatile prevHead = *headPtr;
+ n->Next = prevHead;
+ if (DoCas(headPtr, n, prevHead) == prevHead)
+ break;
+ }
+ }
+ Y_FORCE_INLINE void* DoAlloc() {
+ TNode* res;
+ for (res = Head; res; res = Head) {
+ TNode* keepNext = res->Next;
+ if (DoCas(&Head, keepNext, res) == res) {
+ //Y_VERIFY(keepNext == res->Next);
+ break;
+ }
+ }
+ return res;
+ }
+ void FreeList(TNode* fl) {
+ if (!fl)
+ return;
+ TNode* flTail = fl;
+ while (flTail->Next)
+ flTail = flTail->Next;
+ for (;;) {
+ TNode* volatile prevHead = Head;
+ flTail->Next = prevHead;
+ if (DoCas(&Head, fl, prevHead) == prevHead)
+ break;
+ }
+ }
+
+public:
+ Y_FORCE_INLINE void Free(void* ptr) {
+ TNode* newFree = (TNode*)ptr;
+ if (AtomicAdd(AllocCount, 0) == 0)
+ Enqueue(&Head, newFree);
+ else
+ Enqueue(&Pending, newFree);
+ }
+ Y_FORCE_INLINE void* Alloc() {
+ TAtomic keepCounter = AtomicAdd(PendingToFreeListCounter, 0);
+ TNode* fl = Pending;
+ if (AtomicAdd(AllocCount, 1) == 1) {
+ // No other allocs in progress.
+ // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads.
+ // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList
+ if (fl && keepCounter == AtomicAdd(PendingToFreeListCounter, 0) && DoCas(&Pending, (TNode*)nullptr, fl) == fl) {
+ // pick first element from Pending and return it
+ void* res = fl;
+ fl = fl->Next;
+ // if there are other elements in Pending list, add them to main free list
+ FreeList(fl);
+ AtomicAdd(PendingToFreeListCounter, 1);
+ AtomicAdd(AllocCount, -1);
+ return res;
+ }
+ }
+ void* res = DoAlloc();
+ AtomicAdd(AllocCount, -1);
+ return res;
+ }
+ void* GetWholeList() {
+ TNode* res;
+ for (res = Head; res; res = Head) {
+ if (DoCas(&Head, (TNode*)nullptr, res) == res)
+ break;
+ }
+ return res;
+ }
+ void ReturnWholeList(void* ptr) {
+ while (AtomicAdd(AllocCount, 0) != 0) // theoretically can run into problems with parallel DoAlloc()
+ ; //ThreadYield();
+ for (;;) {
+ TNode* prevHead = Head;
+ if (DoCas(&Head, (TNode*)ptr, prevHead) == prevHead) {
+ FreeList(prevHead);
+ break;
+ }
+ }
+ }
+};
+
+/////////////////////////////////////////////////////////////////////////
+static TLFAllocFreeList globalFreeLists[N_SIZES];
+static char* volatile globalCurrentPtr[N_SIZES];
+static TLFAllocFreeList blockFreeList;
+
+// globalFreeLists[] contains TFreeListGroup, each of them points up to 15 free blocks
+const int FL_GROUP_SIZE = 15;
+struct TFreeListGroup {
+ TFreeListGroup* Next;
+ char* Ptrs[FL_GROUP_SIZE];
+};
+#ifdef _64_
+const int FREE_LIST_GROUP_SIZEIDX = 8;
+#else
+const int FREE_LIST_GROUP_SIZEIDX = 6;
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+// find free chunks and reset chunk size so they can be reused by different sized allocations
+// do not look at blockFreeList (TFreeListGroup has same size for any allocations)
+static bool DefragmentMem() {
+ if (!EnableDefrag) {
+ return false;
+ }
+
+ IncrementCounter(CT_DEGRAGMENT_CNT, 1);
+
+ int* nFreeCount = (int*)SystemAlloc(N_CHUNKS * sizeof(int));
+ if (Y_UNLIKELY(!nFreeCount)) {
+ //__debugbreak();
+ NMalloc::AbortFromCorruptedAllocator("debugbreak");
+ }
+ memset(nFreeCount, 0, N_CHUNKS * sizeof(int));
+
+ TFreeListGroup* wholeLists[N_SIZES];
+ for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) {
+ wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList();
+ for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) {
+ for (auto pData : g->Ptrs) {
+ if (pData) {
+ uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE;
+ ++nFreeCount[nChunk];
+ Y_ASSERT_NOBT(chunkSizeIdx[nChunk] == nSizeIdx);
+ }
+ }
+ }
+ }
+
+ bool bRes = false;
+ for (size_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) {
+ int fc = nFreeCount[nChunk];
+ nFreeCount[nChunk] = 0;
+ if (chunkSizeIdx[nChunk] <= 0)
+ continue;
+ int nEntries = N_CHUNK_SIZE / nSizeIdxToSize[static_cast<int>(chunkSizeIdx[nChunk])];
+ Y_ASSERT_NOBT(fc <= nEntries); // cannot have more free blocks than the total count
+ if (fc == nEntries) {
+ bRes = true;
+ nFreeCount[nChunk] = 1;
+ }
+ }
+ if (bRes) {
+ for (auto& wholeList : wholeLists) {
+ TFreeListGroup** ppPtr = &wholeList;
+ while (*ppPtr) {
+ TFreeListGroup* g = *ppPtr;
+ int dst = 0;
+ for (auto pData : g->Ptrs) {
+ if (pData) {
+ uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE;
+ if (nFreeCount[nChunk] == 0)
+ g->Ptrs[dst++] = pData; // block is not freed, keep pointer
+ }
+ }
+ if (dst == 0) {
+ // no valid pointers in group, free it
+ *ppPtr = g->Next;
+ blockFreeList.Free(g);
+ } else {
+ // reset invalid pointers to 0
+ for (int i = dst; i < FL_GROUP_SIZE; ++i)
+ g->Ptrs[i] = nullptr;
+ ppPtr = &g->Next;
+ }
+ }
+ }
+ for (uintptr_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) {
+ if (!nFreeCount[nChunk])
+ continue;
+ char* pStart = ALLOC_START + nChunk * N_CHUNK_SIZE;
+#ifdef _win_
+ VirtualFree(pStart, N_CHUNK_SIZE, MEM_DECOMMIT);
+#elif defined(_freebsd_)
+ madvise(pStart, N_CHUNK_SIZE, MADV_FREE);
+#else
+ madvise(pStart, N_CHUNK_SIZE, MADV_DONTNEED);
+#endif
+ AddFreeChunk(nChunk);
+ }
+ }
+
+ for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx)
+ globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]);
+
+ SystemFree(nFreeCount);
+ return bRes;
+}
+
+static Y_FORCE_INLINE void* LFAllocFromCurrentChunk(int nSizeIdx, int blockSize, int count) {
+ char* volatile* pFreeArray = &globalCurrentPtr[nSizeIdx];
+ while (char* newBlock = *pFreeArray) {
+ char* nextFree = newBlock + blockSize * count;
+
+ // check if there is space in chunk
+ char* globalEndPtr = ALLOC_START + ((newBlock - ALLOC_START) & ~((uintptr_t)N_CHUNK_SIZE - 1)) + N_CHUNK_SIZE;
+ if (nextFree >= globalEndPtr) {
+ if (nextFree > globalEndPtr)
+ break;
+ nextFree = nullptr; // it was last block in chunk
+ }
+ if (DoCas(pFreeArray, nextFree, newBlock) == newBlock)
+ return newBlock;
+ }
+ return nullptr;
+}
+
+enum EDefrag {
+ MEM_DEFRAG,
+ NO_MEM_DEFRAG,
+};
+
+static void* SlowLFAlloc(int nSizeIdx, int blockSize, EDefrag defrag) {
+ IncrementCounter(CT_SLOW_ALLOC_CNT, 1);
+
+ TLFLockHolder ls;
+ for (;;) {
+ bool locked = ls.TryLock(&LFGlobalLock);
+ void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1);
+ if (res) {
+ return res; // might happen when other thread allocated new current chunk
+ }
+ if (locked) {
+ break;
+ }
+ }
+ for (;;) {
+ uintptr_t nChunk;
+ if (GetFreeChunk(&nChunk)) {
+ char* newPlace = ALLOC_START + nChunk * N_CHUNK_SIZE;
+#ifdef _MSC_VER
+ void* pTest = VirtualAlloc(newPlace, N_CHUNK_SIZE, MEM_COMMIT, PAGE_READWRITE);
+ Y_ASSERT_NOBT(pTest == newPlace);
+#endif
+ chunkSizeIdx[nChunk] = (char)nSizeIdx;
+ globalCurrentPtr[nSizeIdx] = newPlace + blockSize;
+ return newPlace;
+ }
+
+ // out of luck, try to defrag
+ if (defrag == MEM_DEFRAG && DefragmentMem()) {
+ continue;
+ }
+
+ char* largeBlock = AllocWithMMap(N_LARGE_ALLOC_SIZE, MM_NORMAL);
+ uintptr_t addr = ((largeBlock - ALLOC_START) + N_CHUNK_SIZE - 1) & (~(N_CHUNK_SIZE - 1));
+ uintptr_t endAddr = ((largeBlock - ALLOC_START) + N_LARGE_ALLOC_SIZE) & (~(N_CHUNK_SIZE - 1));
+ for (uintptr_t p = addr; p < endAddr; p += N_CHUNK_SIZE) {
+ uintptr_t chunk = p / N_CHUNK_SIZE;
+ Y_ASSERT_NOBT(chunk * N_CHUNK_SIZE == p);
+ Y_ASSERT_NOBT(chunkSizeIdx[chunk] == 0);
+ AddFreeChunk(chunk);
+ }
+ }
+ return nullptr;
+}
+
+// allocate single block
+static Y_FORCE_INLINE void* LFAllocNoCache(int nSizeIdx, EDefrag defrag) {
+ int blockSize = nSizeIdxToSize[nSizeIdx];
+ void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1);
+ if (res)
+ return res;
+
+ return SlowLFAlloc(nSizeIdx, blockSize, defrag);
+}
+
+// allocate multiple blocks, returns number of blocks allocated (max FL_GROUP_SIZE)
+// buf should have space for at least FL_GROUP_SIZE elems
+static Y_FORCE_INLINE int LFAllocNoCacheMultiple(int nSizeIdx, char** buf) {
+ int blockSize = nSizeIdxToSize[nSizeIdx];
+ void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, FL_GROUP_SIZE);
+ if (res) {
+ char* resPtr = (char*)res;
+ for (int k = 0; k < FL_GROUP_SIZE; ++k) {
+ buf[k] = resPtr;
+ resPtr += blockSize;
+ }
+ return FL_GROUP_SIZE;
+ }
+ buf[0] = (char*)SlowLFAlloc(nSizeIdx, blockSize, MEM_DEFRAG);
+ return 1;
+}
+
+// take several blocks from global free list (max FL_GROUP_SIZE blocks), returns number of blocks taken
+// buf should have space for at least FL_GROUP_SIZE elems
+static Y_FORCE_INLINE int TakeBlocksFromGlobalFreeList(int nSizeIdx, char** buf) {
+ TLFAllocFreeList& fl = globalFreeLists[nSizeIdx];
+ TFreeListGroup* g = (TFreeListGroup*)fl.Alloc();
+ if (g) {
+ int resCount = 0;
+ for (auto& ptr : g->Ptrs) {
+ if (ptr)
+ buf[resCount++] = ptr;
+ else
+ break;
+ }
+ blockFreeList.Free(g);
+ return resCount;
+ }
+ return 0;
+}
+
+// add several blocks to global free list
+static Y_FORCE_INLINE void PutBlocksToGlobalFreeList(ptrdiff_t nSizeIdx, char** buf, int count) {
+ for (int startIdx = 0; startIdx < count;) {
+ TFreeListGroup* g = (TFreeListGroup*)blockFreeList.Alloc();
+ Y_ASSERT_NOBT(sizeof(TFreeListGroup) == nSizeIdxToSize[FREE_LIST_GROUP_SIZEIDX]);
+ if (!g) {
+ g = (TFreeListGroup*)LFAllocNoCache(FREE_LIST_GROUP_SIZEIDX, NO_MEM_DEFRAG);
+ }
+
+ int groupSize = count - startIdx;
+ if (groupSize > FL_GROUP_SIZE)
+ groupSize = FL_GROUP_SIZE;
+ for (int i = 0; i < groupSize; ++i)
+ g->Ptrs[i] = buf[startIdx + i];
+ for (int i = groupSize; i < FL_GROUP_SIZE; ++i)
+ g->Ptrs[i] = nullptr;
+
+ // add free group to the global list
+ TLFAllocFreeList& fl = globalFreeLists[nSizeIdx];
+ fl.Free(g);
+
+ startIdx += groupSize;
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+static TAtomic GlobalCounters[CT_MAX];
+const int MAX_LOCAL_UPDATES = 100;
+const intptr_t MAX_LOCAL_DELTA = 1*1024*1024;
+
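+// Per-thread counter that batches updates: the shared atomic in GlobalCounters is touched at
+// most once per MAX_LOCAL_UPDATES increments, or earlier once MAX_LOCAL_DELTA bytes have
+// accumulated. E.g. a thread doing ~1M counted allocations performs only ~10K atomic adds.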
+struct TLocalCounter {
+ intptr_t Value;
+ int Updates;
+ TAtomic* Parent;
+
+ Y_FORCE_INLINE void Init(TAtomic* parent) {
+ Parent = parent;
+ Value = 0;
+ Updates = 0;
+ }
+
+ Y_FORCE_INLINE void Increment(size_t value) {
+ Value += value;
+ if (++Updates > MAX_LOCAL_UPDATES || Value > MAX_LOCAL_DELTA) {
+ Flush();
+ }
+ }
+
+ Y_FORCE_INLINE void Flush() {
+ AtomicAdd(*Parent, Value);
+ Value = 0;
+ Updates = 0;
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// DBG stuff
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(LFALLOC_DBG)
+
+struct TPerTagAllocCounter {
+ TAtomic Size;
+ TAtomic Count;
+
+ Y_FORCE_INLINE void Alloc(size_t size) {
+ AtomicAdd(Size, size);
+ AtomicAdd(Count, 1);
+ }
+
+ Y_FORCE_INLINE void Free(size_t size) {
+ AtomicSub(Size, size);
+ AtomicSub(Count, 1);
+ }
+};
+
+struct TLocalPerTagAllocCounter {
+ intptr_t Size;
+ int Count;
+ int Updates;
+
+ Y_FORCE_INLINE void Init() {
+ Size = 0;
+ Count = 0;
+ Updates = 0;
+ }
+
+ Y_FORCE_INLINE void Alloc(TPerTagAllocCounter& parent, size_t size) {
+ Size += size;
+ ++Count;
+ if (++Updates > MAX_LOCAL_UPDATES) {
+ Flush(parent);
+ }
+ }
+
+ Y_FORCE_INLINE void Free(TPerTagAllocCounter& parent, size_t size) {
+ Size -= size;
+ --Count;
+ if (++Updates > MAX_LOCAL_UPDATES) {
+ Flush(parent);
+ }
+ }
+
+ Y_FORCE_INLINE void Flush(TPerTagAllocCounter& parent) {
+ AtomicAdd(parent.Size, Size);
+ Size = 0;
+ AtomicAdd(parent.Count, Count);
+ Count = 0;
+ Updates = 0;
+ }
+};
+
+static const int DBG_ALLOC_MAX_TAG = 1000;
+static const int DBG_ALLOC_ALIGNED_TAG = 0xF0000000;
+static const int DBG_ALLOC_NUM_SIZES = 30;
+static TPerTagAllocCounter GlobalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES];
+
+#endif // LFALLOC_DBG
+
+//////////////////////////////////////////////////////////////////////////
+const int THREAD_BUF = 256;
+static int borderSizes[N_SIZES];
+const int MAX_MEM_PER_SIZE_PER_THREAD = 512 * 1024;
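+// Per-thread allocation cache: each thread keeps a small stack of free block pointers per size
+// class; borderSizes[k] limits the cached memory per size class to roughly
+// MAX_MEM_PER_SIZE_PER_THREAD bytes, and the overflow is spilled to the global free lists.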
+struct TThreadAllocInfo {
+ // FreePtrs - per size class stack of cached free block pointers; valid entries occupy
+ // indices [FreePtrIndex[sizeIdx], THREAD_BUF), i.e. FreePtrIndex == THREAD_BUF means the list is empty
+ char* FreePtrs[N_SIZES][THREAD_BUF];
+ int FreePtrIndex[N_SIZES];
+ TThreadAllocInfo* pNextInfo;
+ TLocalCounter LocalCounters[CT_MAX];
+
+#if defined(LFALLOC_DBG)
+ TLocalPerTagAllocCounter LocalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES];
+#endif
+#ifdef _win_
+ HANDLE hThread;
+#endif
+
+ void Init(TThreadAllocInfo** pHead) {
+ memset(this, 0, sizeof(*this));
+ for (auto& i : FreePtrIndex)
+ i = THREAD_BUF;
+#ifdef _win_
+ BOOL b = DuplicateHandle(
+ GetCurrentProcess(), GetCurrentThread(),
+ GetCurrentProcess(), &hThread,
+ 0, FALSE, DUPLICATE_SAME_ACCESS);
+ Y_ASSERT_NOBT(b);
+#endif
+ pNextInfo = *pHead;
+ *pHead = this;
+ for (int k = 0; k < N_SIZES; ++k) {
+ int maxCount = MAX_MEM_PER_SIZE_PER_THREAD / nSizeIdxToSize[k];
+ if (maxCount > THREAD_BUF)
+ maxCount = THREAD_BUF;
+ borderSizes[k] = THREAD_BUF - maxCount;
+ }
+ for (int i = 0; i < CT_MAX; ++i) {
+ LocalCounters[i].Init(&GlobalCounters[i]);
+ }
+#if defined(LFALLOC_DBG)
+ for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
+ for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
+ auto& local = LocalPerTagAllocCounters[tag][sizeIdx];
+ local.Init();
+ }
+ }
+#endif
+ }
+ void Done() {
+ for (auto sizeIdx : FreePtrIndex) {
+ Y_ASSERT_NOBT(sizeIdx == THREAD_BUF);
+ }
+ for (auto& localCounter : LocalCounters) {
+ localCounter.Flush();
+ }
+#if defined(LFALLOC_DBG)
+ for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
+ for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
+ auto& local = LocalPerTagAllocCounters[tag][sizeIdx];
+ auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
+ local.Flush(global);
+ }
+ }
+#endif
+#ifdef _win_
+ if (hThread)
+ CloseHandle(hThread);
+#endif
+ }
+};
+PERTHREAD TThreadAllocInfo* pThreadInfo;
+static TThreadAllocInfo* pThreadInfoList;
+
+static TLFLockData LFLockThreadInfo;
+
+static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value) {
+#ifdef LFALLOC_YT
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (thr) {
+ thr->LocalCounters[counter].Increment(value);
+ } else {
+ AtomicAdd(GlobalCounters[counter], value);
+ }
+#endif
+}
+
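+// Counter read API. Illustrative usage (counters are maintained only in LFALLOC_YT builds,
+// both functions return 0 otherwise):
+//     i64 userAlloc = GetLFAllocCounterFull(CT_USER_ALLOC);
+//     i64 slowAllocs = GetLFAllocCounterFull(CT_SLOW_ALLOC_CNT);
+// GetLFAllocCounterFast reads only the global part and may lag behind by whatever is still
+// buffered in the per-thread TLocalCounter instances.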
+extern "C" i64 GetLFAllocCounterFast(int counter) {
+#ifdef LFALLOC_YT
+ return GlobalCounters[counter];
+#else
+ return 0;
+#endif
+}
+
+extern "C" i64 GetLFAllocCounterFull(int counter) {
+#ifdef LFALLOC_YT
+ i64 ret = GlobalCounters[counter];
+ {
+ TLFLockHolder ll(&LFLockThreadInfo);
+ for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
+ TThreadAllocInfo* pInfo = *p;
+ ret += pInfo->LocalCounters[counter].Value;
+ p = &pInfo->pNextInfo;
+ }
+ }
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+static void MoveSingleThreadFreeToGlobal(TThreadAllocInfo* pInfo) {
+ for (int sizeIdx = 0; sizeIdx < N_SIZES; ++sizeIdx) {
+ int& freePtrIdx = pInfo->FreePtrIndex[sizeIdx];
+ char** freePtrs = pInfo->FreePtrs[sizeIdx];
+ PutBlocksToGlobalFreeList(sizeIdx, freePtrs + freePtrIdx, THREAD_BUF - freePtrIdx);
+ freePtrIdx = THREAD_BUF;
+ }
+}
+
+#ifdef _win_
+static bool IsDeadThread(TThreadAllocInfo* pInfo) {
+ DWORD dwExit;
+ bool isDead = !GetExitCodeThread(pInfo->hThread, &dwExit) || dwExit != STILL_ACTIVE;
+ return isDead;
+}
+
+static void CleanupAfterDeadThreads() {
+ TLFLockHolder ll(&LFLockThreadInfo);
+ for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
+ TThreadAllocInfo* pInfo = *p;
+ if (IsDeadThread(pInfo)) {
+ MoveSingleThreadFreeToGlobal(pInfo);
+ pInfo->Done();
+ *p = pInfo->pNextInfo;
+ SystemFree(pInfo);
+ } else
+ p = &pInfo->pNextInfo;
+ }
+}
+#endif
+
+#ifndef _win_
+static pthread_key_t ThreadCacheCleaner;
+static void* volatile ThreadCacheCleanerStarted; // 0 = not started, -1 = started, -2 = is starting
+static PERTHREAD bool IsStoppingThread;
+
+static void FreeThreadCache(void*) {
+ TThreadAllocInfo* pToDelete = nullptr;
+ {
+ TLFLockHolder ll(&LFLockThreadInfo);
+ pToDelete = pThreadInfo;
+ if (pToDelete == nullptr)
+ return;
+
+ // remove from the list
+ for (TThreadAllocInfo** p = &pThreadInfoList; *p; p = &(*p)->pNextInfo) {
+ if (*p == pToDelete) {
+ *p = pToDelete->pNextInfo;
+ break;
+ }
+ }
+ IsStoppingThread = true;
+ pThreadInfo = nullptr;
+ }
+
+ // free per thread buf
+ MoveSingleThreadFreeToGlobal(pToDelete);
+ pToDelete->Done();
+ SystemFree(pToDelete);
+}
+#endif
+
+static void AllocThreadInfo() {
+#ifndef _win_
+ if (DoCas(&ThreadCacheCleanerStarted, (void*)-2, (void*)nullptr) == (void*)nullptr) {
+ pthread_key_create(&ThreadCacheCleaner, FreeThreadCache);
+ ThreadCacheCleanerStarted = (void*)-1;
+ }
+ if (ThreadCacheCleanerStarted != (void*)-1)
+ return; // do not use ThreadCacheCleaner until it is constructed
+
+ {
+ if (IsStoppingThread)
+ return;
+ TLFLockHolder ll(&LFLockThreadInfo);
+ if (IsStoppingThread) // better safe than sorry
+ return;
+
+ pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo));
+ pThreadInfo->Init(&pThreadInfoList);
+ }
+ pthread_setspecific(ThreadCacheCleaner, (void*)-1); // the destructor is only called for keys with a non-null value
+#else
+ CleanupAfterDeadThreads();
+ {
+ pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo));
+ TLFLockHolder ll(&LFLockThreadInfo);
+ pThreadInfo->Init(&pThreadInfoList);
+ }
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// DBG stuff
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(LFALLOC_DBG)
+
+struct TAllocHeader {
+ uint64_t Size;
+ int Tag;
+ int Cookie;
+};
+
+// should be power of 2
+static_assert(sizeof(TAllocHeader) == 16);
+
+static inline void* GetAllocPtr(TAllocHeader* p) {
+ return p + 1;
+}
+
+static inline TAllocHeader* GetAllocHeader(void* p) {
+ auto* header = ((TAllocHeader*)p) - 1;
+ if (header->Tag == DBG_ALLOC_ALIGNED_TAG) {
+ return (TAllocHeader*)header->Size;
+ }
+
+ return header;
+}
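+
+// For blocks returned by LFPosixMemalign the header right before the user pointer is a stub
+// tagged DBG_ALLOC_ALIGNED_TAG whose Size field stores the address of the real TAllocHeader
+// at the start of the underlying allocation; GetAllocHeader() follows that indirection.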
+
+PERTHREAD int AllocationTag;
+extern "C" int SetThreadAllocTag(int tag) {
+ int prevTag = AllocationTag;
+ if (tag < DBG_ALLOC_MAX_TAG && tag >= 0) {
+ AllocationTag = tag;
+ }
+ return prevTag;
+}
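+
+// Illustrative usage of SetThreadAllocTag (not part of the original source): account a scope
+// of allocations to a caller-chosen tag, where MY_SUBSYSTEM_TAG is a hypothetical id in
+// [0, DBG_ALLOC_MAX_TAG):
+//     int prevTag = SetThreadAllocTag(MY_SUBSYSTEM_TAG);
+//     ... allocations made by this thread are now counted under MY_SUBSYSTEM_TAG ...
+//     SetThreadAllocTag(prevTag);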
+
+PERTHREAD bool ProfileCurrentThread;
+extern "C" bool SetProfileCurrentThread(bool newVal) {
+ bool prevVal = ProfileCurrentThread;
+ ProfileCurrentThread = newVal;
+ return prevVal;
+}
+
+static volatile bool ProfileAllThreads;
+extern "C" bool SetProfileAllThreads(bool newVal) {
+ bool prevVal = ProfileAllThreads;
+ ProfileAllThreads = newVal;
+ return prevVal;
+}
+
+static volatile bool AllocationSamplingEnabled;
+extern "C" bool SetAllocationSamplingEnabled(bool newVal) {
+ bool prevVal = AllocationSamplingEnabled;
+ AllocationSamplingEnabled = newVal;
+ return prevVal;
+}
+
+static size_t AllocationSampleRate = 1000;
+extern "C" size_t SetAllocationSampleRate(size_t newVal) {
+ size_t prevVal = AllocationSampleRate;
+ AllocationSampleRate = newVal;
+ return prevVal;
+}
+
+static size_t AllocationSampleMaxSize = N_MAX_FAST_SIZE;
+extern "C" size_t SetAllocationSampleMaxSize(size_t newVal) {
+ size_t prevVal = AllocationSampleMaxSize;
+ AllocationSampleMaxSize = newVal;
+ return prevVal;
+}
+
+using TAllocationCallback = int(int tag, size_t size, int sizeIdx);
+static TAllocationCallback* AllocationCallback;
+extern "C" TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) {
+ TAllocationCallback* prevVal = AllocationCallback;
+ AllocationCallback = newVal;
+ return prevVal;
+}
+
+using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx);
+static TDeallocationCallback* DeallocationCallback;
+extern "C" TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) {
+ TDeallocationCallback* prevVal = DeallocationCallback;
+ DeallocationCallback = newVal;
+ return prevVal;
+}
+
+PERTHREAD TAtomic AllocationsCount;
+PERTHREAD bool InAllocationCallback;
+
+static const int DBG_ALLOC_INVALID_COOKIE = -1;
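+// An allocation is reported to AllocationCallback when sampling is enabled for this thread
+// (or for all threads) and either its size exceeds AllocationSampleMaxSize or it is the
+// AllocationSampleRate-th allocation of the thread; the cookie returned by the callback is
+// stored in the header so the matching deallocation can be reported as well.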
+static inline int SampleAllocation(TAllocHeader* p, int sizeIdx) {
+ int cookie = DBG_ALLOC_INVALID_COOKIE;
+ if (AllocationSamplingEnabled && (ProfileCurrentThread || ProfileAllThreads) && !InAllocationCallback) {
+ if (p->Size > AllocationSampleMaxSize || ++AllocationsCount % AllocationSampleRate == 0) {
+ if (AllocationCallback) {
+ InAllocationCallback = true;
+ cookie = AllocationCallback(p->Tag, p->Size, sizeIdx);
+ InAllocationCallback = false;
+ }
+ }
+ }
+ return cookie;
+}
+
+static inline void SampleDeallocation(TAllocHeader* p, int sizeIdx) {
+ if (p->Cookie != DBG_ALLOC_INVALID_COOKIE && !InAllocationCallback) {
+ if (DeallocationCallback) {
+ InAllocationCallback = true;
+ DeallocationCallback(p->Cookie, p->Tag, p->Size, sizeIdx);
+ InAllocationCallback = false;
+ }
+ }
+}
+
+static inline void TrackPerTagAllocation(TAllocHeader* p, int sizeIdx) {
+ if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) {
+ Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES);
+ auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx];
+
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (thr) {
+ auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx];
+ local.Alloc(global, p->Size);
+ } else {
+ global.Alloc(p->Size);
+ }
+ }
+}
+
+static inline void TrackPerTagDeallocation(TAllocHeader* p, int sizeIdx) {
+ if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) {
+ Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES);
+ auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx];
+
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (thr) {
+ auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx];
+ local.Free(global, p->Size);
+ } else {
+ global.Free(p->Size);
+ }
+ }
+}
+
+static void* TrackAllocation(void* ptr, size_t size, int sizeIdx) {
+ TAllocHeader* p = (TAllocHeader*)ptr;
+ p->Size = size;
+ p->Tag = AllocationTag;
+ p->Cookie = SampleAllocation(p, sizeIdx);
+ TrackPerTagAllocation(p, sizeIdx);
+ return GetAllocPtr(p);
+}
+
+static void TrackDeallocation(void* ptr, int sizeIdx) {
+ TAllocHeader* p = (TAllocHeader*)ptr;
+ SampleDeallocation(p, sizeIdx);
+ TrackPerTagDeallocation(p, sizeIdx);
+}
+
+struct TPerTagAllocInfo {
+ ssize_t Count;
+ ssize_t Size;
+};
+
+extern "C" void GetPerTagAllocInfo(
+ bool flushPerThreadCounters,
+ TPerTagAllocInfo* info,
+ int& maxTag,
+ int& numSizes) {
+ maxTag = DBG_ALLOC_MAX_TAG;
+ numSizes = DBG_ALLOC_NUM_SIZES;
+
+ if (info) {
+ if (flushPerThreadCounters) {
+ TLFLockHolder ll(&LFLockThreadInfo);
+ for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
+ TThreadAllocInfo* pInfo = *p;
+ for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
+ for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
+ auto& local = pInfo->LocalPerTagAllocCounters[tag][sizeIdx];
+ auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
+ local.Flush(global);
+ }
+ }
+ p = &pInfo->pNextInfo;
+ }
+ }
+
+ for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
+ for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
+ auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
+ auto& res = info[tag * DBG_ALLOC_NUM_SIZES + sizeIdx];
+ res.Count = global.Count;
+ res.Size = global.Size;
+ }
+ }
+ }
+}
+
+#endif // LFALLOC_DBG
+
+//////////////////////////////////////////////////////////////////////////
+static Y_FORCE_INLINE void* LFAllocImpl(size_t _nSize) {
+#if defined(LFALLOC_DBG)
+ size_t size = _nSize;
+ _nSize += sizeof(TAllocHeader);
+#endif
+
+ IncrementCounter(CT_USER_ALLOC, _nSize);
+
+ int nSizeIdx;
+ if (_nSize > 512) {
+ if (_nSize > N_MAX_FAST_SIZE) {
+ void* ptr = LargeBlockAlloc(_nSize, CT_LARGE_ALLOC);
+#if defined(LFALLOC_DBG)
+ ptr = TrackAllocation(ptr, size, N_SIZES);
+#endif
+ return ptr;
+ }
+ nSizeIdx = size2idxArr2[(_nSize - 1) >> 8];
+ } else
+ nSizeIdx = size2idxArr1[1 + (((int)_nSize - 1) >> 3)];
+
+ IncrementCounter(CT_SMALL_ALLOC, nSizeIdxToSize[nSizeIdx]);
+
+ // check per thread buffer
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (!thr) {
+ AllocThreadInfo();
+ thr = pThreadInfo;
+ if (!thr) {
+ void* ptr = LFAllocNoCache(nSizeIdx, MEM_DEFRAG);
+#if defined(LFALLOC_DBG)
+ ptr = TrackAllocation(ptr, size, nSizeIdx);
+#endif
+ return ptr;
+ }
+ }
+ {
+ int& freePtrIdx = thr->FreePtrIndex[nSizeIdx];
+ if (freePtrIdx < THREAD_BUF) {
+ void* ptr = thr->FreePtrs[nSizeIdx][freePtrIdx++];
+#if defined(LFALLOC_DBG)
+ ptr = TrackAllocation(ptr, size, nSizeIdx);
+#endif
+ return ptr;
+ }
+
+ // try to alloc from global free list
+ char* buf[FL_GROUP_SIZE];
+ int count = TakeBlocksFromGlobalFreeList(nSizeIdx, buf);
+ if (count == 0) {
+ count = LFAllocNoCacheMultiple(nSizeIdx, buf);
+ if (count == 0) {
+ NMalloc::AbortFromCorruptedAllocator("no way LFAllocNoCacheMultiple() can fail");
+ }
+ }
+ char** dstBuf = thr->FreePtrs[nSizeIdx] + freePtrIdx - 1;
+ for (int i = 0; i < count - 1; ++i)
+ dstBuf[-i] = buf[i];
+ freePtrIdx -= count - 1;
+ void* ptr = buf[count - 1];
+#if defined(LFALLOC_DBG)
+ ptr = TrackAllocation(ptr, size, nSizeIdx);
+#endif
+ return ptr;
+ }
+}
+
+static Y_FORCE_INLINE void* LFAlloc(size_t _nSize) {
+ void* res = LFAllocImpl(_nSize);
+#ifdef DBG_FILL_MEMORY
+ if (FillMemoryOnAllocation && res && (_nSize <= DBG_FILL_MAX_SIZE)) {
+ memset(res, 0xcf, _nSize);
+ }
+#endif
+ return res;
+}
+
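+// Free path: pointers outside the chunked arena (including nullptr, which the unsigned wrap of
+// chkOffset sends down the same branch) and pointers into chunks without a small size class
+// are treated as large-block frees; everything else goes to the per-thread cache or, once the
+// cache is full, is spilled to the global free lists.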
+static Y_FORCE_INLINE void LFFree(void* p) {
+#if defined(LFALLOC_DBG)
+ if (p == nullptr)
+ return;
+ p = GetAllocHeader(p);
+#endif
+
+ uintptr_t chkOffset = ((char*)p - ALLOC_START) - 1ll;
+ if (chkOffset >= N_MAX_WORKSET_SIZE) {
+ if (p == nullptr)
+ return;
+#if defined(LFALLOC_DBG)
+ TrackDeallocation(p, N_SIZES);
+#endif
+ LargeBlockFree(p, CT_LARGE_FREE);
+ return;
+ }
+
+ uintptr_t chunk = ((char*)p - ALLOC_START) / N_CHUNK_SIZE;
+ ptrdiff_t nSizeIdx = chunkSizeIdx[chunk];
+ if (nSizeIdx <= 0) {
+#if defined(LFALLOC_DBG)
+ TrackDeallocation(p, N_SIZES);
+#endif
+ LargeBlockFree(p, CT_LARGE_FREE);
+ return;
+ }
+
+#if defined(LFALLOC_DBG)
+ TrackDeallocation(p, nSizeIdx);
+#endif
+
+#ifdef DBG_FILL_MEMORY
+ memset(p, 0xfe, nSizeIdxToSize[nSizeIdx]);
+#endif
+
+ IncrementCounter(CT_SMALL_FREE, nSizeIdxToSize[nSizeIdx]);
+
+ // try to store info to per thread buf
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (thr) {
+ int& freePtrIdx = thr->FreePtrIndex[nSizeIdx];
+ if (freePtrIdx > borderSizes[nSizeIdx]) {
+ thr->FreePtrs[nSizeIdx][--freePtrIdx] = (char*)p;
+ return;
+ }
+
+ // move several pointers to global free list
+ int freeCount = FL_GROUP_SIZE;
+ if (freeCount > THREAD_BUF - freePtrIdx)
+ freeCount = THREAD_BUF - freePtrIdx;
+ char** freePtrs = thr->FreePtrs[nSizeIdx];
+ PutBlocksToGlobalFreeList(nSizeIdx, freePtrs + freePtrIdx, freeCount);
+ freePtrIdx += freeCount;
+
+ freePtrs[--freePtrIdx] = (char*)p;
+
+ } else {
+ AllocThreadInfo();
+ PutBlocksToGlobalFreeList(nSizeIdx, (char**)&p, 1);
+ }
+}
+
+static size_t LFGetSize(const void* p) {
+#if defined(LFALLOC_DBG)
+ if (p == nullptr)
+ return 0;
+ return GetAllocHeader(const_cast<void*>(p))->Size;
+#endif
+
+ uintptr_t chkOffset = ((const char*)p - ALLOC_START);
+ if (chkOffset >= N_MAX_WORKSET_SIZE) {
+ if (p == nullptr)
+ return 0;
+ return TLargeBlk::As(p)->Pages * 4096ll;
+ }
+ uintptr_t chunk = ((const char*)p - ALLOC_START) / N_CHUNK_SIZE;
+ ptrdiff_t nSizeIdx = chunkSizeIdx[chunk];
+ if (nSizeIdx <= 0)
+ return TLargeBlk::As(p)->Pages * 4096ll;
+ return nSizeIdxToSize[nSizeIdx];
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Output mem alloc stats
+const int N_PAGE_SIZE = 4096;
+static void DebugTraceMMgr(const char* pszFormat, ...) // __cdecl
+{
+ static char buff[20000];
+ va_list va;
+ //
+ va_start(va, pszFormat);
+ vsnprintf(buff, sizeof(buff), pszFormat, va);
+ va_end(va);
+//
+#ifdef _win_
+ OutputDebugStringA(buff);
+#else
+ fputs(buff, stderr);
+#endif
+}
+
+struct TChunkStats {
+ char *Start, *Finish;
+ i64 Size;
+ char* Entries;
+ i64 FreeCount;
+
+ TChunkStats(size_t chunk, i64 size, char* entries)
+ : Size(size)
+ , Entries(entries)
+ , FreeCount(0)
+ {
+ Start = ALLOC_START + chunk * N_CHUNK_SIZE;
+ Finish = Start + N_CHUNK_SIZE;
+ }
+ void CheckBlock(char* pBlock) {
+ if (pBlock && pBlock >= Start && pBlock < Finish) {
+ ++FreeCount;
+ i64 nShift = pBlock - Start;
+ i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1);
+ Entries[nOffsetInStep / Size] = 1;
+ }
+ }
+ void SetGlobalFree(char* ptr) {
+ i64 nShift = ptr - Start;
+ i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1);
+ while (nOffsetInStep + Size <= N_CHUNK_SIZE) {
+ ++FreeCount;
+ Entries[nOffsetInStep / Size] = 1;
+ nOffsetInStep += Size;
+ }
+ }
+};
+
+static void DumpMemoryBlockUtilizationLocked() {
+ TFreeListGroup* wholeLists[N_SIZES];
+ for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) {
+ wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList();
+ }
+ char* bfList = (char*)blockFreeList.GetWholeList();
+
+ DebugTraceMMgr("memory blocks utilisation stats:\n");
+ i64 nTotalAllocated = 0, nTotalFree = 0, nTotalBadPages = 0, nTotalPages = 0, nTotalUsed = 0, nTotalLocked = 0;
+ i64 nTotalGroupBlocks = 0;
+ char* entries;
+ entries = (char*)SystemAlloc((N_CHUNK_SIZE / 4));
+ for (size_t k = 0; k < N_CHUNKS; ++k) {
+ if (chunkSizeIdx[k] <= 0) {
+ if (chunkSizeIdx[k] == -1)
+ nTotalLocked += N_CHUNK_SIZE;
+ continue;
+ }
+ i64 nSizeIdx = chunkSizeIdx[k];
+ i64 nSize = nSizeIdxToSize[nSizeIdx];
+ TChunkStats cs(k, nSize, entries);
+ int nEntriesTotal = N_CHUNK_SIZE / nSize;
+ memset(entries, 0, nEntriesTotal);
+ for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) {
+ for (auto& ptr : g->Ptrs)
+ cs.CheckBlock(ptr);
+ }
+ TChunkStats csGB(k, nSize, entries);
+ if (nSizeIdx == FREE_LIST_GROUP_SIZEIDX) {
+ for (auto g : wholeLists) {
+ for (; g; g = g->Next)
+ csGB.CheckBlock((char*)g);
+ }
+ for (char* blk = bfList; blk; blk = *(char**)blk)
+ csGB.CheckBlock(blk);
+ nTotalGroupBlocks += csGB.FreeCount * nSize;
+ }
+ if (((globalCurrentPtr[nSizeIdx] - ALLOC_START) / N_CHUNK_SIZE) == k)
+ cs.SetGlobalFree(globalCurrentPtr[nSizeIdx]);
+ nTotalUsed += (nEntriesTotal - cs.FreeCount - csGB.FreeCount) * nSize;
+
+ char pages[N_CHUNK_SIZE / N_PAGE_SIZE];
+ memset(pages, 0, sizeof(pages));
+ for (int i = 0, nShift = 0; i < nEntriesTotal; ++i, nShift += nSize) {
+ int nBit = 0;
+ if (entries[i])
+ nBit = 1; // free entry
+ else
+ nBit = 2; // used entry
+ for (i64 nDelta = nSize - 1; nDelta >= 0; nDelta -= N_PAGE_SIZE)
+ pages[(nShift + nDelta) / N_PAGE_SIZE] |= nBit;
+ }
+ i64 nBadPages = 0;
+ for (auto page : pages) {
+ nBadPages += page == 3;
+ nTotalPages += page != 1;
+ }
+ DebugTraceMMgr("entry = %lld; size = %lld; free = %lld; system %lld; utilisation = %g%%, fragmentation = %g%%\n",
+ k, nSize, cs.FreeCount * nSize, csGB.FreeCount * nSize,
+ (N_CHUNK_SIZE - cs.FreeCount * nSize) * 100.0f / N_CHUNK_SIZE, 100.0f * nBadPages / Y_ARRAY_SIZE(pages));
+ nTotalAllocated += N_CHUNK_SIZE;
+ nTotalFree += cs.FreeCount * nSize;
+ nTotalBadPages += nBadPages;
+ }
+ SystemFree(entries);
+ DebugTraceMMgr("Total allocated = %llu, free = %lld, system = %lld, locked for future use %lld, utilisation = %g, fragmentation = %g\n",
+ nTotalAllocated, nTotalFree, nTotalGroupBlocks, nTotalLocked,
+ 100.0f * (nTotalAllocated - nTotalFree) / nTotalAllocated, 100.0f * nTotalBadPages / nTotalPages);
+ DebugTraceMMgr("Total %lld bytes used, %lld bytes in used pages\n", nTotalUsed, nTotalPages * N_PAGE_SIZE);
+
+ for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx)
+ globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]);
+ blockFreeList.ReturnWholeList(bfList);
+}
+
+void FlushThreadFreeList() {
+ if (pThreadInfo)
+ MoveSingleThreadFreeToGlobal(pThreadInfo);
+}
+
+void DumpMemoryBlockUtilization() {
+ // move current thread free to global lists to get better statistics
+ FlushThreadFreeList();
+ {
+ TLFLockHolder ls(&LFGlobalLock);
+ DumpMemoryBlockUtilizationLocked();
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// malloc api
+
+static bool LFAlloc_SetParam(const char* param, const char* value) {
+ if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE")) {
+ LB_LIMIT_TOTAL_SIZE = atoi(value);
+ return true;
+ }
+ if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE_BYTES")) {
+ LB_LIMIT_TOTAL_SIZE = (atoi(value) + N_PAGE_SIZE - 1) / N_PAGE_SIZE;
+ return true;
+ }
+#ifdef DBG_FILL_MEMORY
+ if (!strcmp(param, "FillMemoryOnAllocation")) {
+ FillMemoryOnAllocation = !strcmp(value, "true");
+ return true;
+ }
+#endif
+ if (!strcmp(param, "TransparentHugePages")) {
+ TransparentHugePages = !strcmp(value, "true");
+ return true;
+ }
+ if (!strcmp(param, "MapHugeTLB")) {
+ MapHugeTLB = !strcmp(value, "true");
+ return true;
+ }
+ if (!strcmp(param, "EnableDefrag")) {
+ EnableDefrag = !strcmp(value, "true");
+ return true;
+ }
+ return false;
+}
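+
+// Illustrative usage (assuming the process-wide malloc API from library/cpp/malloc/api is
+// backed by this allocator and forwards to LFAlloc_SetParam):
+//     auto info = NMalloc::MallocInfo();
+//     if (info.SetParam) {
+//         info.SetParam("EnableDefrag", "false");
+//     }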
+
+static const char* LFAlloc_GetParam(const char* param) {
+ struct TParam {
+ const char* Name;
+ const char* Value;
+ };
+
+ static const TParam Params[] = {
+ {"GetLFAllocCounterFast", (const char*)&GetLFAllocCounterFast},
+ {"GetLFAllocCounterFull", (const char*)&GetLFAllocCounterFull},
+#if defined(LFALLOC_DBG)
+ {"SetThreadAllocTag", (const char*)&SetThreadAllocTag},
+ {"SetProfileCurrentThread", (const char*)&SetProfileCurrentThread},
+ {"SetProfileAllThreads", (const char*)&SetProfileAllThreads},
+ {"SetAllocationSamplingEnabled", (const char*)&SetAllocationSamplingEnabled},
+ {"SetAllocationSampleRate", (const char*)&SetAllocationSampleRate},
+ {"SetAllocationSampleMaxSize", (const char*)&SetAllocationSampleMaxSize},
+ {"SetAllocationCallback", (const char*)&SetAllocationCallback},
+ {"SetDeallocationCallback", (const char*)&SetDeallocationCallback},
+ {"GetPerTagAllocInfo", (const char*)&GetPerTagAllocInfo},
+#endif // LFALLOC_DBG
+ };
+
+ for (int i = 0; i < Y_ARRAY_SIZE(Params); ++i) {
+ if (strcmp(param, Params[i].Name) == 0) {
+ return Params[i].Value;
+ }
+ }
+ return nullptr;
+}
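+
+// Besides configuration values, GetParam is used to export function pointers: a debug or
+// profiling client looks up a name and casts the returned const char* back to the function
+// type. A rough sketch (assumes an LFALLOC_DBG build and library/cpp/malloc/api):
+//     using TSetThreadAllocTag = int(int tag);
+//     auto setTag = reinterpret_cast<TSetThreadAllocTag*>(
+//         NMalloc::MallocInfo().GetParam("SetThreadAllocTag"));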
+
+static Y_FORCE_INLINE int LFPosixMemalign(void** memptr, size_t alignment, size_t size) {
+ if (Y_UNLIKELY(alignment > 4096)) {
+ const char* error = "Larger alignments are not guaranteed with this implementation\n";
+#ifdef _win_
+ OutputDebugStringA(error);
+#endif
+ NMalloc::AbortFromCorruptedAllocator(error);
+ }
+ size_t bigsize = size;
+ if (bigsize <= alignment) {
+ bigsize = alignment;
+ } else if (bigsize < 2 * alignment) {
+ bigsize = 2 * alignment;
+ }
+#if defined(LFALLOC_DBG)
+ if (alignment > sizeof(TAllocHeader)) {
+ bigsize += alignment;
+ }
+#endif
+
+ *memptr = LFAlloc(bigsize);
+
+#if defined(LFALLOC_DBG)
+ if (alignment > sizeof(TAllocHeader)) {
+ // memptr isn't aligned due to alloc header
+ const auto* header = GetAllocHeader(*memptr);
+ *memptr = (void*)((const char*) (*memptr) + alignment - sizeof(TAllocHeader));
+
+ // make fake header to retrieve original header ptr on dealloc
+ auto* next = GetAllocHeader(*memptr);
+ next->Tag = DBG_ALLOC_ALIGNED_TAG;
+ next->Size = (uint64_t)header;
+ next->Cookie = 0;
+ }
+#endif
+
+ Y_ASSERT_NOBT((intptr_t)*memptr % alignment == 0);
+ return 0;
+}
+
+static Y_FORCE_INLINE void* LFVAlloc(size_t size) {
+ const size_t pg = N_PAGE_SIZE;
+ void* p = nullptr;
+
+#if defined(LFALLOC_DBG)
+ LFPosixMemalign(&p, pg, size);
+#else
+ size_t bigsize = (size + pg - 1) & (~(pg - 1));
+ p = LFAlloc(bigsize);
+#endif
+
+ Y_ASSERT_NOBT((intptr_t)p % N_PAGE_SIZE == 0);
+ return p;
+}
+
+#endif
diff --git a/library/cpp/lfalloc/ya.make b/library/cpp/lfalloc/ya.make
new file mode 100644
index 0000000000..cace05f9d8
--- /dev/null
+++ b/library/cpp/lfalloc/ya.make
@@ -0,0 +1,25 @@
+LIBRARY()
+
+OWNER(gulin)
+
+NO_UTIL()
+
+NO_COMPILER_WARNINGS()
+
+IF (ARCH_AARCH64)
+ PEERDIR(
+ contrib/libs/jemalloc
+ )
+ELSE()
+ SRCS(
+ lf_allocX64.cpp
+ )
+ENDIF()
+
+PEERDIR(
+ library/cpp/malloc/api
+)
+
+SET(IDE_FOLDER "util")
+
+END()
diff --git a/library/cpp/lfalloc/yt/ya.make b/library/cpp/lfalloc/yt/ya.make
new file mode 100644
index 0000000000..8c1a4f8a72
--- /dev/null
+++ b/library/cpp/lfalloc/yt/ya.make
@@ -0,0 +1,29 @@
+LIBRARY()
+
+OWNER(a-romanov)
+
+NO_UTIL()
+
+NO_COMPILER_WARNINGS()
+
+IF (ARCH_AARCH64)
+ PEERDIR(
+ contrib/libs/jemalloc
+ )
+ELSE()
+ IF ("${YMAKE}" MATCHES "devtools")
+ CFLAGS(-DYMAKE=1)
+ ENDIF()
+ CXXFLAGS(-DLFALLOC_YT)
+ SRCS(
+ ../lf_allocX64.cpp
+ )
+ENDIF()
+
+PEERDIR(
+ library/cpp/malloc/api
+)
+
+SET(IDE_FOLDER "util")
+
+END()