author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300
---|---|---
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/lfalloc |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/lfalloc')
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/align_ut.cpp | 23
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/profiler.cpp | 81
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/profiler.h | 45
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp | 76
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/stackcollect.cpp | 332
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/stackcollect.h | 107
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/ut/ya.make | 22
-rw-r--r-- | library/cpp/lfalloc/alloc_profiler/ya.make | 17
-rw-r--r-- | library/cpp/lfalloc/dbg/ya.make | 32
-rw-r--r-- | library/cpp/lfalloc/dbg_info/dbg_info.cpp | 124
-rw-r--r-- | library/cpp/lfalloc/dbg_info/dbg_info.h | 77
-rw-r--r-- | library/cpp/lfalloc/dbg_info/ya.make | 15
-rw-r--r-- | library/cpp/lfalloc/lf_allocX64.cpp | 144
-rw-r--r-- | library/cpp/lfalloc/lf_allocX64.h | 1926
-rw-r--r-- | library/cpp/lfalloc/ya.make | 25
-rw-r--r-- | library/cpp/lfalloc/yt/ya.make | 29
16 files changed, 3075 insertions, 0 deletions
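Most of the added code is the new allocation profiler under alloc_profiler, plus the LF_DBG hooks it relies on (dbg_info and lf_allocX64). Before the raw diff, here is a minimal usage sketch of the sampling API introduced in profiler.h, modeled on the profiler_ut.cpp tests added below; the driver function name is illustrative, and it assumes the program links library/cpp/lfalloc/alloc_profiler and is built with ALLOCATOR(LF_DBG), as alloc_profiler/ut/ya.make does (on AARCH64 the tests fall back to jemalloc and skip the output checks).

```cpp
#include <library/cpp/lfalloc/alloc_profiler/profiler.h>

#include <util/generic/ptr.h>
#include <util/generic/vector.h>
#include <util/stream/output.h>
#include <util/stream/str.h>

// Hypothetical driver; mirrors the StackCollection and TagNames unit tests.
void ProfileAllocationsOnce() {
    TStringStream report;

    // Sample allocations on all threads until sampling is stopped.
    NAllocProfiler::StartAllocationSampling(/* profileAllThreads = */ true);

    TVector<TAutoPtr<int>> data;
    {
        // Attribute allocations made in this scope to tag 42 (the tag used in the test).
        NAllocProfiler::TProfilingScope scope(42);
        for (int i = 0; i < 10000; ++i) {
            data.push_back(new int); // many small allocations, no frees yet
        }
    }

    // Dump the top 100 allocation stacks (allocs, frees, current size per stack).
    NAllocProfiler::StopAllocationSampling(report, /* count = */ 100);

    Cout << report.Str() << Endl;
}
```

StopAllocationSampling also accepts an IAllocationStatsDumper, so the plain text dump can be replaced; the TagNames test overrides TAllocationStatsDumper::FormatTag to print a symbolic name for tag 42.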
diff --git a/library/cpp/lfalloc/alloc_profiler/align_ut.cpp b/library/cpp/lfalloc/alloc_profiler/align_ut.cpp new file mode 100644 index 0000000000..db9b17b95b --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/align_ut.cpp @@ -0,0 +1,23 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/scope.h> + +Y_UNIT_TEST_SUITE(MemAlign) { + Y_UNIT_TEST(ShouldAlign) + { + for (ui64 size = 8; size <= 32 * 1024; size *= 8) { + for (ui64 align = 8; align <= 4096; align *=2) { + void* ptr = nullptr; + + int res = posix_memalign(&ptr, align, size); + UNIT_ASSERT_C(res == 0 && ptr != nullptr, "memalign failed"); + + Y_DEFER { + free(ptr); + }; + + UNIT_ASSERT_C((uintptr_t)ptr % align == 0, "non aligned memory"); + } + } + } +} diff --git a/library/cpp/lfalloc/alloc_profiler/profiler.cpp b/library/cpp/lfalloc/alloc_profiler/profiler.cpp new file mode 100644 index 0000000000..0e30927a5a --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/profiler.cpp @@ -0,0 +1,81 @@ +#include "profiler.h" + +#include "stackcollect.h" + +#include <util/generic/algorithm.h> +#include <util/generic/singleton.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/stream/str.h> + +namespace NAllocProfiler { + +namespace { + +static TAllocationStackCollector& AllocationStackCollector() +{ + return *Singleton<TAllocationStackCollector>(); +} + +int AllocationCallback(int tag, size_t size, int sizeIdx) +{ + Y_UNUSED(sizeIdx); + + static const size_t STACK_FRAMES_COUNT = 32; + static const size_t STACK_FRAMES_SKIP = 1; + + void* frames[STACK_FRAMES_COUNT]; + size_t frameCount = BackTrace(frames, Y_ARRAY_SIZE(frames)); + if (frameCount <= STACK_FRAMES_SKIP) { + return -1; + } + + void** stack = &frames[STACK_FRAMES_SKIP]; + frameCount -= STACK_FRAMES_SKIP; + + auto& collector = AllocationStackCollector(); + return collector.Alloc(stack, frameCount, tag, size); +} + +void DeallocationCallback(int stackId, int tag, size_t size, int sizeIdx) +{ + Y_UNUSED(tag); + Y_UNUSED(sizeIdx); + + auto& collector = AllocationStackCollector(); + collector.Free(stackId, size); +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +bool StartAllocationSampling(bool profileAllThreads) +{ + auto& collector = AllocationStackCollector(); + collector.Clear(); + + NAllocDbg::SetProfileAllThreads(profileAllThreads); + NAllocDbg::SetAllocationCallback(AllocationCallback); + NAllocDbg::SetDeallocationCallback(DeallocationCallback); + NAllocDbg::SetAllocationSamplingEnabled(true); + return true; +} + +bool StopAllocationSampling(IAllocationStatsDumper &out, int count) +{ + NAllocDbg::SetAllocationCallback(nullptr); + NAllocDbg::SetDeallocationCallback(nullptr); + NAllocDbg::SetAllocationSamplingEnabled(false); + + auto& collector = AllocationStackCollector(); + collector.Dump(count, out); + return true; +} + +bool StopAllocationSampling(IOutputStream& out, int count) { + TAllocationStatsDumper dumper(out); + return StopAllocationSampling(dumper, count); +} + +} // namespace NProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/profiler.h b/library/cpp/lfalloc/alloc_profiler/profiler.h new file mode 100644 index 0000000000..4ea49b9dcc --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/profiler.h @@ -0,0 +1,45 @@ +#pragma once + +#include "stackcollect.h" + +#include <library/cpp/lfalloc/dbg_info/dbg_info.h> + +#include <util/generic/noncopyable.h> +#include <util/stream/output.h> + +namespace NAllocProfiler { + 
+//////////////////////////////////////////////////////////////////////////////// + +inline int SetCurrentScopeTag(int value) +{ + return NAllocDbg::SetThreadAllocTag(value); +} + +inline bool SetProfileCurrentThread(bool value) +{ + return NAllocDbg::SetProfileCurrentThread(value); +} + +bool StartAllocationSampling(bool profileAllThreads = false); +bool StopAllocationSampling(IAllocationStatsDumper& out, int count = 100); +bool StopAllocationSampling(IOutputStream& out, int count = 100); + +//////////////////////////////////////////////////////////////////////////////// + +class TProfilingScope: private TNonCopyable { +private: + const int Prev; + +public: + explicit TProfilingScope(int value) + : Prev(SetCurrentScopeTag(value)) + {} + + ~TProfilingScope() + { + SetCurrentScopeTag(Prev); + } +}; + +} // namespace NAllocProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp b/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp new file mode 100644 index 0000000000..4341dda6ed --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/profiler_ut.cpp @@ -0,0 +1,76 @@ +#include "profiler.h" + +#include <library/cpp/testing/unittest/registar.h> + +namespace NAllocProfiler { + +//////////////////////////////////////////////////////////////////////////////// + +Y_UNIT_TEST_SUITE(Profiler) { + Y_UNIT_TEST(StackCollection) + { + TStringStream str; + + NAllocProfiler::StartAllocationSampling(true); + TVector<TAutoPtr<int>> test; + // Do many allocations and no deallocations + for (int i = 0; i < 10000; ++i) { + test.push_back(new int); + } + NAllocProfiler::StopAllocationSampling(str); + //Cout << str.Str() << Endl; + +#if !defined(ARCH_AARCH64) + /* Check that output resembles this: + + STACK #2: 0 Allocs: 10 Frees: 0 CurrentSize: 40 + 0000000000492353 ?? 
+ 000000000048781F operator new(unsigned long) +1807 + 00000000003733FA NAllocProfiler::NTestSuiteProfiler::TTestCaseStackCollection::Execute_(NUnitTest::TTestContext&) +218 + 00000000004A1938 NUnitTest::TTestBase::Run(std::__y1::function<void ()>, TString, char const*, bool) +120 + 0000000000375656 NAllocProfiler::NTestSuiteProfiler::TCurrentTest::Execute() +342 + 00000000004A20CF NUnitTest::TTestFactory::Execute() +847 + 000000000049922D NUnitTest::RunMain(int, char**) +1965 + 00007FF665778F45 __libc_start_main +245 + */ + + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "StackCollection"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::TTestBase::Run"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NAllocProfiler::NTestSuiteProfiler::TCurrentTest::Execute"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::TTestFactory::Execute"); + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "NUnitTest::RunMain"); +#endif + } + + class TAllocDumper : public NAllocProfiler::TAllocationStatsDumper { + public: + explicit TAllocDumper(IOutputStream& out) : NAllocProfiler::TAllocationStatsDumper(out) {} + + TString FormatTag(int tag) override { + UNIT_ASSERT_VALUES_EQUAL(tag, 42); + return "TAG_NAME_42"; + } + }; + + Y_UNIT_TEST(TagNames) + { + TStringStream str; + + NAllocProfiler::StartAllocationSampling(true); + TVector<TAutoPtr<int>> test; + NAllocProfiler::TProfilingScope scope(42); + // Do many allocations and no deallocations + for (int i = 0; i < 10000; ++i) { + test.push_back(new int); + } + + TAllocDumper dumper(str); + NAllocProfiler::StopAllocationSampling(dumper); + +#if !defined(ARCH_AARCH64) + UNIT_ASSERT_STRING_CONTAINS(str.Str(), "TAG_NAME_42"); +#endif + } +} + +} diff --git a/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp b/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp new file mode 100644 index 0000000000..fded4e2fd1 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/stackcollect.cpp @@ -0,0 +1,332 @@ +#include "stackcollect.h" + +#include "profiler.h" + +#include <util/generic/algorithm.h> +#include <util/generic/vector.h> +#include <util/stream/format.h> +#include <util/stream/str.h> +#include <util/string/cast.h> +#include <util/string/printf.h> +#include <util/system/backtrace.h> +#include <util/system/spinlock.h> +#include <util/system/yassert.h> + + +namespace NAllocProfiler { + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +class TStackCollector: private TNonCopyable { +public: + struct TFrameInfo { + int PrevInd; + void* Addr; + int Tag; + T Stats; + + void Clear() + { + PrevInd = 0; + Addr = nullptr; + Tag = 0; + Stats.Clear(); + } + }; + +private: + static const size_t STACKS_HASH_MAP_SIZE = 256 * 1024; + TFrameInfo Frames[STACKS_HASH_MAP_SIZE]; + + ui64 Samples; // Saved samples count + ui64 UniqueSamples; // Number of unique addresses + ui64 UsedSlots; // Number of occupied slots in the hashtable + ui64 DroppedSamples; // Number of unsaved addresses + ui64 SearchSkipCount; // Total number of linear hash table probes due to collisions + + TAdaptiveLock Lock; + +public: + TStackCollector() + { + Clear(); + } + + int AddStack(void** stack, size_t frameCount, int tag) + { + Y_ASSERT(frameCount > 0); + + int prevInd = -1; + with_lock (Lock) { + for (int i = frameCount - 1; i >= 0; --i) { + prevInd = AddFrame(stack[i], prevInd, ((i == 0) ? 
tag : 0), (i == 0)); + if (prevInd == -1) { + break; + } + } + } + return prevInd; + } + + T& GetStats(int stackId) + { + Y_ASSERT(stackId >= 0 && (size_t)stackId < Y_ARRAY_SIZE(Frames)); + Y_ASSERT(!IsSlotEmpty(stackId)); + + return Frames[stackId].Stats; + } + + const TFrameInfo* GetFrames() const + { + return Frames; + } + + size_t GetFramesCount() const + { + return Y_ARRAY_SIZE(Frames); + } + + void BackTrace(const TFrameInfo* stack, TStackVec<void*, 64>& frames) const + { + frames.clear(); + for (size_t i = 0; i < 100; ++i) { + frames.push_back(stack->Addr); + int prevInd = stack->PrevInd; + if (prevInd == -1) { + break; + } + stack = &Frames[prevInd]; + } + } + + void Clear() + { + for (auto& frame: Frames) { + frame.Clear(); + } + + Samples = 0; + DroppedSamples = 0; + UniqueSamples = 0; + UsedSlots = 0; + SearchSkipCount = 0; + } + +private: + // Hash function applied to the addresses + static ui32 Hash(void* addr, int prevInd, int tag) + { + return (((size_t)addr + ((size_t)addr / STACKS_HASH_MAP_SIZE)) + prevInd + tag) % STACKS_HASH_MAP_SIZE; + } + + static bool EqualFrame(const TFrameInfo& frame, void* addr, int prevInd, int tag) + { + return (frame.Addr == addr && frame.PrevInd == prevInd && frame.Tag == tag); + } + + bool IsSlotEmpty(ui32 slot) const + { + return Frames[slot].Addr == 0; + } + + bool InsertsAllowed() const + { + return UsedSlots < STACKS_HASH_MAP_SIZE / 2; + } + + // returns the index in the hashmap + int AddFrame(void* addr, int prevFrameIndex, int tag, bool last) + { + ui32 slot = Hash(addr, prevFrameIndex, tag); + ui32 prevSlot = (slot - 1) % STACKS_HASH_MAP_SIZE; + + while (!EqualFrame(Frames[slot], addr, prevFrameIndex, tag) && !IsSlotEmpty(slot) && slot != prevSlot) { + slot = (slot + 1) % STACKS_HASH_MAP_SIZE; + SearchSkipCount++; + } + + if (EqualFrame(Frames[slot], addr, prevFrameIndex, tag)) { + if (last) { + ++Samples; + } + } else if (InsertsAllowed() && IsSlotEmpty(slot)) { + // add new sample + Frames[slot].Clear(); + Frames[slot].Addr = addr; + Frames[slot].PrevInd = prevFrameIndex; + Frames[slot].Tag = tag; + ++UsedSlots; + if (last) { + ++UniqueSamples; + ++Samples; + } + } else { + // don't insert new sample if the search is becoming too slow + ++DroppedSamples; + return -1; + } + + return slot; + } +}; + + +//////////////////////////////////////////////////////////////////////////////// + +class TAllocationStackCollector::TImpl: public TStackCollector<TStats> { + using TBase = TStackCollector<TStats>; + +private: + TStats Total; + +public: + int Alloc(void** stack, size_t frameCount, int tag, size_t size) + { + int stackId = TBase::AddStack(stack, frameCount, tag); + if (stackId >= 0) { + TBase::GetStats(stackId).Alloc(size); + Total.Alloc(size); + } + return stackId; + } + + void Free(int stackId, size_t size) + { + TBase::GetStats(stackId).Free(size); + Total.Free(size); + } + + void Clear() + { + TBase::Clear(); + Total.Clear(); + } + + void Dump(int count, IAllocationStatsDumper& out) const + { + const TFrameInfo* frames = TBase::GetFrames(); + size_t framesCount = TBase::GetFramesCount(); + + TVector<const TFrameInfo*> stacks; + for (size_t i = 0; i < framesCount; ++i) { + if (frames[i].Stats.Allocs) { + stacks.push_back(&frames[i]); + } + } + + Sort(stacks, [] (const TFrameInfo* l, const TFrameInfo* r) { + const auto& ls = l->Stats; + const auto& rs = r->Stats; + return ls.CurrentSize != rs.CurrentSize + ? ls.CurrentSize > rs.CurrentSize + : ls.Allocs != rs.Allocs + ? 
ls.Allocs > rs.Allocs + : ls.Frees > rs.Frees; + }); + + out.DumpTotal(Total); + + TAllocationInfo allocInfo; + int printedCount = 0; + for (const TFrameInfo* stack: stacks) { + allocInfo.Clear(); + allocInfo.Tag = stack->Tag; + allocInfo.Stats = stack->Stats; + TBase::BackTrace(stack, allocInfo.Stack); + + out.DumpEntry(allocInfo); + + if (++printedCount >= count) { + break; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +TAllocationStackCollector::TAllocationStackCollector() + : Impl(new TImpl()) +{} + +TAllocationStackCollector::~TAllocationStackCollector() +{} + +int TAllocationStackCollector::Alloc(void** stack, size_t frameCount, int tag, size_t size) +{ + return Impl->Alloc(stack, frameCount, tag, size); +} + +void TAllocationStackCollector::Free(int stackId, size_t size) +{ + Impl->Free(stackId, size); +} + +void TAllocationStackCollector::Clear() +{ + Impl->Clear(); +} + +void TAllocationStackCollector::Dump(int count, IAllocationStatsDumper &out) const +{ + Impl->Dump(count, out); +} + + +TString IAllocationStatsDumper::FormatTag(int tag) { + return ToString(tag); +} + +TString IAllocationStatsDumper::FormatSize(intptr_t sz) { + return ToString(sz); +} + + +TAllocationStatsDumper::TAllocationStatsDumper(IOutputStream& out) + : PrintedCount(0) + , Out(out) + , SymbolCache(2048) +{} + +void TAllocationStatsDumper::DumpTotal(const TStats& total) { + Out << "TOTAL" + << "\tAllocs: " << total.Allocs + << "\tFrees: " << total.Frees + << "\tCurrentSize: " << FormatSize(total.CurrentSize) + << Endl; +} + +void TAllocationStatsDumper::DumpEntry(const TAllocationInfo& allocInfo) { + Out << Endl + << "STACK #" << PrintedCount+1 << ": " << FormatTag(allocInfo.Tag) + << "\tAllocs: " << allocInfo.Stats.Allocs + << "\tFrees: " << allocInfo.Stats.Frees + << "\tCurrentSize: " << FormatSize(allocInfo.Stats.CurrentSize) + << Endl; + FormatBackTrace(allocInfo.Stack.data(), allocInfo.Stack.size()); + PrintedCount++; +} + +void TAllocationStatsDumper::FormatBackTrace(void* const* stack, size_t sz) { + char name[1024]; + for (size_t i = 0; i < sz; ++i) { + TSymbol symbol; + auto it = SymbolCache.Find(stack[i]); + if (it != SymbolCache.End()) { + symbol = it.Value(); + } else { + TResolvedSymbol rs = ResolveSymbol(stack[i], name, sizeof(name)); + symbol = {rs.NearestSymbol, rs.Name}; + SymbolCache.Insert(stack[i], symbol); + } + + Out << Hex((intptr_t)stack[i], HF_FULL) << "\t" << symbol.Name; + intptr_t offset = (intptr_t)stack[i] - (intptr_t)symbol.Address; + if (offset) + Out << " +" << offset; + Out << Endl; + } +} + +} // namespace NAllocProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/stackcollect.h b/library/cpp/lfalloc/alloc_profiler/stackcollect.h new file mode 100644 index 0000000000..80715ed7cb --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/stackcollect.h @@ -0,0 +1,107 @@ +#pragma once + +#include <library/cpp/containers/stack_vector/stack_vec.h> +#include <library/cpp/cache/cache.h> + +#include <util/generic/noncopyable.h> +#include <util/generic/ptr.h> +#include <util/stream/output.h> + +namespace NAllocProfiler { + +struct TStats { + intptr_t Allocs = 0; + intptr_t Frees = 0; + intptr_t CurrentSize = 0; + + void Clear() + { + Allocs = 0; + Frees = 0; + CurrentSize = 0; + } + + void Alloc(size_t size) + { + AtomicIncrement(Allocs); + AtomicAdd(CurrentSize, size); + } + + void Free(size_t size) + { + AtomicIncrement(Frees); + AtomicSub(CurrentSize, size); + } +}; + +struct TAllocationInfo { + int Tag; + TStats Stats; + 
TStackVec<void*, 64> Stack; + + void Clear() { + Tag = 0; + Stats.Clear(); + Stack.clear(); + } +}; + + +class IAllocationStatsDumper { +public: + virtual ~IAllocationStatsDumper() = default; + + // Total stats + virtual void DumpTotal(const TStats& total) = 0; + + // Stats for individual stack + virtual void DumpEntry(const TAllocationInfo& allocInfo) = 0; + + // App-specific tag printer + virtual TString FormatTag(int tag); + + // Size printer (e.g. "10KB", "100MB", "over 9000") + virtual TString FormatSize(intptr_t sz); +}; + +// Default implementation +class TAllocationStatsDumper: public IAllocationStatsDumper { +public: + explicit TAllocationStatsDumper(IOutputStream& out); + void DumpTotal(const TStats& total) override; + void DumpEntry(const TAllocationInfo& allocInfo) override; + +private: + void FormatBackTrace(void* const* stack, size_t sz); + +private: + struct TSymbol { + const void* Address; + TString Name; + }; + + size_t PrintedCount; + IOutputStream& Out; + TLFUCache<void*, TSymbol> SymbolCache; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TAllocationStackCollector: private TNonCopyable { +private: + class TImpl; + THolder<TImpl> Impl; + +public: + TAllocationStackCollector(); + ~TAllocationStackCollector(); + + int Alloc(void** stack, size_t frameCount, int tag, size_t size); + void Free(int stackId, size_t size); + + void Clear(); + + void Dump(int count, IAllocationStatsDumper& out) const; +}; + +} // namespace NAllocProfiler diff --git a/library/cpp/lfalloc/alloc_profiler/ut/ya.make b/library/cpp/lfalloc/alloc_profiler/ut/ya.make new file mode 100644 index 0000000000..8a7daa74af --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/ut/ya.make @@ -0,0 +1,22 @@ +UNITTEST_FOR(library/cpp/lfalloc/alloc_profiler) + +OWNER(g:rtmr g:kikimr) + +PEERDIR( + library/cpp/testing/unittest +) + +IF (ARCH_AARCH64) + PEERDIR( + contrib/libs/jemalloc + ) +ELSE() + ALLOCATOR(LF_DBG) +ENDIF() + +SRCS( + profiler_ut.cpp + align_ut.cpp +) + +END() diff --git a/library/cpp/lfalloc/alloc_profiler/ya.make b/library/cpp/lfalloc/alloc_profiler/ya.make new file mode 100644 index 0000000000..0f58d91767 --- /dev/null +++ b/library/cpp/lfalloc/alloc_profiler/ya.make @@ -0,0 +1,17 @@ +LIBRARY() + +OWNER(g:rtmr g:kikimr) + +SRCS( + profiler.cpp + stackcollect.cpp +) + +PEERDIR( + library/cpp/lfalloc/dbg_info + library/cpp/cache +) + +END() + +RECURSE(ut) diff --git a/library/cpp/lfalloc/dbg/ya.make b/library/cpp/lfalloc/dbg/ya.make new file mode 100644 index 0000000000..3dce653a8c --- /dev/null +++ b/library/cpp/lfalloc/dbg/ya.make @@ -0,0 +1,32 @@ +LIBRARY() + +OWNER(vskipin) + +NO_UTIL() + +NO_COMPILER_WARNINGS() + +IF (ARCH_AARCH64) + PEERDIR( + contrib/libs/jemalloc + ) +ELSE() + IF ("${YMAKE}" MATCHES "devtools") + CFLAGS(-DYMAKE=1) + ENDIF() + CXXFLAGS( + -DLFALLOC_DBG + -DLFALLOC_YT + ) + SRCS( + ../lf_allocX64.cpp + ) +ENDIF() + +PEERDIR( + library/cpp/malloc/api +) + +SET(IDE_FOLDER "util") + +END() diff --git a/library/cpp/lfalloc/dbg_info/dbg_info.cpp b/library/cpp/lfalloc/dbg_info/dbg_info.cpp new file mode 100644 index 0000000000..1fb9f7ad93 --- /dev/null +++ b/library/cpp/lfalloc/dbg_info/dbg_info.cpp @@ -0,0 +1,124 @@ +#include "dbg_info.h" + +#include <library/cpp/malloc/api/malloc.h> + +namespace NAllocDbg { + //////////////////////////////////////////////////////////////////////////////// + + using TGetAllocationCounter = i64(int counter); + + using TSetThreadAllocTag = int(int tag); + using TGetPerTagAllocInfo = void( + bool 
flushPerThreadCounters, + TPerTagAllocInfo* info, + int& maxTag, + int& numSizes); + + using TSetProfileCurrentThread = bool(bool newVal); + using TSetProfileAllThreads = bool(bool newVal); + using TSetAllocationSamplingEnabled = bool(bool newVal); + + using TSetAllocationSampleRate = size_t(size_t newVal); + using TSetAllocationSampleMaxSize = size_t(size_t newVal); + + using TSetAllocationCallback = TAllocationCallback*(TAllocationCallback* newVal); + using TSetDeallocationCallback = TDeallocationCallback*(TDeallocationCallback* newVal); + + struct TAllocFn { + TGetAllocationCounter* GetAllocationCounterFast = nullptr; + TGetAllocationCounter* GetAllocationCounterFull = nullptr; + + TSetThreadAllocTag* SetThreadAllocTag = nullptr; + TGetPerTagAllocInfo* GetPerTagAllocInfo = nullptr; + + TSetProfileCurrentThread* SetProfileCurrentThread = nullptr; + TSetProfileAllThreads* SetProfileAllThreads = nullptr; + TSetAllocationSamplingEnabled* SetAllocationSamplingEnabled = nullptr; + + TSetAllocationSampleRate* SetAllocationSampleRate = nullptr; + TSetAllocationSampleMaxSize* SetAllocationSampleMaxSize = nullptr; + + TSetAllocationCallback* SetAllocationCallback = nullptr; + TSetDeallocationCallback* SetDeallocationCallback = nullptr; + + TAllocFn() { + auto mallocInfo = NMalloc::MallocInfo(); + + GetAllocationCounterFast = (TGetAllocationCounter*)mallocInfo.GetParam("GetLFAllocCounterFast"); + GetAllocationCounterFull = (TGetAllocationCounter*)mallocInfo.GetParam("GetLFAllocCounterFull"); + + SetThreadAllocTag = (TSetThreadAllocTag*)mallocInfo.GetParam("SetThreadAllocTag"); + GetPerTagAllocInfo = (TGetPerTagAllocInfo*)mallocInfo.GetParam("GetPerTagAllocInfo"); + + SetProfileCurrentThread = (TSetProfileCurrentThread*)mallocInfo.GetParam("SetProfileCurrentThread"); + SetProfileAllThreads = (TSetProfileAllThreads*)mallocInfo.GetParam("SetProfileAllThreads"); + SetAllocationSamplingEnabled = (TSetAllocationSamplingEnabled*)mallocInfo.GetParam("SetAllocationSamplingEnabled"); + + SetAllocationSampleRate = (TSetAllocationSampleRate*)mallocInfo.GetParam("SetAllocationSampleRate"); + SetAllocationSampleMaxSize = (TSetAllocationSampleMaxSize*)mallocInfo.GetParam("SetAllocationSampleMaxSize"); + + SetAllocationCallback = (TSetAllocationCallback*)mallocInfo.GetParam("SetAllocationCallback"); + SetDeallocationCallback = (TSetDeallocationCallback*)mallocInfo.GetParam("SetDeallocationCallback"); + } + }; + + //////////////////////////////////////////////////////////////////////////////// + + static TAllocFn AllocFn; + + i64 GetAllocationCounterFast(ELFAllocCounter counter) { + return AllocFn.GetAllocationCounterFast ? AllocFn.GetAllocationCounterFast(counter) : 0; + } + + i64 GetAllocationCounterFull(ELFAllocCounter counter) { + return AllocFn.GetAllocationCounterFull ? AllocFn.GetAllocationCounterFull(counter) : 0; + } + + int SetThreadAllocTag(int tag) { + return AllocFn.SetThreadAllocTag ? AllocFn.SetThreadAllocTag(tag) : 0; + } + + TArrayPtr<TPerTagAllocInfo> GetPerTagAllocInfo( + bool flushPerThreadCounters, + int& maxTag, + int& numSizes) { + if (AllocFn.GetPerTagAllocInfo) { + AllocFn.GetPerTagAllocInfo(flushPerThreadCounters, nullptr, maxTag, numSizes); + TArrayPtr<TPerTagAllocInfo> info = new TPerTagAllocInfo[maxTag * numSizes]; + AllocFn.GetPerTagAllocInfo(flushPerThreadCounters, info.Get(), maxTag, numSizes); + return info; + } + maxTag = 0; + numSizes = 0; + return nullptr; + } + + bool SetProfileCurrentThread(bool newVal) { + return AllocFn.SetProfileCurrentThread ? 
AllocFn.SetProfileCurrentThread(newVal) : false; + } + + bool SetProfileAllThreads(bool newVal) { + return AllocFn.SetProfileAllThreads ? AllocFn.SetProfileAllThreads(newVal) : false; + } + + bool SetAllocationSamplingEnabled(bool newVal) { + return AllocFn.SetAllocationSamplingEnabled ? AllocFn.SetAllocationSamplingEnabled(newVal) : false; + } + + size_t SetAllocationSampleRate(size_t newVal) { + return AllocFn.SetAllocationSampleRate ? AllocFn.SetAllocationSampleRate(newVal) : 0; + } + + size_t SetAllocationSampleMaxSize(size_t newVal) { + return AllocFn.SetAllocationSampleMaxSize ? AllocFn.SetAllocationSampleMaxSize(newVal) : 0; + } + + TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) { + return AllocFn.SetAllocationCallback ? AllocFn.SetAllocationCallback(newVal) : nullptr; + } + + TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) { + return AllocFn.SetDeallocationCallback ? AllocFn.SetDeallocationCallback(newVal) : nullptr; + } + +} diff --git a/library/cpp/lfalloc/dbg_info/dbg_info.h b/library/cpp/lfalloc/dbg_info/dbg_info.h new file mode 100644 index 0000000000..071562a81a --- /dev/null +++ b/library/cpp/lfalloc/dbg_info/dbg_info.h @@ -0,0 +1,77 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/system/types.h> + +namespace NAllocDbg { + //////////////////////////////////////////////////////////////////////////////// + // Allocation statistics + + enum ELFAllocCounter { + CT_USER_ALLOC, // accumulated size requested by user code + CT_MMAP, // accumulated mmapped size + CT_MMAP_CNT, // number of mmapped regions + CT_MUNMAP, // accumulated unmmapped size + CT_MUNMAP_CNT, // number of munmaped regions + CT_SYSTEM_ALLOC, // accumulated allocated size for internal lfalloc needs + CT_SYSTEM_FREE, // accumulated deallocated size for internal lfalloc needs + CT_SMALL_ALLOC, // accumulated allocated size for fixed-size blocks + CT_SMALL_FREE, // accumulated deallocated size for fixed-size blocks + CT_LARGE_ALLOC, // accumulated allocated size for large blocks + CT_LARGE_FREE, // accumulated deallocated size for large blocks + CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations + CT_DEGRAGMENT_CNT, // number of memory defragmentations + CT_MAX + }; + + i64 GetAllocationCounterFast(ELFAllocCounter counter); + i64 GetAllocationCounterFull(ELFAllocCounter counter); + + //////////////////////////////////////////////////////////////////////////////// + // Allocation statistics could be tracked on per-tag basis + + int SetThreadAllocTag(int tag); + + class TScopedTag { + private: + int PrevTag; + + public: + explicit TScopedTag(int tag) { + PrevTag = SetThreadAllocTag(tag); + } + + ~TScopedTag() { + SetThreadAllocTag(PrevTag); + } + }; + + struct TPerTagAllocInfo { + ssize_t Count; + ssize_t Size; + }; + + TArrayPtr<TPerTagAllocInfo> GetPerTagAllocInfo( + bool flushPerThreadCounters, + int& maxTag, + int& numSizes); + + //////////////////////////////////////////////////////////////////////////////// + // Allocation sampling could be used to collect detailed information + + bool SetProfileCurrentThread(bool newVal); + bool SetProfileAllThreads(bool newVal); + bool SetAllocationSamplingEnabled(bool newVal); + + size_t SetAllocationSampleRate(size_t newVal); + size_t SetAllocationSampleMaxSize(size_t newVal); + +#define DBG_ALLOC_INVALID_COOKIE (-1) + + using TAllocationCallback = int(int tag, size_t size, int sizeIdx); + using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx); + + TAllocationCallback* 
SetAllocationCallback(TAllocationCallback* newVal); + TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal); + +} diff --git a/library/cpp/lfalloc/dbg_info/ya.make b/library/cpp/lfalloc/dbg_info/ya.make new file mode 100644 index 0000000000..efecba5993 --- /dev/null +++ b/library/cpp/lfalloc/dbg_info/ya.make @@ -0,0 +1,15 @@ +LIBRARY() + +OWNER(vskipin) + +PEERDIR( + library/cpp/malloc/api +) + +SRCS( + dbg_info.cpp +) + +SET(IDE_FOLDER "util") + +END() diff --git a/library/cpp/lfalloc/lf_allocX64.cpp b/library/cpp/lfalloc/lf_allocX64.cpp new file mode 100644 index 0000000000..2eb90761fe --- /dev/null +++ b/library/cpp/lfalloc/lf_allocX64.cpp @@ -0,0 +1,144 @@ +#include "lf_allocX64.h" + +////////////////////////////////////////////////////////////////////////// +// hooks +#if defined(USE_INTELCC) || defined(_darwin_) || defined(_freebsd_) || defined(_STLPORT_VERSION) +#define OP_THROWNOTHING noexcept +#else +#define OP_THROWNOTHING +#endif + +#ifndef _darwin_ +#if !defined(YMAKE) +void* operator new(size_t size) { + return LFAlloc(size); +} + +void* operator new(size_t size, const std::nothrow_t&) OP_THROWNOTHING { + return LFAlloc(size); +} + +void operator delete(void* p)OP_THROWNOTHING { + LFFree(p); +} + +void operator delete(void* p, const std::nothrow_t&)OP_THROWNOTHING { + LFFree(p); +} + +void* operator new[](size_t size) { + return LFAlloc(size); +} + +void* operator new[](size_t size, const std::nothrow_t&) OP_THROWNOTHING { + return LFAlloc(size); +} + +void operator delete[](void* p) OP_THROWNOTHING { + LFFree(p); +} + +void operator delete[](void* p, const std::nothrow_t&) OP_THROWNOTHING { + LFFree(p); +} +#endif + +//#ifndef _MSC_VER + +extern "C" void* malloc(size_t size) { + return LFAlloc(size); +} + +extern "C" void* valloc(size_t size) { + return LFVAlloc(size); +} + +extern "C" int posix_memalign(void** memptr, size_t alignment, size_t size) { + return LFPosixMemalign(memptr, alignment, size); +} + +extern "C" void* memalign(size_t alignment, size_t size) { + void* ptr; + int res = LFPosixMemalign(&ptr, alignment, size); + return res ? nullptr : ptr; +} + +extern "C" void* aligned_alloc(size_t alignment, size_t size) { + return memalign(alignment, size); +} + +#if !defined(_MSC_VER) && !defined(_freebsd_) +// Workaround for pthread_create bug in linux. +extern "C" void* __libc_memalign(size_t alignment, size_t size) { + return memalign(alignment, size); +} +#endif + +extern "C" void free(void* ptr) { + LFFree(ptr); +} + +extern "C" void* calloc(size_t n, size_t elem_size) { + // Overflow check + const size_t size = n * elem_size; + if (elem_size != 0 && size / elem_size != n) + return nullptr; + + void* result = LFAlloc(size); + if (result != nullptr) { + memset(result, 0, size); + } + return result; +} + +extern "C" void cfree(void* ptr) { + LFFree(ptr); +} + +extern "C" void* realloc(void* old_ptr, size_t new_size) { + if (old_ptr == nullptr) { + void* result = LFAlloc(new_size); + return result; + } + if (new_size == 0) { + LFFree(old_ptr); + return nullptr; + } + + void* new_ptr = LFAlloc(new_size); + if (new_ptr == nullptr) { + return nullptr; + } + size_t old_size = LFGetSize(old_ptr); + memcpy(new_ptr, old_ptr, ((old_size < new_size) ? 
old_size : new_size)); + LFFree(old_ptr); + return new_ptr; +} + +extern "C" size_t malloc_usable_size(void* ptr) { + if (ptr == nullptr) { + return 0; + } + return LFGetSize(ptr); +} + +NMalloc::TMallocInfo NMalloc::MallocInfo() { + NMalloc::TMallocInfo r; +#if defined(LFALLOC_DBG) + r.Name = "lfalloc_dbg"; +#elif defined(LFALLOC_YT) + r.Name = "lfalloc_yt"; +#else + r.Name = "lfalloc"; +#endif + r.SetParam = &LFAlloc_SetParam; + r.GetParam = &LFAlloc_GetParam; + return r; +} +#else +NMalloc::TMallocInfo NMalloc::MallocInfo() { + NMalloc::TMallocInfo r; + r.Name = "system-darwin"; + return r; +} +#endif diff --git a/library/cpp/lfalloc/lf_allocX64.h b/library/cpp/lfalloc/lf_allocX64.h new file mode 100644 index 0000000000..fd2a906d6f --- /dev/null +++ b/library/cpp/lfalloc/lf_allocX64.h @@ -0,0 +1,1926 @@ +#pragma once + +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> + +#include <library/cpp/malloc/api/malloc.h> + +#include <util/system/compat.h> +#include <util/system/compiler.h> +#include <util/system/types.h> + +#ifdef _MSC_VER +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#ifdef _M_X64 +#define _64_ +#endif +#include <intrin.h> +#define WIN32_LEAN_AND_MEAN +#include <Windows.h> +#pragma intrinsic(_InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd) + +#include <new> +#include <assert.h> +#include <errno.h> + +#define PERTHREAD __declspec(thread) +#define _win_ +#define Y_FORCE_INLINE __forceinline + +using TAtomic = volatile long; + +static inline long AtomicAdd(TAtomic& a, long b) { + return _InterlockedExchangeAdd(&a, b) + b; +} + +static inline long AtomicSub(TAtomic& a, long b) { + return AtomicAdd(a, -b); +} + +#pragma comment(lib, "synchronization.lib") + +#ifndef NDEBUG +#define Y_ASSERT_NOBT(x) \ + { \ + if (IsDebuggerPresent()) { \ + if (!(x)) \ + __debugbreak(); \ + } else \ + assert(x); \ + } +#else +#define Y_ASSERT_NOBT(x) ((void)0) +#endif + +#else + +#include <util/system/defaults.h> +#include <util/system/atomic.h> +#include <util/system/yassert.h> + +#if !defined(NDEBUG) && !defined(__GCCXML__) +#define Y_ASSERT_NOBT(a) \ + do { \ + try { \ + if (Y_UNLIKELY(!(a))) { \ + if (YaIsDebuggerPresent()) \ + __debugbreak(); \ + else { \ + assert(false && (a)); \ + } \ + } \ + } catch (...) 
{ \ + if (YaIsDebuggerPresent()) \ + __debugbreak(); \ + else { \ + assert(false && "Exception during assert"); \ + } \ + } \ + } while (0) +#else +#define Y_ASSERT_NOBT(a) \ + do { \ + if (false) { \ + bool __xxx = static_cast<bool>(a); \ + Y_UNUSED(__xxx); \ + } \ + } while (0) +#endif + +#include <pthread.h> +#include <sys/mman.h> +#include <stdlib.h> +#include <memory.h> +#include <new> +#include <errno.h> + +#if defined(_linux_) +#include <linux/futex.h> +#include <sys/syscall.h> +#if !defined(MADV_HUGEPAGE) +#define MADV_HUGEPAGE 14 +#endif +#if !defined(MAP_HUGETLB) +#define MAP_HUGETLB 0x40000 +#endif +#endif + +#define PERTHREAD __thread + +#endif + +#ifndef _darwin_ + +#ifndef Y_ARRAY_SIZE +#define Y_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) +#endif + +#ifndef NDEBUG +#define DBG_FILL_MEMORY +static bool FillMemoryOnAllocation = true; +#endif + +static bool TransparentHugePages = false; // force MADV_HUGEPAGE for large allocs +static bool MapHugeTLB = false; // force MAP_HUGETLB for small allocs +static bool EnableDefrag = true; + +// Buffers that are larger than this size will not be filled with 0xcf +#ifndef DBG_FILL_MAX_SIZE +#define DBG_FILL_MAX_SIZE 0x01000000000000ULL +#endif + +template <class T> +inline T* DoCas(T* volatile* target, T* exchange, T* compare) { +#if defined(__has_builtin) && __has_builtin(__sync_val_compare_and_swap) + return __sync_val_compare_and_swap(target, compare, exchange); +#elif defined(_WIN32) +#ifdef _64_ + return (T*)_InterlockedCompareExchange64((__int64*)target, (__int64)exchange, (__int64)compare); +#else + //return (T*)InterlockedCompareExchangePointer(targetVoidP, exchange, compare); + return (T*)_InterlockedCompareExchange((LONG*)target, (LONG)exchange, (LONG)compare); +#endif +#elif defined(__i386) || defined(__x86_64__) + union { + T* volatile* NP; + void* volatile* VoidP; + } gccSucks; + gccSucks.NP = target; + void* volatile* targetVoidP = gccSucks.VoidP; + + __asm__ __volatile__( + "lock\n\t" + "cmpxchg %2,%0\n\t" + : "+m"(*(targetVoidP)), "+a"(compare) + : "r"(exchange) + : "cc", "memory"); + return compare; +#else +#error inline_cas not defined for this platform +#endif +} + +#ifdef _64_ +const uintptr_t N_MAX_WORKSET_SIZE = 0x100000000ll * 200; +const uintptr_t N_HUGE_AREA_FINISH = 0x700000000000ll; +#ifndef _freebsd_ +const uintptr_t LINUX_MMAP_AREA_START = 0x100000000ll; +static uintptr_t volatile linuxAllocPointer = LINUX_MMAP_AREA_START; +static uintptr_t volatile linuxAllocPointerHuge = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE; +#endif +#else +const uintptr_t N_MAX_WORKSET_SIZE = 0xffffffff; +#endif +#define ALLOC_START ((char*)0) + +const size_t N_CHUNK_SIZE = 1024 * 1024; +const size_t N_CHUNKS = N_MAX_WORKSET_SIZE / N_CHUNK_SIZE; +const size_t N_LARGE_ALLOC_SIZE = N_CHUNK_SIZE * 128; + +// map size idx to size in bytes +#ifdef LFALLOC_YT +const int N_SIZES = 27; +#else +const int N_SIZES = 25; +#endif +const int nSizeIdxToSize[N_SIZES] = { + -1, +#if defined(_64_) + 16, 16, 32, 32, 48, 64, 96, 128, +#else + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, +#endif + 192, 256, 384, 512, 768, 1024, 1536, 2048, + 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, +#ifdef LFALLOC_YT + 49152, 65536 +#endif +}; +#ifdef LFALLOC_YT +const size_t N_MAX_FAST_SIZE = 65536; +#else +const size_t N_MAX_FAST_SIZE = 32768; +#endif +const unsigned char size2idxArr1[64 + 1] = { + 1, +#if defined(_64_) + 2, 2, 4, 4, // 16, 16, 32, 32 +#else + 1, 2, 3, 4, // 8, 16, 24, 32 +#endif + 5, 5, 6, 6, // 48, 64 + 7, 7, 7, 7, 8, 8, 8, 8, // 96, 128 + 
9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, // 192, 256 + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // 384 + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12 // 512 +}; +#ifdef LFALLOC_YT +const unsigned char size2idxArr2[256] = { +#else +const unsigned char size2idxArr2[128] = { +#endif + 12, 12, 13, 14, // 512, 512, 768, 1024 + 15, 15, 16, 16, // 1536, 2048 + 17, 17, 17, 17, 18, 18, 18, 18, // 3072, 4096 + 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, // 6144, 8192 + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, // 12288 + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, // 16384 + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, // 24576 + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, // 32768 +#ifdef LFALLOC_YT + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, // 49152 + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, // 65536 +#endif +}; + +// map entry number to size idx +// special size idx's: 0 = not used, -1 = mem locked, but not allocated +static volatile char chunkSizeIdx[N_CHUNKS]; +const int FREE_CHUNK_ARR_BUF = 0x20000; // this is effectively 128G of free memory (with 1M chunks), should not be exhausted actually +static volatile uintptr_t freeChunkArr[FREE_CHUNK_ARR_BUF]; +static volatile int freeChunkCount; + +static void AddFreeChunk(uintptr_t chunkId) { + chunkSizeIdx[chunkId] = -1; + if (Y_UNLIKELY(freeChunkCount == FREE_CHUNK_ARR_BUF)) + NMalloc::AbortFromCorruptedAllocator("free chunks array overflowed"); + freeChunkArr[freeChunkCount++] = chunkId; +} + +static bool GetFreeChunk(uintptr_t* res) { + if (freeChunkCount == 0) { + *res = 0; + return false; + } + *res = freeChunkArr[--freeChunkCount]; + return true; +} + +////////////////////////////////////////////////////////////////////////// +enum ELFAllocCounter { + CT_USER_ALLOC, // accumulated size requested by user code + CT_MMAP, // accumulated mmapped size + CT_MMAP_CNT, // number of mmapped regions + CT_MUNMAP, // accumulated unmmapped size + CT_MUNMAP_CNT, // number of munmaped regions + CT_SYSTEM_ALLOC, // accumulated allocated size for internal lfalloc needs + CT_SYSTEM_FREE, // accumulated deallocated size for internal lfalloc needs + CT_SMALL_ALLOC, // accumulated allocated size for fixed-size blocks + CT_SMALL_FREE, // accumulated deallocated size for fixed-size blocks + CT_LARGE_ALLOC, // accumulated allocated size for large blocks + CT_LARGE_FREE, // accumulated deallocated size for large blocks + CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations + CT_DEGRAGMENT_CNT, // number of memory defragmentations + CT_MAX +}; + +static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value); + +////////////////////////////////////////////////////////////////////////// +enum EMMapMode { + MM_NORMAL, // memory for small allocs + MM_HUGE // memory for large allocs +}; + +#ifndef _MSC_VER +inline void 
VerifyMmapResult(void* result) { + if (Y_UNLIKELY(result == MAP_FAILED)) + NMalloc::AbortFromCorruptedAllocator("negative size requested? or just out of mem"); +} +#endif + +#if !defined(_MSC_VER) && !defined(_freebsd_) && defined(_64_) +static char* AllocWithMMapLinuxImpl(uintptr_t sz, EMMapMode mode) { + char* volatile* areaPtr; + char* areaStart; + uintptr_t areaFinish; + + int mapProt = PROT_READ | PROT_WRITE; + int mapFlags = MAP_PRIVATE | MAP_ANON; + + if (mode == MM_HUGE) { + areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointerHuge); + areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE); + areaFinish = N_HUGE_AREA_FINISH; + } else { + areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointer); + areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START); + areaFinish = N_MAX_WORKSET_SIZE; + + if (MapHugeTLB) { + mapFlags |= MAP_HUGETLB; + } + } + + bool wrapped = false; + for (;;) { + char* prevAllocPtr = *areaPtr; + char* nextAllocPtr = prevAllocPtr + sz; + if (uintptr_t(nextAllocPtr - (char*)nullptr) >= areaFinish) { + if (Y_UNLIKELY(wrapped)) { + NMalloc::AbortFromCorruptedAllocator("virtual memory is over fragmented"); + } + // wrap after all area is used + DoCas(areaPtr, areaStart, prevAllocPtr); + wrapped = true; + continue; + } + + if (DoCas(areaPtr, nextAllocPtr, prevAllocPtr) != prevAllocPtr) + continue; + + char* largeBlock = (char*)mmap(prevAllocPtr, sz, mapProt, mapFlags, -1, 0); + VerifyMmapResult(largeBlock); + if (largeBlock == prevAllocPtr) + return largeBlock; + if (largeBlock) + munmap(largeBlock, sz); + + if (sz < 0x80000) { + // skip utilized area with big steps + DoCas(areaPtr, nextAllocPtr + 0x10 * 0x10000, nextAllocPtr); + } + } +} +#endif + +static char* AllocWithMMap(uintptr_t sz, EMMapMode mode) { + (void)mode; +#ifdef _MSC_VER + char* largeBlock = (char*)VirtualAlloc(0, sz, MEM_RESERVE, PAGE_READWRITE); + if (Y_UNLIKELY(largeBlock == nullptr)) + NMalloc::AbortFromCorruptedAllocator("out of memory"); + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE)) + NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken"); +#else +#if defined(_freebsd_) || !defined(_64_) + char* largeBlock = (char*)mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + VerifyMmapResult(largeBlock); + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE)) + NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken"); +#else + char* largeBlock = AllocWithMMapLinuxImpl(sz, mode); + if (TransparentHugePages) { + madvise(largeBlock, sz, MADV_HUGEPAGE); + } +#endif +#endif + Y_ASSERT_NOBT(largeBlock); + IncrementCounter(CT_MMAP, sz); + IncrementCounter(CT_MMAP_CNT, 1); + return largeBlock; +} + +enum class ELarge : ui8 { + Free = 0, // block in free cache + Alloc = 1, // block is allocated + Gone = 2, // block was unmapped +}; + +struct TLargeBlk { + + static TLargeBlk* As(void *raw) { + return reinterpret_cast<TLargeBlk*>((char*)raw - 4096ll); + } + + static const TLargeBlk* As(const void *raw) { + return reinterpret_cast<const TLargeBlk*>((const char*)raw - 4096ll); + } + + void SetSize(size_t bytes, size_t pages) { + Pages = pages; + Bytes = bytes; + } + + void Mark(ELarge state) { + const ui64 marks[] = { + 0x8b38aa5ca4953c98, // ELarge::Free + 0xf916d33584eb5087, // ELarge::Alloc + 0xd33b0eca7651bc3f // ELarge::Gone + }; + + Token = size_t(marks[ui8(state)]); + } + + size_t Pages; // Total pages allocated with mmap 
like call + size_t Bytes; // Actually requested bytes by user + size_t Token; // Block state token, see ELarge enum. +}; + + +static void LargeBlockUnmap(void* p, size_t pages) { + const auto bytes = (pages + 1) * uintptr_t(4096); + + IncrementCounter(CT_MUNMAP, bytes); + IncrementCounter(CT_MUNMAP_CNT, 1); +#ifdef _MSC_VER + Y_ASSERT_NOBT(0); +#else + TLargeBlk::As(p)->Mark(ELarge::Gone); + munmap((char*)p - 4096ll, bytes); +#endif +} + +////////////////////////////////////////////////////////////////////////// +const size_t LB_BUF_SIZE = 250; +const size_t LB_BUF_HASH = 977; +static int LB_LIMIT_TOTAL_SIZE = 500 * 1024 * 1024 / 4096; // do not keep more then this mem total in lbFreePtrs[] +static void* volatile lbFreePtrs[LB_BUF_HASH][LB_BUF_SIZE]; +static TAtomic lbFreePageCount; + + +static void* LargeBlockAlloc(size_t _nSize, ELFAllocCounter counter) { + size_t pgCount = (_nSize + 4095) / 4096; +#ifdef _MSC_VER + char* pRes = (char*)VirtualAlloc(0, (pgCount + 1) * 4096ll, MEM_COMMIT, PAGE_READWRITE); + if (Y_UNLIKELY(pRes == 0)) { + NMalloc::AbortFromCorruptedAllocator("out of memory"); + } +#else + + IncrementCounter(counter, pgCount * 4096ll); + IncrementCounter(CT_SYSTEM_ALLOC, 4096ll); + + int lbHash = pgCount % LB_BUF_HASH; + for (int i = 0; i < LB_BUF_SIZE; ++i) { + void* p = lbFreePtrs[lbHash][i]; + if (p == nullptr) + continue; + if (DoCas(&lbFreePtrs[lbHash][i], (void*)nullptr, p) == p) { + size_t realPageCount = TLargeBlk::As(p)->Pages; + if (realPageCount == pgCount) { + AtomicAdd(lbFreePageCount, -pgCount); + TLargeBlk::As(p)->Mark(ELarge::Alloc); + return p; + } else { + if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) != (void*)nullptr) { + // block was freed while we were busy + AtomicAdd(lbFreePageCount, -realPageCount); + LargeBlockUnmap(p, realPageCount); + --i; + } + } + } + } + char* pRes = AllocWithMMap((pgCount + 1) * 4096ll, MM_HUGE); +#endif + pRes += 4096ll; + TLargeBlk::As(pRes)->SetSize(_nSize, pgCount); + TLargeBlk::As(pRes)->Mark(ELarge::Alloc); + + return pRes; +} + +#ifndef _MSC_VER +static void FreeAllLargeBlockMem() { + for (auto& lbFreePtr : lbFreePtrs) { + for (int i = 0; i < LB_BUF_SIZE; ++i) { + void* p = lbFreePtr[i]; + if (p == nullptr) + continue; + if (DoCas(&lbFreePtr[i], (void*)nullptr, p) == p) { + int pgCount = TLargeBlk::As(p)->Pages; + AtomicAdd(lbFreePageCount, -pgCount); + LargeBlockUnmap(p, pgCount); + } + } + } +} +#endif + +static void LargeBlockFree(void* p, ELFAllocCounter counter) { + if (p == nullptr) + return; +#ifdef _MSC_VER + VirtualFree((char*)p - 4096ll, 0, MEM_RELEASE); +#else + size_t pgCount = TLargeBlk::As(p)->Pages; + + TLargeBlk::As(p)->Mark(ELarge::Free); + IncrementCounter(counter, pgCount * 4096ll); + IncrementCounter(CT_SYSTEM_FREE, 4096ll); + + if (lbFreePageCount > LB_LIMIT_TOTAL_SIZE) + FreeAllLargeBlockMem(); + int lbHash = pgCount % LB_BUF_HASH; + for (int i = 0; i < LB_BUF_SIZE; ++i) { + if (lbFreePtrs[lbHash][i] == nullptr) { + if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) == nullptr) { + AtomicAdd(lbFreePageCount, pgCount); + return; + } + } + } + + LargeBlockUnmap(p, pgCount); +#endif +} + +static void* SystemAlloc(size_t _nSize) { + //HeapAlloc(GetProcessHeap(), HEAP_GENERATE_EXCEPTIONS, _nSize); + return LargeBlockAlloc(_nSize, CT_SYSTEM_ALLOC); +} +static void SystemFree(void* p) { + //HeapFree(GetProcessHeap(), 0, p); + LargeBlockFree(p, CT_SYSTEM_FREE); +} + + +////////////////////////////////////////////////////////////////////////// +char* const LF_LOCK_FREE = ((char*)0) + 0; +char* 
const LF_LOCK_LOCKED = ((char*)0) + 1; +char* const LF_LOCK_FUTEX_WAIT = ((char*)0) + 2; +static bool LFHasFutex = true; +static bool LFCheckedWinVersion = false; + +// TLFLockData has to be zero-initialized explicitly https://en.cppreference.com/w/cpp/language/zero_initialization +// otherwise constructor TLFLockData() for global var might be called after first use +struct TLFLockData +{ + char* Pad1[15]; + char* volatile LockVar; // = LF_LOCK_FREE; // no constructor, zero-initialize manually + char* Pad2[15]; + + bool TryLock() + { + return (LockVar == LF_LOCK_FREE && DoCas(&LockVar, LF_LOCK_LOCKED, LF_LOCK_FREE) == LF_LOCK_FREE); + } + + void FutexWait() + { +#ifdef _win_ + if (!LFCheckedWinVersion) { + OSVERSIONINFOA ver; + memset(&ver, 0, sizeof(ver)); + ver.dwOSVersionInfoSize = sizeof(OSVERSIONINFOA); + GetVersionExA(&ver); + LFHasFutex = (ver.dwMajorVersion > 6) || (ver.dwMajorVersion == 6 && ver.dwMinorVersion >= 2); + LFCheckedWinVersion = true; + } + if (LFHasFutex) { + if (LockVar == LF_LOCK_LOCKED) { + DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED); + } + if (LockVar == LF_LOCK_FUTEX_WAIT) { + char* lockedValue = LF_LOCK_FUTEX_WAIT; + WaitOnAddress(&LockVar, &lockedValue, sizeof(LockVar), INFINITE); + } + } else { + SwitchToThread(); + } +#elif defined(_linux_) + if (LFHasFutex) { + if (LockVar == LF_LOCK_LOCKED) { + DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED); + } + if (LockVar == LF_LOCK_FUTEX_WAIT) { + // linux allow only int variables checks, here we pretend low bits of LockVar are int + syscall(SYS_futex, &LockVar, FUTEX_WAIT_PRIVATE, *(int*)&LF_LOCK_FUTEX_WAIT, 0, 0, 0); + } + } else { + sched_yield(); + } +#else + sched_yield(); +#endif + } + + void Unlock() + { + Y_ASSERT_NOBT(LockVar != LF_LOCK_FREE); + if (DoCas(&LockVar, LF_LOCK_FREE, LF_LOCK_LOCKED) != LF_LOCK_LOCKED) { + Y_ASSERT_NOBT(LockVar == LF_LOCK_FUTEX_WAIT && LFHasFutex); + LockVar = LF_LOCK_FREE; +#ifdef _win_ + WakeByAddressAll((PVOID)&LockVar); +#elif defined(_linux_) + syscall(SYS_futex, &LockVar, FUTEX_WAKE_PRIVATE, INT_MAX, 0, 0, 0); +#endif + } + } +}; + +static TLFLockData LFGlobalLock; + + +class TLFLockHolder { + TLFLockData *LockData = nullptr; + int Attempt = 0; + int SleepMask = 0x7f; + +public: + TLFLockHolder() {} + TLFLockHolder(TLFLockData *lk) { + while (!TryLock(lk)); + } + bool TryLock(TLFLockData *lk) + { + Y_ASSERT_NOBT(LockData == nullptr); + if (lk->TryLock()) { + LockData = lk; + return true; + } + if ((++Attempt & SleepMask) == 0) { + lk->FutexWait(); + SleepMask = (SleepMask * 2 + 1) & 0x7fff; + } else { +#ifdef _MSC_VER + _mm_pause(); +#elif defined(__i386) || defined(__x86_64__) + __asm__ __volatile__("pause"); +#endif + } + return false; + } + ~TLFLockHolder() { + if (LockData) { + LockData->Unlock(); + } + } +}; + +////////////////////////////////////////////////////////////////////////// +class TLFAllocFreeList { + struct TNode { + TNode* Next; + }; + + TNode* volatile Head; + TNode* volatile Pending; + TAtomic PendingToFreeListCounter; + TAtomic AllocCount; + void* Padding; + + static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) { + for (;;) { + TNode* volatile prevHead = *headPtr; + n->Next = prevHead; + if (DoCas(headPtr, n, prevHead) == prevHead) + break; + } + } + Y_FORCE_INLINE void* DoAlloc() { + TNode* res; + for (res = Head; res; res = Head) { + TNode* keepNext = res->Next; + if (DoCas(&Head, keepNext, res) == res) { + //Y_VERIFY(keepNext == res->Next); + break; + } + } + return res; + } + void FreeList(TNode* fl) { + if (!fl) + 
return; + TNode* flTail = fl; + while (flTail->Next) + flTail = flTail->Next; + for (;;) { + TNode* volatile prevHead = Head; + flTail->Next = prevHead; + if (DoCas(&Head, fl, prevHead) == prevHead) + break; + } + } + +public: + Y_FORCE_INLINE void Free(void* ptr) { + TNode* newFree = (TNode*)ptr; + if (AtomicAdd(AllocCount, 0) == 0) + Enqueue(&Head, newFree); + else + Enqueue(&Pending, newFree); + } + Y_FORCE_INLINE void* Alloc() { + TAtomic keepCounter = AtomicAdd(PendingToFreeListCounter, 0); + TNode* fl = Pending; + if (AtomicAdd(AllocCount, 1) == 1) { + // No other allocs in progress. + // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads. + // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList + if (fl && keepCounter == AtomicAdd(PendingToFreeListCounter, 0) && DoCas(&Pending, (TNode*)nullptr, fl) == fl) { + // pick first element from Pending and return it + void* res = fl; + fl = fl->Next; + // if there are other elements in Pending list, add them to main free list + FreeList(fl); + AtomicAdd(PendingToFreeListCounter, 1); + AtomicAdd(AllocCount, -1); + return res; + } + } + void* res = DoAlloc(); + AtomicAdd(AllocCount, -1); + return res; + } + void* GetWholeList() { + TNode* res; + for (res = Head; res; res = Head) { + if (DoCas(&Head, (TNode*)nullptr, res) == res) + break; + } + return res; + } + void ReturnWholeList(void* ptr) { + while (AtomicAdd(AllocCount, 0) != 0) // theoretically can run into problems with parallel DoAlloc() + ; //ThreadYield(); + for (;;) { + TNode* prevHead = Head; + if (DoCas(&Head, (TNode*)ptr, prevHead) == prevHead) { + FreeList(prevHead); + break; + } + } + } +}; + +///////////////////////////////////////////////////////////////////////// +static TLFAllocFreeList globalFreeLists[N_SIZES]; +static char* volatile globalCurrentPtr[N_SIZES]; +static TLFAllocFreeList blockFreeList; + +// globalFreeLists[] contains TFreeListGroup, each of them points up to 15 free blocks +const int FL_GROUP_SIZE = 15; +struct TFreeListGroup { + TFreeListGroup* Next; + char* Ptrs[FL_GROUP_SIZE]; +}; +#ifdef _64_ +const int FREE_LIST_GROUP_SIZEIDX = 8; +#else +const int FREE_LIST_GROUP_SIZEIDX = 6; +#endif + +////////////////////////////////////////////////////////////////////////// +// find free chunks and reset chunk size so they can be reused by different sized allocations +// do not look at blockFreeList (TFreeListGroup has same size for any allocations) +static bool DefragmentMem() { + if (!EnableDefrag) { + return false; + } + + IncrementCounter(CT_DEGRAGMENT_CNT, 1); + + int* nFreeCount = (int*)SystemAlloc(N_CHUNKS * sizeof(int)); + if (Y_UNLIKELY(!nFreeCount)) { + //__debugbreak(); + NMalloc::AbortFromCorruptedAllocator("debugbreak"); + } + memset(nFreeCount, 0, N_CHUNKS * sizeof(int)); + + TFreeListGroup* wholeLists[N_SIZES]; + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) { + wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList(); + for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) { + for (auto pData : g->Ptrs) { + if (pData) { + uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE; + ++nFreeCount[nChunk]; + Y_ASSERT_NOBT(chunkSizeIdx[nChunk] == nSizeIdx); + } + } + } + } + + bool bRes = false; + for (size_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) { + int fc = nFreeCount[nChunk]; + nFreeCount[nChunk] = 0; + if (chunkSizeIdx[nChunk] <= 0) + continue; + int nEntries = N_CHUNK_SIZE / 
nSizeIdxToSize[static_cast<int>(chunkSizeIdx[nChunk])]; + Y_ASSERT_NOBT(fc <= nEntries); // can not have more free blocks then total count + if (fc == nEntries) { + bRes = true; + nFreeCount[nChunk] = 1; + } + } + if (bRes) { + for (auto& wholeList : wholeLists) { + TFreeListGroup** ppPtr = &wholeList; + while (*ppPtr) { + TFreeListGroup* g = *ppPtr; + int dst = 0; + for (auto pData : g->Ptrs) { + if (pData) { + uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE; + if (nFreeCount[nChunk] == 0) + g->Ptrs[dst++] = pData; // block is not freed, keep pointer + } + } + if (dst == 0) { + // no valid pointers in group, free it + *ppPtr = g->Next; + blockFreeList.Free(g); + } else { + // reset invalid pointers to 0 + for (int i = dst; i < FL_GROUP_SIZE; ++i) + g->Ptrs[i] = nullptr; + ppPtr = &g->Next; + } + } + } + for (uintptr_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) { + if (!nFreeCount[nChunk]) + continue; + char* pStart = ALLOC_START + nChunk * N_CHUNK_SIZE; +#ifdef _win_ + VirtualFree(pStart, N_CHUNK_SIZE, MEM_DECOMMIT); +#elif defined(_freebsd_) + madvise(pStart, N_CHUNK_SIZE, MADV_FREE); +#else + madvise(pStart, N_CHUNK_SIZE, MADV_DONTNEED); +#endif + AddFreeChunk(nChunk); + } + } + + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) + globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]); + + SystemFree(nFreeCount); + return bRes; +} + +static Y_FORCE_INLINE void* LFAllocFromCurrentChunk(int nSizeIdx, int blockSize, int count) { + char* volatile* pFreeArray = &globalCurrentPtr[nSizeIdx]; + while (char* newBlock = *pFreeArray) { + char* nextFree = newBlock + blockSize * count; + + // check if there is space in chunk + char* globalEndPtr = ALLOC_START + ((newBlock - ALLOC_START) & ~((uintptr_t)N_CHUNK_SIZE - 1)) + N_CHUNK_SIZE; + if (nextFree >= globalEndPtr) { + if (nextFree > globalEndPtr) + break; + nextFree = nullptr; // it was last block in chunk + } + if (DoCas(pFreeArray, nextFree, newBlock) == newBlock) + return newBlock; + } + return nullptr; +} + +enum EDefrag { + MEM_DEFRAG, + NO_MEM_DEFRAG, +}; + +static void* SlowLFAlloc(int nSizeIdx, int blockSize, EDefrag defrag) { + IncrementCounter(CT_SLOW_ALLOC_CNT, 1); + + TLFLockHolder ls; + for (;;) { + bool locked = ls.TryLock(&LFGlobalLock); + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1); + if (res) { + return res; // might happen when other thread allocated new current chunk + } + if (locked) { + break; + } + } + for (;;) { + uintptr_t nChunk; + if (GetFreeChunk(&nChunk)) { + char* newPlace = ALLOC_START + nChunk * N_CHUNK_SIZE; +#ifdef _MSC_VER + void* pTest = VirtualAlloc(newPlace, N_CHUNK_SIZE, MEM_COMMIT, PAGE_READWRITE); + Y_ASSERT_NOBT(pTest == newPlace); +#endif + chunkSizeIdx[nChunk] = (char)nSizeIdx; + globalCurrentPtr[nSizeIdx] = newPlace + blockSize; + return newPlace; + } + + // out of luck, try to defrag + if (defrag == MEM_DEFRAG && DefragmentMem()) { + continue; + } + + char* largeBlock = AllocWithMMap(N_LARGE_ALLOC_SIZE, MM_NORMAL); + uintptr_t addr = ((largeBlock - ALLOC_START) + N_CHUNK_SIZE - 1) & (~(N_CHUNK_SIZE - 1)); + uintptr_t endAddr = ((largeBlock - ALLOC_START) + N_LARGE_ALLOC_SIZE) & (~(N_CHUNK_SIZE - 1)); + for (uintptr_t p = addr; p < endAddr; p += N_CHUNK_SIZE) { + uintptr_t chunk = p / N_CHUNK_SIZE; + Y_ASSERT_NOBT(chunk * N_CHUNK_SIZE == p); + Y_ASSERT_NOBT(chunkSizeIdx[chunk] == 0); + AddFreeChunk(chunk); + } + } + return nullptr; +} + +// allocate single block +static Y_FORCE_INLINE void* LFAllocNoCache(int nSizeIdx, EDefrag defrag) { + int blockSize = 
nSizeIdxToSize[nSizeIdx]; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1); + if (res) + return res; + + return SlowLFAlloc(nSizeIdx, blockSize, defrag); +} + +// allocate multiple blocks, returns number of blocks allocated (max FL_GROUP_SIZE) +// buf should have space for at least FL_GROUP_SIZE elems +static Y_FORCE_INLINE int LFAllocNoCacheMultiple(int nSizeIdx, char** buf) { + int blockSize = nSizeIdxToSize[nSizeIdx]; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, FL_GROUP_SIZE); + if (res) { + char* resPtr = (char*)res; + for (int k = 0; k < FL_GROUP_SIZE; ++k) { + buf[k] = resPtr; + resPtr += blockSize; + } + return FL_GROUP_SIZE; + } + buf[0] = (char*)SlowLFAlloc(nSizeIdx, blockSize, MEM_DEFRAG); + return 1; +} + +// take several blocks from global free list (max FL_GROUP_SIZE blocks), returns number of blocks taken +// buf should have space for at least FL_GROUP_SIZE elems +static Y_FORCE_INLINE int TakeBlocksFromGlobalFreeList(int nSizeIdx, char** buf) { + TLFAllocFreeList& fl = globalFreeLists[nSizeIdx]; + TFreeListGroup* g = (TFreeListGroup*)fl.Alloc(); + if (g) { + int resCount = 0; + for (auto& ptr : g->Ptrs) { + if (ptr) + buf[resCount++] = ptr; + else + break; + } + blockFreeList.Free(g); + return resCount; + } + return 0; +} + +// add several blocks to global free list +static Y_FORCE_INLINE void PutBlocksToGlobalFreeList(ptrdiff_t nSizeIdx, char** buf, int count) { + for (int startIdx = 0; startIdx < count;) { + TFreeListGroup* g = (TFreeListGroup*)blockFreeList.Alloc(); + Y_ASSERT_NOBT(sizeof(TFreeListGroup) == nSizeIdxToSize[FREE_LIST_GROUP_SIZEIDX]); + if (!g) { + g = (TFreeListGroup*)LFAllocNoCache(FREE_LIST_GROUP_SIZEIDX, NO_MEM_DEFRAG); + } + + int groupSize = count - startIdx; + if (groupSize > FL_GROUP_SIZE) + groupSize = FL_GROUP_SIZE; + for (int i = 0; i < groupSize; ++i) + g->Ptrs[i] = buf[startIdx + i]; + for (int i = groupSize; i < FL_GROUP_SIZE; ++i) + g->Ptrs[i] = nullptr; + + // add free group to the global list + TLFAllocFreeList& fl = globalFreeLists[nSizeIdx]; + fl.Free(g); + + startIdx += groupSize; + } +} + +////////////////////////////////////////////////////////////////////////// +static TAtomic GlobalCounters[CT_MAX]; +const int MAX_LOCAL_UPDATES = 100; +const intptr_t MAX_LOCAL_DELTA = 1*1024*1024; + +struct TLocalCounter { + intptr_t Value; + int Updates; + TAtomic* Parent; + + Y_FORCE_INLINE void Init(TAtomic* parent) { + Parent = parent; + Value = 0; + Updates = 0; + } + + Y_FORCE_INLINE void Increment(size_t value) { + Value += value; + if (++Updates > MAX_LOCAL_UPDATES || Value > MAX_LOCAL_DELTA) { + Flush(); + } + } + + Y_FORCE_INLINE void Flush() { + AtomicAdd(*Parent, Value); + Value = 0; + Updates = 0; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// DBG stuff +//////////////////////////////////////////////////////////////////////////////// + +#if defined(LFALLOC_DBG) + +struct TPerTagAllocCounter { + TAtomic Size; + TAtomic Count; + + Y_FORCE_INLINE void Alloc(size_t size) { + AtomicAdd(Size, size); + AtomicAdd(Count, 1); + } + + Y_FORCE_INLINE void Free(size_t size) { + AtomicSub(Size, size); + AtomicSub(Count, 1); + } +}; + +struct TLocalPerTagAllocCounter { + intptr_t Size; + int Count; + int Updates; + + Y_FORCE_INLINE void Init() { + Size = 0; + Count = 0; + Updates = 0; + } + + Y_FORCE_INLINE void Alloc(TPerTagAllocCounter& parent, size_t size) { + Size += size; + ++Count; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(parent); + } + } + + Y_FORCE_INLINE void 
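+/*
+   TLocalCounter above (and the per-tag counters in this debug section) follow
+   the same pattern: each thread accumulates statistics privately and touches
+   the shared atomic only once per MAX_LOCAL_UPDATES increments or once
+   MAX_LOCAL_DELTA bytes have accumulated, which keeps the hot allocation path
+   free of contended read-modify-write traffic. A self-contained sketch of the
+   idea, with std::atomic standing in for TAtomic and invented names
+   (TBatchedCounter; kMaxUpdates and kMaxDelta mirror the constants above):
+
+       #include <atomic>
+       #include <cstdint>
+
+       struct TBatchedCounter {
+           static constexpr int kMaxUpdates = 100;          // MAX_LOCAL_UPDATES
+           static constexpr intptr_t kMaxDelta = 1 << 20;   // MAX_LOCAL_DELTA (1 MiB)
+
+           std::atomic<intptr_t> Global{0};
+
+           struct TLocal {
+               intptr_t Value = 0;
+               int Updates = 0;
+           };
+
+           void Increment(TLocal& local, size_t value) {
+               local.Value += static_cast<intptr_t>(value);
+               if (++local.Updates > kMaxUpdates || local.Value > kMaxDelta) {
+                   // One shared RMW per batch instead of one per allocation.
+                   Global.fetch_add(local.Value, std::memory_order_relaxed);
+                   local.Value = 0;
+                   local.Updates = 0;
+               }
+           }
+       };
+*/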
Free(TPerTagAllocCounter& parent, size_t size) { + Size -= size; + --Count; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(parent); + } + } + + Y_FORCE_INLINE void Flush(TPerTagAllocCounter& parent) { + AtomicAdd(parent.Size, Size); + Size = 0; + AtomicAdd(parent.Count, Count); + Count = 0; + Updates = 0; + } +}; + +static const int DBG_ALLOC_MAX_TAG = 1000; +static const int DBG_ALLOC_ALIGNED_TAG = 0xF0000000; +static const int DBG_ALLOC_NUM_SIZES = 30; +static TPerTagAllocCounter GlobalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES]; + +#endif // LFALLOC_DBG + +////////////////////////////////////////////////////////////////////////// +const int THREAD_BUF = 256; +static int borderSizes[N_SIZES]; +const int MAX_MEM_PER_SIZE_PER_THREAD = 512 * 1024; +struct TThreadAllocInfo { + // FreePtrs - pointers to first free blocks in per thread block list + // LastFreePtrs - pointers to last blocks in lists, may be invalid if FreePtr is zero + char* FreePtrs[N_SIZES][THREAD_BUF]; + int FreePtrIndex[N_SIZES]; + TThreadAllocInfo* pNextInfo; + TLocalCounter LocalCounters[CT_MAX]; + +#if defined(LFALLOC_DBG) + TLocalPerTagAllocCounter LocalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES]; +#endif +#ifdef _win_ + HANDLE hThread; +#endif + + void Init(TThreadAllocInfo** pHead) { + memset(this, 0, sizeof(*this)); + for (auto& i : FreePtrIndex) + i = THREAD_BUF; +#ifdef _win_ + BOOL b = DuplicateHandle( + GetCurrentProcess(), GetCurrentThread(), + GetCurrentProcess(), &hThread, + 0, FALSE, DUPLICATE_SAME_ACCESS); + Y_ASSERT_NOBT(b); +#endif + pNextInfo = *pHead; + *pHead = this; + for (int k = 0; k < N_SIZES; ++k) { + int maxCount = MAX_MEM_PER_SIZE_PER_THREAD / nSizeIdxToSize[k]; + if (maxCount > THREAD_BUF) + maxCount = THREAD_BUF; + borderSizes[k] = THREAD_BUF - maxCount; + } + for (int i = 0; i < CT_MAX; ++i) { + LocalCounters[i].Init(&GlobalCounters[i]); + } +#if defined(LFALLOC_DBG) + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = LocalPerTagAllocCounters[tag][sizeIdx]; + local.Init(); + } + } +#endif + } + void Done() { + for (auto sizeIdx : FreePtrIndex) { + Y_ASSERT_NOBT(sizeIdx == THREAD_BUF); + } + for (auto& localCounter : LocalCounters) { + localCounter.Flush(); + } +#if defined(LFALLOC_DBG) + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = LocalPerTagAllocCounters[tag][sizeIdx]; + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + local.Flush(global); + } + } +#endif +#ifdef _win_ + if (hThread) + CloseHandle(hThread); +#endif + } +}; +PERTHREAD TThreadAllocInfo* pThreadInfo; +static TThreadAllocInfo* pThreadInfoList; + +static TLFLockData LFLockThreadInfo; + +static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value) { +#ifdef LFALLOC_YT + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + thr->LocalCounters[counter].Increment(value); + } else { + AtomicAdd(GlobalCounters[counter], value); + } +#endif +} + +extern "C" i64 GetLFAllocCounterFast(int counter) { +#ifdef LFALLOC_YT + return GlobalCounters[counter]; +#else + return 0; +#endif +} + +extern "C" i64 GetLFAllocCounterFull(int counter) { +#ifdef LFALLOC_YT + i64 ret = GlobalCounters[counter]; + { + TLFLockHolder ll(&LFLockThreadInfo); + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + ret += pInfo->LocalCounters[counter].Value; + p = &pInfo->pNextInfo; + } + } + return 
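+/*
+   TThreadAllocInfo::Init() above caps the per-thread cache for every size
+   class at MAX_MEM_PER_SIZE_PER_THREAD bytes and at THREAD_BUF pointers:
+   borderSizes[k] is the lowest index the downward-growing FreePtrs stack may
+   reach, so at most THREAD_BUF - borderSizes[k] blocks stay cached per thread.
+   A small worked version of that arithmetic (CacheBorder is a name invented
+   for the sketch; the defaults mirror THREAD_BUF = 256 and
+   MAX_MEM_PER_SIZE_PER_THREAD = 512 KiB):
+
+       constexpr int CacheBorder(int blockSize, int threadBuf = 256, int maxBytes = 512 * 1024) {
+           int maxCount = maxBytes / blockSize;
+           if (maxCount > threadBuf)
+               maxCount = threadBuf;
+           return threadBuf - maxCount;
+       }
+
+       // 4 KiB blocks: 512 KiB / 4 KiB = 128 cached blocks, border index 128.
+       static_assert(CacheBorder(4096) == 128, "");
+       // 16-byte blocks: the byte limit never binds, the whole 256-entry buffer may be used.
+       static_assert(CacheBorder(16) == 0, "");
+*/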
ret; +#else + return 0; +#endif +} + +static void MoveSingleThreadFreeToGlobal(TThreadAllocInfo* pInfo) { + for (int sizeIdx = 0; sizeIdx < N_SIZES; ++sizeIdx) { + int& freePtrIdx = pInfo->FreePtrIndex[sizeIdx]; + char** freePtrs = pInfo->FreePtrs[sizeIdx]; + PutBlocksToGlobalFreeList(sizeIdx, freePtrs + freePtrIdx, THREAD_BUF - freePtrIdx); + freePtrIdx = THREAD_BUF; + } +} + +#ifdef _win_ +static bool IsDeadThread(TThreadAllocInfo* pInfo) { + DWORD dwExit; + bool isDead = !GetExitCodeThread(pInfo->hThread, &dwExit) || dwExit != STILL_ACTIVE; + return isDead; +} + +static void CleanupAfterDeadThreads() { + TLFLockHolder ll(&LFLockThreadInfo); + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + if (IsDeadThread(pInfo)) { + MoveSingleThreadFreeToGlobal(pInfo); + pInfo->Done(); + *p = pInfo->pNextInfo; + SystemFree(pInfo); + } else + p = &pInfo->pNextInfo; + } +} +#endif + +#ifndef _win_ +static pthread_key_t ThreadCacheCleaner; +static void* volatile ThreadCacheCleanerStarted; // 0 = not started, -1 = started, -2 = is starting +static PERTHREAD bool IsStoppingThread; + +static void FreeThreadCache(void*) { + TThreadAllocInfo* pToDelete = nullptr; + { + TLFLockHolder ll(&LFLockThreadInfo); + pToDelete = pThreadInfo; + if (pToDelete == nullptr) + return; + + // remove from the list + for (TThreadAllocInfo** p = &pThreadInfoList; *p; p = &(*p)->pNextInfo) { + if (*p == pToDelete) { + *p = pToDelete->pNextInfo; + break; + } + } + IsStoppingThread = true; + pThreadInfo = nullptr; + } + + // free per thread buf + MoveSingleThreadFreeToGlobal(pToDelete); + pToDelete->Done(); + SystemFree(pToDelete); +} +#endif + +static void AllocThreadInfo() { +#ifndef _win_ + if (DoCas(&ThreadCacheCleanerStarted, (void*)-2, (void*)nullptr) == (void*)nullptr) { + pthread_key_create(&ThreadCacheCleaner, FreeThreadCache); + ThreadCacheCleanerStarted = (void*)-1; + } + if (ThreadCacheCleanerStarted != (void*)-1) + return; // do not use ThreadCacheCleaner until it is constructed + + { + if (IsStoppingThread) + return; + TLFLockHolder ll(&LFLockThreadInfo); + if (IsStoppingThread) // better safe than sorry + return; + + pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo)); + pThreadInfo->Init(&pThreadInfoList); + } + pthread_setspecific(ThreadCacheCleaner, (void*)-1); // without value destructor will not be called +#else + CleanupAfterDeadThreads(); + { + pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo)); + TLFLockHolder ll(&LFLockThreadInfo); + pThreadInfo->Init(&pThreadInfoList); + } +#endif +} + + ////////////////////////////////////////////////////////////////////////// + // DBG stuff + ////////////////////////////////////////////////////////////////////////// + +#if defined(LFALLOC_DBG) + +struct TAllocHeader { + uint64_t Size; + int Tag; + int Cookie; +}; + +// should be power of 2 +static_assert(sizeof(TAllocHeader) == 16); + +static inline void* GetAllocPtr(TAllocHeader* p) { + return p + 1; +} + +static inline TAllocHeader* GetAllocHeader(void* p) { + auto* header = ((TAllocHeader*)p) - 1; + if (header->Tag == DBG_ALLOC_ALIGNED_TAG) { + return (TAllocHeader*)header->Size; + } + + return header; +} + +PERTHREAD int AllocationTag; +extern "C" int SetThreadAllocTag(int tag) { + int prevTag = AllocationTag; + if (tag < DBG_ALLOC_MAX_TAG && tag >= 0) { + AllocationTag = tag; + } + return prevTag; +} + +PERTHREAD bool ProfileCurrentThread; +extern "C" bool SetProfileCurrentThread(bool newVal) { + bool prevVal = ProfileCurrentThread; + 
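+/*
+   AllocThreadInfo()/FreeThreadCache() above rely on the POSIX TLS destructor
+   mechanism: a pthread key is created once, every thread stores a non-null
+   value in it, and the key's destructor runs at thread exit to drain that
+   thread's cache back into the global free lists. The allocator rolls its own
+   one-time initialization with DoCas, presumably to stay self-contained while
+   bootstrapping; the sketch below uses plain pthread_once instead and invents
+   the names cleanupKey, OnThreadExit and EnsureThreadRegistered.
+
+       #include <pthread.h>
+
+       static pthread_key_t cleanupKey;
+
+       static void OnThreadExit(void*) {
+           // Return this thread's cached blocks to the global lists here.
+       }
+
+       static void CreateKey() {
+           pthread_key_create(&cleanupKey, OnThreadExit);
+       }
+
+       static void EnsureThreadRegistered() {
+           static pthread_once_t once = PTHREAD_ONCE_INIT;
+           pthread_once(&once, CreateKey);
+           // The destructor only fires for threads that stored a non-null value.
+           pthread_setspecific(cleanupKey, (void*)1);
+       }
+*/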
ProfileCurrentThread = newVal; + return prevVal; +} + +static volatile bool ProfileAllThreads; +extern "C" bool SetProfileAllThreads(bool newVal) { + bool prevVal = ProfileAllThreads; + ProfileAllThreads = newVal; + return prevVal; +} + +static volatile bool AllocationSamplingEnabled; +extern "C" bool SetAllocationSamplingEnabled(bool newVal) { + bool prevVal = AllocationSamplingEnabled; + AllocationSamplingEnabled = newVal; + return prevVal; +} + +static size_t AllocationSampleRate = 1000; +extern "C" size_t SetAllocationSampleRate(size_t newVal) { + size_t prevVal = AllocationSampleRate; + AllocationSampleRate = newVal; + return prevVal; +} + +static size_t AllocationSampleMaxSize = N_MAX_FAST_SIZE; +extern "C" size_t SetAllocationSampleMaxSize(size_t newVal) { + size_t prevVal = AllocationSampleMaxSize; + AllocationSampleMaxSize = newVal; + return prevVal; +} + +using TAllocationCallback = int(int tag, size_t size, int sizeIdx); +static TAllocationCallback* AllocationCallback; +extern "C" TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) { + TAllocationCallback* prevVal = AllocationCallback; + AllocationCallback = newVal; + return prevVal; +} + +using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx); +static TDeallocationCallback* DeallocationCallback; +extern "C" TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) { + TDeallocationCallback* prevVal = DeallocationCallback; + DeallocationCallback = newVal; + return prevVal; +} + +PERTHREAD TAtomic AllocationsCount; +PERTHREAD bool InAllocationCallback; + +static const int DBG_ALLOC_INVALID_COOKIE = -1; +static inline int SampleAllocation(TAllocHeader* p, int sizeIdx) { + int cookie = DBG_ALLOC_INVALID_COOKIE; + if (AllocationSamplingEnabled && (ProfileCurrentThread || ProfileAllThreads) && !InAllocationCallback) { + if (p->Size > AllocationSampleMaxSize || ++AllocationsCount % AllocationSampleRate == 0) { + if (AllocationCallback) { + InAllocationCallback = true; + cookie = AllocationCallback(p->Tag, p->Size, sizeIdx); + InAllocationCallback = false; + } + } + } + return cookie; +} + +static inline void SampleDeallocation(TAllocHeader* p, int sizeIdx) { + if (p->Cookie != DBG_ALLOC_INVALID_COOKIE && !InAllocationCallback) { + if (DeallocationCallback) { + InAllocationCallback = true; + DeallocationCallback(p->Cookie, p->Tag, p->Size, sizeIdx); + InAllocationCallback = false; + } + } +} + +static inline void TrackPerTagAllocation(TAllocHeader* p, int sizeIdx) { + if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) { + Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES); + auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx]; + + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx]; + local.Alloc(global, p->Size); + } else { + global.Alloc(p->Size); + } + } +} + +static inline void TrackPerTagDeallocation(TAllocHeader* p, int sizeIdx) { + if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) { + Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES); + auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx]; + + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx]; + local.Free(global, p->Size); + } else { + global.Free(p->Size); + } + } +} + +static void* TrackAllocation(void* ptr, size_t size, int sizeIdx) { + TAllocHeader* p = (TAllocHeader*)ptr; + p->Size = size; + p->Tag = AllocationTag; + p->Cookie = SampleAllocation(p, sizeIdx); + TrackPerTagAllocation(p, 
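+/*
+   SampleAllocation() above reports an allocation to the registered callback
+   when it is larger than AllocationSampleMaxSize or when it is the N-th
+   allocation of the current thread (N = AllocationSampleRate); the
+   InAllocationCallback flag suppresses recursion while the callback itself
+   allocates, and the returned cookie is stored in the header so the matching
+   deallocation can be reported later. The predicate boils down to the sketch
+   below (ShouldSample is an invented name, the thread_local counter stands in
+   for the PERTHREAD AllocationsCount, and sampleRate is assumed non-zero):
+
+       #include <cstddef>
+
+       static bool ShouldSample(size_t size, size_t sampleMaxSize, size_t sampleRate) {
+           thread_local size_t allocationsCount = 0;
+           return size > sampleMaxSize || ++allocationsCount % sampleRate == 0;
+       }
+*/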
sizeIdx); + return GetAllocPtr(p); +} + +static void TrackDeallocation(void* ptr, int sizeIdx) { + TAllocHeader* p = (TAllocHeader*)ptr; + SampleDeallocation(p, sizeIdx); + TrackPerTagDeallocation(p, sizeIdx); +} + +struct TPerTagAllocInfo { + ssize_t Count; + ssize_t Size; +}; + +extern "C" void GetPerTagAllocInfo( + bool flushPerThreadCounters, + TPerTagAllocInfo* info, + int& maxTag, + int& numSizes) { + maxTag = DBG_ALLOC_MAX_TAG; + numSizes = DBG_ALLOC_NUM_SIZES; + + if (info) { + if (flushPerThreadCounters) { + TLFLockHolder ll(&LFLockThreadInfo); + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = pInfo->LocalPerTagAllocCounters[tag][sizeIdx]; + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + local.Flush(global); + } + } + p = &pInfo->pNextInfo; + } + } + + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + auto& res = info[tag * DBG_ALLOC_NUM_SIZES + sizeIdx]; + res.Count = global.Count; + res.Size = global.Size; + } + } + } +} + +#endif // LFALLOC_DBG + +////////////////////////////////////////////////////////////////////////// +static Y_FORCE_INLINE void* LFAllocImpl(size_t _nSize) { +#if defined(LFALLOC_DBG) + size_t size = _nSize; + _nSize += sizeof(TAllocHeader); +#endif + + IncrementCounter(CT_USER_ALLOC, _nSize); + + int nSizeIdx; + if (_nSize > 512) { + if (_nSize > N_MAX_FAST_SIZE) { + void* ptr = LargeBlockAlloc(_nSize, CT_LARGE_ALLOC); +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, N_SIZES); +#endif + return ptr; + } + nSizeIdx = size2idxArr2[(_nSize - 1) >> 8]; + } else + nSizeIdx = size2idxArr1[1 + (((int)_nSize - 1) >> 3)]; + + IncrementCounter(CT_SMALL_ALLOC, nSizeIdxToSize[nSizeIdx]); + + // check per thread buffer + TThreadAllocInfo* thr = pThreadInfo; + if (!thr) { + AllocThreadInfo(); + thr = pThreadInfo; + if (!thr) { + void* ptr = LFAllocNoCache(nSizeIdx, MEM_DEFRAG); +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } + } + { + int& freePtrIdx = thr->FreePtrIndex[nSizeIdx]; + if (freePtrIdx < THREAD_BUF) { + void* ptr = thr->FreePtrs[nSizeIdx][freePtrIdx++]; +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } + + // try to alloc from global free list + char* buf[FL_GROUP_SIZE]; + int count = TakeBlocksFromGlobalFreeList(nSizeIdx, buf); + if (count == 0) { + count = LFAllocNoCacheMultiple(nSizeIdx, buf); + if (count == 0) { + NMalloc::AbortFromCorruptedAllocator("no way LFAllocNoCacheMultiple() can fail"); + } + } + char** dstBuf = thr->FreePtrs[nSizeIdx] + freePtrIdx - 1; + for (int i = 0; i < count - 1; ++i) + dstBuf[-i] = buf[i]; + freePtrIdx -= count - 1; + void* ptr = buf[count - 1]; +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } +} + +static Y_FORCE_INLINE void* LFAlloc(size_t _nSize) { + void* res = LFAllocImpl(_nSize); +#ifdef DBG_FILL_MEMORY + if (FillMemoryOnAllocation && res && (_nSize <= DBG_FILL_MAX_SIZE)) { + memset(res, 0xcf, _nSize); + } +#endif + return res; +} + +static Y_FORCE_INLINE void LFFree(void* p) { +#if defined(LFALLOC_DBG) + if (p == nullptr) + return; + p = GetAllocHeader(p); +#endif + + uintptr_t chkOffset = ((char*)p - ALLOC_START) - 1ll; + if (chkOffset >= 
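+/*
+   LFAllocImpl() above maps the requested size onto a size class with two
+   lookup tables: requests up to 512 bytes are bucketed at 8-byte granularity
+   through size2idxArr1, larger fast requests at 256-byte granularity through
+   size2idxArr2, and anything above N_MAX_FAST_SIZE goes straight to the
+   mmap-backed large-block path. Only the indexing arithmetic is shown below
+   (the table contents are defined elsewhere in this file); the function names
+   are invented for the sketch.
+
+       #include <cstddef>
+
+       // Index into size2idxArr1 for sizes 1..512 (8-byte steps).
+       static size_t SmallTableIndex(size_t size) {
+           return 1 + ((size - 1) >> 3);
+       }
+
+       // Index into size2idxArr2 for sizes 513..N_MAX_FAST_SIZE (256-byte steps).
+       static size_t BigTableIndex(size_t size) {
+           return (size - 1) >> 8;
+       }
+
+       // Examples: size 100  -> SmallTableIndex == 1 + (99 >> 3) == 13
+       //           size 2000 -> BigTableIndex  == 1999 >> 8      == 7
+*/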
N_MAX_WORKSET_SIZE) { + if (p == nullptr) + return; +#if defined(LFALLOC_DBG) + TrackDeallocation(p, N_SIZES); +#endif + LargeBlockFree(p, CT_LARGE_FREE); + return; + } + + uintptr_t chunk = ((char*)p - ALLOC_START) / N_CHUNK_SIZE; + ptrdiff_t nSizeIdx = chunkSizeIdx[chunk]; + if (nSizeIdx <= 0) { +#if defined(LFALLOC_DBG) + TrackDeallocation(p, N_SIZES); +#endif + LargeBlockFree(p, CT_LARGE_FREE); + return; + } + +#if defined(LFALLOC_DBG) + TrackDeallocation(p, nSizeIdx); +#endif + +#ifdef DBG_FILL_MEMORY + memset(p, 0xfe, nSizeIdxToSize[nSizeIdx]); +#endif + + IncrementCounter(CT_SMALL_FREE, nSizeIdxToSize[nSizeIdx]); + + // try to store info to per thread buf + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + int& freePtrIdx = thr->FreePtrIndex[nSizeIdx]; + if (freePtrIdx > borderSizes[nSizeIdx]) { + thr->FreePtrs[nSizeIdx][--freePtrIdx] = (char*)p; + return; + } + + // move several pointers to global free list + int freeCount = FL_GROUP_SIZE; + if (freeCount > THREAD_BUF - freePtrIdx) + freeCount = THREAD_BUF - freePtrIdx; + char** freePtrs = thr->FreePtrs[nSizeIdx]; + PutBlocksToGlobalFreeList(nSizeIdx, freePtrs + freePtrIdx, freeCount); + freePtrIdx += freeCount; + + freePtrs[--freePtrIdx] = (char*)p; + + } else { + AllocThreadInfo(); + PutBlocksToGlobalFreeList(nSizeIdx, (char**)&p, 1); + } +} + +static size_t LFGetSize(const void* p) { +#if defined(LFALLOC_DBG) + if (p == nullptr) + return 0; + return GetAllocHeader(const_cast<void*>(p))->Size; +#endif + + uintptr_t chkOffset = ((const char*)p - ALLOC_START); + if (chkOffset >= N_MAX_WORKSET_SIZE) { + if (p == nullptr) + return 0; + return TLargeBlk::As(p)->Pages * 4096ll; + } + uintptr_t chunk = ((const char*)p - ALLOC_START) / N_CHUNK_SIZE; + ptrdiff_t nSizeIdx = chunkSizeIdx[chunk]; + if (nSizeIdx <= 0) + return TLargeBlk::As(p)->Pages * 4096ll; + return nSizeIdxToSize[nSizeIdx]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Output mem alloc stats +const int N_PAGE_SIZE = 4096; +static void DebugTraceMMgr(const char* pszFormat, ...) 
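+/*
+   LFFree()/LFGetSize() above classify a pointer with one unsigned range
+   check: the offset from ALLOC_START wraps around for pointers below the
+   arena (including nullptr), so anything outside
+   [ALLOC_START, ALLOC_START + N_MAX_WORKSET_SIZE) takes the large/foreign
+   path, and inside the arena chunkSizeIdx[offset / N_CHUNK_SIZE] separates
+   small blocks from large ones. A sketch of that decision with invented names
+   (ClassifyPointer, EBlockKind) and the constants passed in as parameters:
+
+       #include <cstdint>
+
+       enum class EBlockKind { Outside, Large, Small };
+
+       static EBlockKind ClassifyPointer(const char* p, const char* allocStart,
+                                         uintptr_t worksetSize, uintptr_t chunkSize,
+                                         const char* chunkSizeIdx) {
+           uintptr_t offset = static_cast<uintptr_t>(p - allocStart);
+           if (offset >= worksetSize)
+               return EBlockKind::Outside;  // nullptr or memory not carved from the chunk arena
+           uintptr_t chunk = offset / chunkSize;
+           return chunkSizeIdx[chunk] <= 0 ? EBlockKind::Large : EBlockKind::Small;
+       }
+*/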
// __cdecl +{ + static char buff[20000]; + va_list va; + // + va_start(va, pszFormat); + vsprintf(buff, pszFormat, va); + va_end(va); +// +#ifdef _win_ + OutputDebugStringA(buff); +#else + fputs(buff, stderr); +#endif +} + +struct TChunkStats { + char *Start, *Finish; + i64 Size; + char* Entries; + i64 FreeCount; + + TChunkStats(size_t chunk, i64 size, char* entries) + : Size(size) + , Entries(entries) + , FreeCount(0) + { + Start = ALLOC_START + chunk * N_CHUNK_SIZE; + Finish = Start + N_CHUNK_SIZE; + } + void CheckBlock(char* pBlock) { + if (pBlock && pBlock >= Start && pBlock < Finish) { + ++FreeCount; + i64 nShift = pBlock - Start; + i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1); + Entries[nOffsetInStep / Size] = 1; + } + } + void SetGlobalFree(char* ptr) { + i64 nShift = ptr - Start; + i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1); + while (nOffsetInStep + Size <= N_CHUNK_SIZE) { + ++FreeCount; + Entries[nOffsetInStep / Size] = 1; + nOffsetInStep += Size; + } + } +}; + +static void DumpMemoryBlockUtilizationLocked() { + TFreeListGroup* wholeLists[N_SIZES]; + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) { + wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList(); + } + char* bfList = (char*)blockFreeList.GetWholeList(); + + DebugTraceMMgr("memory blocks utilisation stats:\n"); + i64 nTotalAllocated = 0, nTotalFree = 0, nTotalBadPages = 0, nTotalPages = 0, nTotalUsed = 0, nTotalLocked = 0; + i64 nTotalGroupBlocks = 0; + char* entries; + entries = (char*)SystemAlloc((N_CHUNK_SIZE / 4)); + for (size_t k = 0; k < N_CHUNKS; ++k) { + if (chunkSizeIdx[k] <= 0) { + if (chunkSizeIdx[k] == -1) + nTotalLocked += N_CHUNK_SIZE; + continue; + } + i64 nSizeIdx = chunkSizeIdx[k]; + i64 nSize = nSizeIdxToSize[nSizeIdx]; + TChunkStats cs(k, nSize, entries); + int nEntriesTotal = N_CHUNK_SIZE / nSize; + memset(entries, 0, nEntriesTotal); + for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) { + for (auto& ptr : g->Ptrs) + cs.CheckBlock(ptr); + } + TChunkStats csGB(k, nSize, entries); + if (nSizeIdx == FREE_LIST_GROUP_SIZEIDX) { + for (auto g : wholeLists) { + for (; g; g = g->Next) + csGB.CheckBlock((char*)g); + } + for (char* blk = bfList; blk; blk = *(char**)blk) + csGB.CheckBlock(blk); + nTotalGroupBlocks += csGB.FreeCount * nSize; + } + if (((globalCurrentPtr[nSizeIdx] - ALLOC_START) / N_CHUNK_SIZE) == k) + cs.SetGlobalFree(globalCurrentPtr[nSizeIdx]); + nTotalUsed += (nEntriesTotal - cs.FreeCount - csGB.FreeCount) * nSize; + + char pages[N_CHUNK_SIZE / N_PAGE_SIZE]; + memset(pages, 0, sizeof(pages)); + for (int i = 0, nShift = 0; i < nEntriesTotal; ++i, nShift += nSize) { + int nBit = 0; + if (entries[i]) + nBit = 1; // free entry + else + nBit = 2; // used entry + for (i64 nDelta = nSize - 1; nDelta >= 0; nDelta -= N_PAGE_SIZE) + pages[(nShift + nDelta) / N_PAGE_SIZE] |= nBit; + } + i64 nBadPages = 0; + for (auto page : pages) { + nBadPages += page == 3; + nTotalPages += page != 1; + } + DebugTraceMMgr("entry = %lld; size = %lld; free = %lld; system %lld; utilisation = %g%%, fragmentation = %g%%\n", + k, nSize, cs.FreeCount * nSize, csGB.FreeCount * nSize, + (N_CHUNK_SIZE - cs.FreeCount * nSize) * 100.0f / N_CHUNK_SIZE, 100.0f * nBadPages / Y_ARRAY_SIZE(pages)); + nTotalAllocated += N_CHUNK_SIZE; + nTotalFree += cs.FreeCount * nSize; + nTotalBadPages += nBadPages; + } + SystemFree(entries); + DebugTraceMMgr("Total allocated = %llu, free = %lld, system = %lld, locked for future use %lld, utilisation = %g, fragmentation = %g\n", + nTotalAllocated, 
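+/*
+   The two percentages printed per chunk above are: utilisation, the share of
+   the chunk still occupied by live blocks, and fragmentation, the share of
+   4 KiB pages that contain both free and used blocks (such pages cannot be
+   returned to the OS even though part of them is free). A compact restatement
+   of that arithmetic with invented names (ComputeChunkStats,
+   TChunkUtilization) and purely illustrative numbers:
+
+       #include <cstddef>
+
+       struct TChunkUtilization {
+           double UtilizationPct;   // share of the chunk occupied by live blocks
+           double FragmentationPct; // share of pages mixing free and used blocks
+       };
+
+       static TChunkUtilization ComputeChunkStats(size_t chunkSize, size_t blockSize,
+                                                  size_t freeBlocks, size_t badPages, size_t pages) {
+           TChunkUtilization res;
+           res.UtilizationPct = 100.0 * (chunkSize - freeBlocks * blockSize) / chunkSize;
+           res.FragmentationPct = 100.0 * badPages / pages;
+           return res;
+       }
+
+       // Example: a 64 KiB chunk of 1 KiB blocks with 24 blocks free and 5 of
+       // its 16 pages holding a mix of free and used blocks:
+       //   utilisation   = 100 * (65536 - 24576) / 65536 = 62.5%
+       //   fragmentation = 100 * 5 / 16                  = 31.25%
+*/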
nTotalFree, nTotalGroupBlocks, nTotalLocked, + 100.0f * (nTotalAllocated - nTotalFree) / nTotalAllocated, 100.0f * nTotalBadPages / nTotalPages); + DebugTraceMMgr("Total %lld bytes used, %lld bytes in used pages\n", nTotalUsed, nTotalPages * N_PAGE_SIZE); + + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) + globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]); + blockFreeList.ReturnWholeList(bfList); +} + +void FlushThreadFreeList() { + if (pThreadInfo) + MoveSingleThreadFreeToGlobal(pThreadInfo); +} + +void DumpMemoryBlockUtilization() { + // move current thread free to global lists to get better statistics + FlushThreadFreeList(); + { + TLFLockHolder ls(&LFGlobalLock); + DumpMemoryBlockUtilizationLocked(); + } +} + +////////////////////////////////////////////////////////////////////////// +// malloc api + +static bool LFAlloc_SetParam(const char* param, const char* value) { + if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE")) { + LB_LIMIT_TOTAL_SIZE = atoi(value); + return true; + } + if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE_BYTES")) { + LB_LIMIT_TOTAL_SIZE = (atoi(value) + N_PAGE_SIZE - 1) / N_PAGE_SIZE; + return true; + } +#ifdef DBG_FILL_MEMORY + if (!strcmp(param, "FillMemoryOnAllocation")) { + FillMemoryOnAllocation = !strcmp(value, "true"); + return true; + } +#endif + if (!strcmp(param, "TransparentHugePages")) { + TransparentHugePages = !strcmp(value, "true"); + return true; + } + if (!strcmp(param, "MapHugeTLB")) { + MapHugeTLB = !strcmp(value, "true"); + return true; + } + if (!strcmp(param, "EnableDefrag")) { + EnableDefrag = !strcmp(value, "true"); + return true; + } + return false; +}; + +static const char* LFAlloc_GetParam(const char* param) { + struct TParam { + const char* Name; + const char* Value; + }; + + static const TParam Params[] = { + {"GetLFAllocCounterFast", (const char*)&GetLFAllocCounterFast}, + {"GetLFAllocCounterFull", (const char*)&GetLFAllocCounterFull}, +#if defined(LFALLOC_DBG) + {"SetThreadAllocTag", (const char*)&SetThreadAllocTag}, + {"SetProfileCurrentThread", (const char*)&SetProfileCurrentThread}, + {"SetProfileAllThreads", (const char*)&SetProfileAllThreads}, + {"SetAllocationSamplingEnabled", (const char*)&SetAllocationSamplingEnabled}, + {"SetAllocationSampleRate", (const char*)&SetAllocationSampleRate}, + {"SetAllocationSampleMaxSize", (const char*)&SetAllocationSampleMaxSize}, + {"SetAllocationCallback", (const char*)&SetAllocationCallback}, + {"SetDeallocationCallback", (const char*)&SetDeallocationCallback}, + {"GetPerTagAllocInfo", (const char*)&GetPerTagAllocInfo}, +#endif // LFALLOC_DBG + }; + + for (int i = 0; i < Y_ARRAY_SIZE(Params); ++i) { + if (strcmp(param, Params[i].Name) == 0) { + return Params[i].Value; + } + } + return nullptr; +} + +static Y_FORCE_INLINE int LFPosixMemalign(void** memptr, size_t alignment, size_t size) { + if (Y_UNLIKELY(alignment > 4096)) { + const char* error = "Larger alignment are not guaranteed with this implementation\n"; +#ifdef _win_ + OutputDebugStringA(error); +#endif + NMalloc::AbortFromCorruptedAllocator(error); + } + size_t bigsize = size; + if (bigsize <= alignment) { + bigsize = alignment; + } else if (bigsize < 2 * alignment) { + bigsize = 2 * alignment; + } +#if defined(LFALLOC_DBG) + if (alignment > sizeof(TAllocHeader)) { + bigsize += alignment; + } +#endif + + *memptr = LFAlloc(bigsize); + +#if defined(LFALLOC_DBG) + if (alignment > sizeof(TAllocHeader)) { + // memptr isn't aligned due to alloc header + const auto* header = GetAllocHeader(*memptr); + *memptr = (void*)((const 
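+/*
+   LFAlloc_SetParam()/LFAlloc_GetParam() above are reached through the generic
+   malloc API rather than being linked against directly: SetParam takes plain
+   string pairs such as "EnableDefrag"/"true", while GetParam hands the debug
+   hooks back as const char* values that the caller casts to the right
+   function type. The sketch below assumes the NMalloc::MallocInfo() accessor
+   and the TMallocInfo::GetParam member from library/cpp/malloc/api behave as
+   described; SwitchAllocTag is a name invented for the sketch.
+
+       #include <library/cpp/malloc/api/malloc.h>
+
+       using TSetThreadAllocTag = int(int tag);
+
+       static int SwitchAllocTag(int tag) {
+           NMalloc::TMallocInfo info = NMalloc::MallocInfo();
+           if (!info.GetParam)
+               return 0;
+           const char* raw = info.GetParam("SetThreadAllocTag"); // function pointer disguised as a string
+           if (!raw)
+               return 0;                                         // allocator built without LFALLOC_DBG
+           auto* fn = reinterpret_cast<TSetThreadAllocTag*>(const_cast<char*>(raw));
+           return fn(tag);                                       // returns the previous tag
+       }
+*/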
char*) (*memptr) + alignment - sizeof(TAllocHeader)); + + // make fake header to retrieve original header ptr on dealloc + auto* next = GetAllocHeader(*memptr); + next->Tag = DBG_ALLOC_ALIGNED_TAG; + next->Size = (uint64_t)header; + next->Cookie = 0; + } +#endif + + Y_ASSERT_NOBT((intptr_t)*memptr % alignment == 0); + return 0; +} + +static Y_FORCE_INLINE void* LFVAlloc(size_t size) { + const size_t pg = N_PAGE_SIZE; + void* p = nullptr; + +#if defined(LFALLOC_DBG) + LFPosixMemalign(&p, pg, size); +#else + size_t bigsize = (size + pg - 1) & (~(pg - 1)); + p = LFAlloc(bigsize); +#endif + + Y_ASSERT_NOBT((intptr_t)p % N_PAGE_SIZE == 0); + return p; +} + +#endif diff --git a/library/cpp/lfalloc/ya.make b/library/cpp/lfalloc/ya.make new file mode 100644 index 0000000000..cace05f9d8 --- /dev/null +++ b/library/cpp/lfalloc/ya.make @@ -0,0 +1,25 @@ +LIBRARY() + +OWNER(gulin) + +NO_UTIL() + +NO_COMPILER_WARNINGS() + +IF (ARCH_AARCH64) + PEERDIR( + contrib/libs/jemalloc + ) +ELSE() + SRCS( + lf_allocX64.cpp + ) +ENDIF() + +PEERDIR( + library/cpp/malloc/api +) + +SET(IDE_FOLDER "util") + +END() diff --git a/library/cpp/lfalloc/yt/ya.make b/library/cpp/lfalloc/yt/ya.make new file mode 100644 index 0000000000..8c1a4f8a72 --- /dev/null +++ b/library/cpp/lfalloc/yt/ya.make @@ -0,0 +1,29 @@ +LIBRARY() + +OWNER(a-romanov) + +NO_UTIL() + +NO_COMPILER_WARNINGS() + +IF (ARCH_AARCH64) + PEERDIR( + contrib/libs/jemalloc + ) +ELSE() + IF ("${YMAKE}" MATCHES "devtools") + CFLAGS(-DYMAKE=1) + ENDIF() + CXXFLAGS(-DLFALLOC_YT) + SRCS( + ../lf_allocX64.cpp + ) +ENDIF() + +PEERDIR( + library/cpp/malloc/api +) + +SET(IDE_FOLDER "util") + +END() |
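Consumers normally do not PEERDIR these libraries by hand: in Arcadia-style builds the allocator for a binary is usually selected with the ALLOCATOR() macro, which is assumed here (from Arcadia build conventions, not from this change) to resolve LF to library/cpp/lfalloc, LF_YT to the LFALLOC_YT build under yt/ and LF_DBG to the debug build under dbg/. A hypothetical consumer would look like:

    PROGRAM()

    SRCS(
        main.cpp
    )

    ALLOCATOR(LF)

    END()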