author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/lfalloc/lf_allocX64.h
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/lfalloc/lf_allocX64.h')
-rw-r--r-- | library/cpp/lfalloc/lf_allocX64.h | 1926
1 file changed, 1926 insertions, 0 deletions
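The listing below is the new allocator header in full. For orientation: every small request is rounded up to one of N_SIZES fixed size classes through two lookup tables, and all further bookkeeping is done per class. A minimal sketch of that mapping, using the table names from the listing (the standalone helper is hypothetical, added here only for illustration):

    // Hypothetical helper mirroring the class lookup in LFAllocImpl() below.
    static int SizeToIdx(size_t nSize) {
        if (nSize > 512)
            return size2idxArr2[(nSize - 1) >> 8];        // 512 < nSize <= N_MAX_FAST_SIZE
        return size2idxArr1[1 + (((int)nSize - 1) >> 3)]; // nSize <= 512
    }
    // Example: SizeToIdx(1000) == 14 and nSizeIdxToSize[14] == 1024,
    // so a 1000-byte request is served from the 1024-byte class.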
diff --git a/library/cpp/lfalloc/lf_allocX64.h b/library/cpp/lfalloc/lf_allocX64.h
new file mode 100644
index 0000000000..fd2a906d6f
--- /dev/null
+++ b/library/cpp/lfalloc/lf_allocX64.h
@@ -0,0 +1,1926 @@
#pragma once

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>

#include <library/cpp/malloc/api/malloc.h>

#include <util/system/compat.h>
#include <util/system/compiler.h>
#include <util/system/types.h>

#ifdef _MSC_VER
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#ifdef _M_X64
#define _64_
#endif
#include <intrin.h>
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#pragma intrinsic(_InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd)

#include <new>
#include <assert.h>
#include <errno.h>

#define PERTHREAD __declspec(thread)
#define _win_
#define Y_FORCE_INLINE __forceinline

using TAtomic = volatile long;

static inline long AtomicAdd(TAtomic& a, long b) {
    return _InterlockedExchangeAdd(&a, b) + b;
}

static inline long AtomicSub(TAtomic& a, long b) {
    return AtomicAdd(a, -b);
}

#pragma comment(lib, "synchronization.lib")

#ifndef NDEBUG
#define Y_ASSERT_NOBT(x)           \
    {                              \
        if (IsDebuggerPresent()) { \
            if (!(x))              \
                __debugbreak();    \
        } else                     \
            assert(x);             \
    }
#else
#define Y_ASSERT_NOBT(x) ((void)0)
#endif

#else

#include <util/system/defaults.h>
#include <util/system/atomic.h>
#include <util/system/yassert.h>

#if !defined(NDEBUG) && !defined(__GCCXML__)
#define Y_ASSERT_NOBT(a)                                    \
    do {                                                    \
        try {                                               \
            if (Y_UNLIKELY(!(a))) {                         \
                if (YaIsDebuggerPresent())                  \
                    __debugbreak();                         \
                else {                                      \
                    assert(false && (a));                   \
                }                                           \
            }                                               \
        } catch (...) {                                     \
            if (YaIsDebuggerPresent())                      \
                __debugbreak();                             \
            else {                                          \
                assert(false && "Exception during assert"); \
            }                                               \
        }                                                   \
    } while (0)
#else
#define Y_ASSERT_NOBT(a)                       \
    do {                                       \
        if (false) {                           \
            bool __xxx = static_cast<bool>(a); \
            Y_UNUSED(__xxx);                   \
        }                                      \
    } while (0)
#endif

#include <pthread.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <memory.h>
#include <new>
#include <errno.h>

#if defined(_linux_)
#include <linux/futex.h>
#include <sys/syscall.h>
#if !defined(MADV_HUGEPAGE)
#define MADV_HUGEPAGE 14
#endif
#if !defined(MAP_HUGETLB)
#define MAP_HUGETLB 0x40000
#endif
#endif

#define PERTHREAD __thread

#endif

#ifndef _darwin_

#ifndef Y_ARRAY_SIZE
#define Y_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#endif

#ifndef NDEBUG
#define DBG_FILL_MEMORY
static bool FillMemoryOnAllocation = true;
#endif

static bool TransparentHugePages = false; // force MADV_HUGEPAGE for large allocs
static bool MapHugeTLB = false;           // force MAP_HUGETLB for small allocs
static bool EnableDefrag = true;

// Buffers that are larger than this size will not be filled with 0xcf
#ifndef DBG_FILL_MAX_SIZE
#define DBG_FILL_MAX_SIZE 0x01000000000000ULL
#endif

template <class T>
inline T* DoCas(T* volatile* target, T* exchange, T* compare) {
#if defined(__has_builtin) && __has_builtin(__sync_val_compare_and_swap)
    return __sync_val_compare_and_swap(target, compare, exchange);
#elif defined(_WIN32)
#ifdef _64_
    return (T*)_InterlockedCompareExchange64((__int64*)target, (__int64)exchange, (__int64)compare);
#else
    //return (T*)InterlockedCompareExchangePointer(targetVoidP, exchange, compare);
    return (T*)_InterlockedCompareExchange((LONG*)target, (LONG)exchange, (LONG)compare);
#endif
#elif defined(__i386) || defined(__x86_64__)
    union {
        T* volatile* NP;
        void* volatile* VoidP;
    } gccSucks;
    gccSucks.NP = target;
    void* volatile* targetVoidP = gccSucks.VoidP;

    __asm__ __volatile__(
        "lock\n\t"
        "cmpxchg %2,%0\n\t"
        : "+m"(*(targetVoidP)), "+a"(compare)
        : "r"(exchange)
        : "cc", "memory");
    return compare;
#else
#error inline_cas not defined for this platform
#endif
}
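// Note on DoCas() semantics: it returns the value observed in *target, so the
// swap succeeded iff the returned pointer equals 'compare'. The lock-free
// structures below are built on this single primitive.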
#ifdef _64_
const uintptr_t N_MAX_WORKSET_SIZE = 0x100000000ll * 200;
const uintptr_t N_HUGE_AREA_FINISH = 0x700000000000ll;
#ifndef _freebsd_
const uintptr_t LINUX_MMAP_AREA_START = 0x100000000ll;
static uintptr_t volatile linuxAllocPointer = LINUX_MMAP_AREA_START;
static uintptr_t volatile linuxAllocPointerHuge = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE;
#endif
#else
const uintptr_t N_MAX_WORKSET_SIZE = 0xffffffff;
#endif
#define ALLOC_START ((char*)0)

const size_t N_CHUNK_SIZE = 1024 * 1024;
const size_t N_CHUNKS = N_MAX_WORKSET_SIZE / N_CHUNK_SIZE;
const size_t N_LARGE_ALLOC_SIZE = N_CHUNK_SIZE * 128;

// map size idx to size in bytes
#ifdef LFALLOC_YT
const int N_SIZES = 27;
#else
const int N_SIZES = 25;
#endif
const int nSizeIdxToSize[N_SIZES] = {
    -1,
#if defined(_64_)
    16, 16, 32, 32, 48, 64, 96, 128,
#else
    8,
    16,
    24,
    32,
    48,
    64,
    96,
    128,
#endif
    192, 256, 384, 512, 768, 1024, 1536, 2048,
    3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768,
#ifdef LFALLOC_YT
    49152, 65536
#endif
};
#ifdef LFALLOC_YT
const size_t N_MAX_FAST_SIZE = 65536;
#else
const size_t N_MAX_FAST_SIZE = 32768;
#endif
const unsigned char size2idxArr1[64 + 1] = {
    1,
#if defined(_64_)
    2, 2, 4, 4, // 16, 16, 32, 32
#else
    1, 2, 3, 4, // 8, 16, 24, 32
#endif
    5, 5, 6, 6,                                                     // 48, 64
    7, 7, 7, 7, 8, 8, 8, 8,                                         // 96, 128
    9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10,         // 192, 256
    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // 384
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12  // 512
};
#ifdef LFALLOC_YT
const unsigned char size2idxArr2[256] = {
#else
const unsigned char size2idxArr2[128] = {
#endif
    12, 12, 13, 14, // 512, 512, 768, 1024
    15, 15, 16, 16, // 1536, 2048
    17, 17, 17, 17, 18, 18, 18, 18,                                 // 3072, 4096
    19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, // 6144, 8192
    21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, // 12288
    22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, // 16384
    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, // 24576
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, // 32768
#ifdef LFALLOC_YT
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, // 49152
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, // 65536
#endif
};

// map entry number to size idx
// special size idx's: 0 = not used, -1 = mem locked, but not allocated
static volatile char chunkSizeIdx[N_CHUNKS];
const int FREE_CHUNK_ARR_BUF = 0x20000; // this is effectively 128G of free memory (with 1M chunks), should not be exhausted actually
static volatile uintptr_t freeChunkArr[FREE_CHUNK_ARR_BUF];
static volatile int freeChunkCount;

static void AddFreeChunk(uintptr_t chunkId) {
    chunkSizeIdx[chunkId] = -1;
    if (Y_UNLIKELY(freeChunkCount == FREE_CHUNK_ARR_BUF))
        NMalloc::AbortFromCorruptedAllocator("free chunks array overflowed");
    freeChunkArr[freeChunkCount++] = chunkId;
}

static bool GetFreeChunk(uintptr_t* res) {
    if (freeChunkCount == 0) {
        *res = 0;
        return false;
    }
    *res = freeChunkArr[--freeChunkCount];
    return true;
}

//////////////////////////////////////////////////////////////////////////
enum ELFAllocCounter {
    CT_USER_ALLOC,     // accumulated size requested by user code
    CT_MMAP,           // accumulated mmapped size
    CT_MMAP_CNT,       // number of mmapped regions
    CT_MUNMAP,         // accumulated unmmapped size
    CT_MUNMAP_CNT,     // number of munmaped regions
    CT_SYSTEM_ALLOC,   // accumulated allocated size for internal lfalloc needs
    CT_SYSTEM_FREE,    // accumulated deallocated size for internal lfalloc needs
    CT_SMALL_ALLOC,    // accumulated allocated size for fixed-size blocks
    CT_SMALL_FREE,     // accumulated deallocated size for fixed-size blocks
    CT_LARGE_ALLOC,    // accumulated allocated size for large blocks
    CT_LARGE_FREE,     // accumulated deallocated size for large blocks
    CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations
    CT_DEGRAGMENT_CNT, // number of memory defragmentations
    CT_MAX
};

static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value);

//////////////////////////////////////////////////////////////////////////
enum EMMapMode {
    MM_NORMAL, // memory for small allocs
    MM_HUGE    // memory for large allocs
};

#ifndef _MSC_VER
inline void VerifyMmapResult(void* result) {
    if (Y_UNLIKELY(result == MAP_FAILED))
        NMalloc::AbortFromCorruptedAllocator("negative size requested? or just out of mem");
}
#endif
or just out of mem"); +} +#endif + +#if !defined(_MSC_VER) && !defined(_freebsd_) && defined(_64_) +static char* AllocWithMMapLinuxImpl(uintptr_t sz, EMMapMode mode) { + char* volatile* areaPtr; + char* areaStart; + uintptr_t areaFinish; + + int mapProt = PROT_READ | PROT_WRITE; + int mapFlags = MAP_PRIVATE | MAP_ANON; + + if (mode == MM_HUGE) { + areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointerHuge); + areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE); + areaFinish = N_HUGE_AREA_FINISH; + } else { + areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointer); + areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START); + areaFinish = N_MAX_WORKSET_SIZE; + + if (MapHugeTLB) { + mapFlags |= MAP_HUGETLB; + } + } + + bool wrapped = false; + for (;;) { + char* prevAllocPtr = *areaPtr; + char* nextAllocPtr = prevAllocPtr + sz; + if (uintptr_t(nextAllocPtr - (char*)nullptr) >= areaFinish) { + if (Y_UNLIKELY(wrapped)) { + NMalloc::AbortFromCorruptedAllocator("virtual memory is over fragmented"); + } + // wrap after all area is used + DoCas(areaPtr, areaStart, prevAllocPtr); + wrapped = true; + continue; + } + + if (DoCas(areaPtr, nextAllocPtr, prevAllocPtr) != prevAllocPtr) + continue; + + char* largeBlock = (char*)mmap(prevAllocPtr, sz, mapProt, mapFlags, -1, 0); + VerifyMmapResult(largeBlock); + if (largeBlock == prevAllocPtr) + return largeBlock; + if (largeBlock) + munmap(largeBlock, sz); + + if (sz < 0x80000) { + // skip utilized area with big steps + DoCas(areaPtr, nextAllocPtr + 0x10 * 0x10000, nextAllocPtr); + } + } +} +#endif + +static char* AllocWithMMap(uintptr_t sz, EMMapMode mode) { + (void)mode; +#ifdef _MSC_VER + char* largeBlock = (char*)VirtualAlloc(0, sz, MEM_RESERVE, PAGE_READWRITE); + if (Y_UNLIKELY(largeBlock == nullptr)) + NMalloc::AbortFromCorruptedAllocator("out of memory"); + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE)) + NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken"); +#else +#if defined(_freebsd_) || !defined(_64_) + char* largeBlock = (char*)mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + VerifyMmapResult(largeBlock); + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE)) + NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken"); +#else + char* largeBlock = AllocWithMMapLinuxImpl(sz, mode); + if (TransparentHugePages) { + madvise(largeBlock, sz, MADV_HUGEPAGE); + } +#endif +#endif + Y_ASSERT_NOBT(largeBlock); + IncrementCounter(CT_MMAP, sz); + IncrementCounter(CT_MMAP_CNT, 1); + return largeBlock; +} + +enum class ELarge : ui8 { + Free = 0, // block in free cache + Alloc = 1, // block is allocated + Gone = 2, // block was unmapped +}; + +struct TLargeBlk { + + static TLargeBlk* As(void *raw) { + return reinterpret_cast<TLargeBlk*>((char*)raw - 4096ll); + } + + static const TLargeBlk* As(const void *raw) { + return reinterpret_cast<const TLargeBlk*>((const char*)raw - 4096ll); + } + + void SetSize(size_t bytes, size_t pages) { + Pages = pages; + Bytes = bytes; + } + + void Mark(ELarge state) { + const ui64 marks[] = { + 0x8b38aa5ca4953c98, // ELarge::Free + 0xf916d33584eb5087, // ELarge::Alloc + 0xd33b0eca7651bc3f // ELarge::Gone + }; + + Token = size_t(marks[ui8(state)]); + } + + size_t Pages; // Total pages allocated with mmap like call + size_t Bytes; // Actually requested bytes by user + size_t Token; // Block state token, see ELarge enum. 
static void LargeBlockUnmap(void* p, size_t pages) {
    const auto bytes = (pages + 1) * uintptr_t(4096);

    IncrementCounter(CT_MUNMAP, bytes);
    IncrementCounter(CT_MUNMAP_CNT, 1);
#ifdef _MSC_VER
    Y_ASSERT_NOBT(0);
#else
    TLargeBlk::As(p)->Mark(ELarge::Gone);
    munmap((char*)p - 4096ll, bytes);
#endif
}

//////////////////////////////////////////////////////////////////////////
const size_t LB_BUF_SIZE = 250;
const size_t LB_BUF_HASH = 977;
static int LB_LIMIT_TOTAL_SIZE = 500 * 1024 * 1024 / 4096; // do not keep more than this mem total in lbFreePtrs[]
static void* volatile lbFreePtrs[LB_BUF_HASH][LB_BUF_SIZE];
static TAtomic lbFreePageCount;

static void* LargeBlockAlloc(size_t _nSize, ELFAllocCounter counter) {
    size_t pgCount = (_nSize + 4095) / 4096;
#ifdef _MSC_VER
    char* pRes = (char*)VirtualAlloc(0, (pgCount + 1) * 4096ll, MEM_COMMIT, PAGE_READWRITE);
    if (Y_UNLIKELY(pRes == 0)) {
        NMalloc::AbortFromCorruptedAllocator("out of memory");
    }
#else

    IncrementCounter(counter, pgCount * 4096ll);
    IncrementCounter(CT_SYSTEM_ALLOC, 4096ll);

    int lbHash = pgCount % LB_BUF_HASH;
    for (int i = 0; i < LB_BUF_SIZE; ++i) {
        void* p = lbFreePtrs[lbHash][i];
        if (p == nullptr)
            continue;
        if (DoCas(&lbFreePtrs[lbHash][i], (void*)nullptr, p) == p) {
            size_t realPageCount = TLargeBlk::As(p)->Pages;
            if (realPageCount == pgCount) {
                AtomicAdd(lbFreePageCount, -pgCount);
                TLargeBlk::As(p)->Mark(ELarge::Alloc);
                return p;
            } else {
                if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) != (void*)nullptr) {
                    // block was freed while we were busy
                    AtomicAdd(lbFreePageCount, -realPageCount);
                    LargeBlockUnmap(p, realPageCount);
                    --i;
                }
            }
        }
    }
    char* pRes = AllocWithMMap((pgCount + 1) * 4096ll, MM_HUGE);
#endif
    pRes += 4096ll;
    TLargeBlk::As(pRes)->SetSize(_nSize, pgCount);
    TLargeBlk::As(pRes)->Mark(ELarge::Alloc);

    return pRes;
}

#ifndef _MSC_VER
static void FreeAllLargeBlockMem() {
    for (auto& lbFreePtr : lbFreePtrs) {
        for (int i = 0; i < LB_BUF_SIZE; ++i) {
            void* p = lbFreePtr[i];
            if (p == nullptr)
                continue;
            if (DoCas(&lbFreePtr[i], (void*)nullptr, p) == p) {
                int pgCount = TLargeBlk::As(p)->Pages;
                AtomicAdd(lbFreePageCount, -pgCount);
                LargeBlockUnmap(p, pgCount);
            }
        }
    }
}
#endif

static void LargeBlockFree(void* p, ELFAllocCounter counter) {
    if (p == nullptr)
        return;
#ifdef _MSC_VER
    VirtualFree((char*)p - 4096ll, 0, MEM_RELEASE);
#else
    size_t pgCount = TLargeBlk::As(p)->Pages;

    TLargeBlk::As(p)->Mark(ELarge::Free);
    IncrementCounter(counter, pgCount * 4096ll);
    IncrementCounter(CT_SYSTEM_FREE, 4096ll);

    if (lbFreePageCount > LB_LIMIT_TOTAL_SIZE)
        FreeAllLargeBlockMem();
    int lbHash = pgCount % LB_BUF_HASH;
    for (int i = 0; i < LB_BUF_SIZE; ++i) {
        if (lbFreePtrs[lbHash][i] == nullptr) {
            if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) == nullptr) {
                AtomicAdd(lbFreePageCount, pgCount);
                return;
            }
        }
    }

    LargeBlockUnmap(p, pgCount);
#endif
}

static void* SystemAlloc(size_t _nSize) {
    //HeapAlloc(GetProcessHeap(), HEAP_GENERATE_EXCEPTIONS, _nSize);
    return LargeBlockAlloc(_nSize, CT_SYSTEM_ALLOC);
}
static void SystemFree(void* p) {
    //HeapFree(GetProcessHeap(), 0, p);
    LargeBlockFree(p, CT_SYSTEM_FREE);
}

//////////////////////////////////////////////////////////////////////////
char* const LF_LOCK_FREE = ((char*)0) + 0;
char* const LF_LOCK_LOCKED = ((char*)0) + 1;
char* const LF_LOCK_FUTEX_WAIT = ((char*)0) + 2;
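// Lock state machine encoded by the sentinels above (used by TLFLockData below):
//   LF_LOCK_FREE   --TryLock() CAS-->   LF_LOCK_LOCKED
//   LF_LOCK_LOCKED --FutexWait() CAS--> LF_LOCK_FUTEX_WAIT (a waiter announces it will sleep)
//   Unlock() CASes LOCKED back to FREE; if that fails the value was FUTEX_WAIT,
//   so it stores FREE and wakes all sleepers.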
static bool LFHasFutex = true;
static bool LFCheckedWinVersion = false;

// TLFLockData has to be zero-initialized explicitly https://en.cppreference.com/w/cpp/language/zero_initialization
// otherwise the constructor TLFLockData() for the global var might be called after its first use
struct TLFLockData
{
    char* Pad1[15];
    char* volatile LockVar; // = LF_LOCK_FREE; // no constructor, zero-initialize manually
    char* Pad2[15];

    bool TryLock()
    {
        return (LockVar == LF_LOCK_FREE && DoCas(&LockVar, LF_LOCK_LOCKED, LF_LOCK_FREE) == LF_LOCK_FREE);
    }

    void FutexWait()
    {
#ifdef _win_
        if (!LFCheckedWinVersion) {
            OSVERSIONINFOA ver;
            memset(&ver, 0, sizeof(ver));
            ver.dwOSVersionInfoSize = sizeof(OSVERSIONINFOA);
            GetVersionExA(&ver);
            LFHasFutex = (ver.dwMajorVersion > 6) || (ver.dwMajorVersion == 6 && ver.dwMinorVersion >= 2);
            LFCheckedWinVersion = true;
        }
        if (LFHasFutex) {
            if (LockVar == LF_LOCK_LOCKED) {
                DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED);
            }
            if (LockVar == LF_LOCK_FUTEX_WAIT) {
                char* lockedValue = LF_LOCK_FUTEX_WAIT;
                WaitOnAddress(&LockVar, &lockedValue, sizeof(LockVar), INFINITE);
            }
        } else {
            SwitchToThread();
        }
#elif defined(_linux_)
        if (LFHasFutex) {
            if (LockVar == LF_LOCK_LOCKED) {
                DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED);
            }
            if (LockVar == LF_LOCK_FUTEX_WAIT) {
                // Linux futexes work on int-sized variables only; here we pretend the low bits of LockVar are an int
                syscall(SYS_futex, &LockVar, FUTEX_WAIT_PRIVATE, *(int*)&LF_LOCK_FUTEX_WAIT, 0, 0, 0);
            }
        } else {
            sched_yield();
        }
#else
        sched_yield();
#endif
    }

    void Unlock()
    {
        Y_ASSERT_NOBT(LockVar != LF_LOCK_FREE);
        if (DoCas(&LockVar, LF_LOCK_FREE, LF_LOCK_LOCKED) != LF_LOCK_LOCKED) {
            Y_ASSERT_NOBT(LockVar == LF_LOCK_FUTEX_WAIT && LFHasFutex);
            LockVar = LF_LOCK_FREE;
#ifdef _win_
            WakeByAddressAll((PVOID)&LockVar);
#elif defined(_linux_)
            syscall(SYS_futex, &LockVar, FUTEX_WAKE_PRIVATE, INT_MAX, 0, 0, 0);
#endif
        }
    }
};

static TLFLockData LFGlobalLock;

class TLFLockHolder {
    TLFLockData* LockData = nullptr;
    int Attempt = 0;
    int SleepMask = 0x7f;

public:
    TLFLockHolder() {}
    TLFLockHolder(TLFLockData* lk) {
        while (!TryLock(lk));
    }
    bool TryLock(TLFLockData* lk)
    {
        Y_ASSERT_NOBT(LockData == nullptr);
        if (lk->TryLock()) {
            LockData = lk;
            return true;
        }
        if ((++Attempt & SleepMask) == 0) {
            lk->FutexWait();
            SleepMask = (SleepMask * 2 + 1) & 0x7fff;
        } else {
#ifdef _MSC_VER
            _mm_pause();
#elif defined(__i386) || defined(__x86_64__)
            __asm__ __volatile__("pause");
#endif
        }
        return false;
    }
    ~TLFLockHolder() {
        if (LockData) {
            LockData->Unlock();
        }
    }
};

//////////////////////////////////////////////////////////////////////////
class TLFAllocFreeList {
    struct TNode {
        TNode* Next;
    };

    TNode* volatile Head;
    TNode* volatile Pending;
    TAtomic PendingToFreeListCounter;
    TAtomic AllocCount;
    void* Padding;

    static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) {
        for (;;) {
            TNode* volatile prevHead = *headPtr;
            n->Next = prevHead;
            if (DoCas(headPtr, n, prevHead) == prevHead)
                break;
        }
    }
    Y_FORCE_INLINE void* DoAlloc() {
        TNode* res;
        for (res = Head; res; res = Head) {
            TNode* keepNext = res->Next;
            if (DoCas(&Head, keepNext, res) == res) {
                //Y_VERIFY(keepNext == res->Next);
                break;
            }
        }
        return res;
    }
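    // Design note: Free() enqueues to Pending whenever an Alloc() is in flight and to
    // Head only when none is; Alloc() moves Pending into the free list only while it is
    // the sole allocator (AllocCount == 1) and PendingToFreeListCounter has not changed.
    // This keeps nodes from being recycled under a concurrent DoAlloc(), protecting the
    // unsynchronized res->Next read above from the ABA problem.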
    void FreeList(TNode* fl) {
        if (!fl)
            return;
        TNode* flTail = fl;
        while (flTail->Next)
            flTail = flTail->Next;
        for (;;) {
            TNode* volatile prevHead = Head;
            flTail->Next = prevHead;
            if (DoCas(&Head, fl, prevHead) == prevHead)
                break;
        }
    }

public:
    Y_FORCE_INLINE void Free(void* ptr) {
        TNode* newFree = (TNode*)ptr;
        if (AtomicAdd(AllocCount, 0) == 0)
            Enqueue(&Head, newFree);
        else
            Enqueue(&Pending, newFree);
    }
    Y_FORCE_INLINE void* Alloc() {
        TAtomic keepCounter = AtomicAdd(PendingToFreeListCounter, 0);
        TNode* fl = Pending;
        if (AtomicAdd(AllocCount, 1) == 1) {
            // No other allocs in progress.
            // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads.
            // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList
            if (fl && keepCounter == AtomicAdd(PendingToFreeListCounter, 0) && DoCas(&Pending, (TNode*)nullptr, fl) == fl) {
                // pick first element from Pending and return it
                void* res = fl;
                fl = fl->Next;
                // if there are other elements in Pending list, add them to main free list
                FreeList(fl);
                AtomicAdd(PendingToFreeListCounter, 1);
                AtomicAdd(AllocCount, -1);
                return res;
            }
        }
        void* res = DoAlloc();
        AtomicAdd(AllocCount, -1);
        return res;
    }
    void* GetWholeList() {
        TNode* res;
        for (res = Head; res; res = Head) {
            if (DoCas(&Head, (TNode*)nullptr, res) == res)
                break;
        }
        return res;
    }
    void ReturnWholeList(void* ptr) {
        while (AtomicAdd(AllocCount, 0) != 0) // theoretically can run into problems with parallel DoAlloc()
            ; //ThreadYield();
        for (;;) {
            TNode* prevHead = Head;
            if (DoCas(&Head, (TNode*)ptr, prevHead) == prevHead) {
                FreeList(prevHead);
                break;
            }
        }
    }
};

/////////////////////////////////////////////////////////////////////////
static TLFAllocFreeList globalFreeLists[N_SIZES];
static char* volatile globalCurrentPtr[N_SIZES];
static TLFAllocFreeList blockFreeList;

// globalFreeLists[] contains TFreeListGroup, each of them points up to 15 free blocks
const int FL_GROUP_SIZE = 15;
struct TFreeListGroup {
    TFreeListGroup* Next;
    char* Ptrs[FL_GROUP_SIZE];
};
#ifdef _64_
const int FREE_LIST_GROUP_SIZEIDX = 8;
#else
const int FREE_LIST_GROUP_SIZEIDX = 6;
#endif

//////////////////////////////////////////////////////////////////////////
// find free chunks and reset chunk size so they can be reused by different sized allocations
// do not look at blockFreeList (TFreeListGroup has same size for any allocations)
static bool DefragmentMem() {
    if (!EnableDefrag) {
        return false;
    }

    IncrementCounter(CT_DEGRAGMENT_CNT, 1);

    int* nFreeCount = (int*)SystemAlloc(N_CHUNKS * sizeof(int));
    if (Y_UNLIKELY(!nFreeCount)) {
        //__debugbreak();
        NMalloc::AbortFromCorruptedAllocator("debugbreak");
    }
    memset(nFreeCount, 0, N_CHUNKS * sizeof(int));

    TFreeListGroup* wholeLists[N_SIZES];
    for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) {
        wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList();
        for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) {
            for (auto pData : g->Ptrs) {
                if (pData) {
                    uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE;
                    ++nFreeCount[nChunk];
                    Y_ASSERT_NOBT(chunkSizeIdx[nChunk] == nSizeIdx);
                }
            }
        }
    }

    bool bRes = false;
    for (size_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) {
        int fc = nFreeCount[nChunk];
        nFreeCount[nChunk] = 0;
        if (chunkSizeIdx[nChunk] <= 0)
            continue;
        int nEntries = N_CHUNK_SIZE / nSizeIdxToSize[static_cast<int>(chunkSizeIdx[nChunk])];
        Y_ASSERT_NOBT(fc <= nEntries); // cannot have more free blocks than the total count
        if (fc == nEntries) {
            bRes = true;
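            // chunk is completely free; remember it so the passes below can drop its
            // pointers from the free lists, release its pages, and recycle the chunk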
            nFreeCount[nChunk] = 1;
        }
    }
    if (bRes) {
        for (auto& wholeList : wholeLists) {
            TFreeListGroup** ppPtr = &wholeList;
            while (*ppPtr) {
                TFreeListGroup* g = *ppPtr;
                int dst = 0;
                for (auto pData : g->Ptrs) {
                    if (pData) {
                        uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE;
                        if (nFreeCount[nChunk] == 0)
                            g->Ptrs[dst++] = pData; // block is not freed, keep pointer
                    }
                }
                if (dst == 0) {
                    // no valid pointers in group, free it
                    *ppPtr = g->Next;
                    blockFreeList.Free(g);
                } else {
                    // reset invalid pointers to 0
                    for (int i = dst; i < FL_GROUP_SIZE; ++i)
                        g->Ptrs[i] = nullptr;
                    ppPtr = &g->Next;
                }
            }
        }
        for (uintptr_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) {
            if (!nFreeCount[nChunk])
                continue;
            char* pStart = ALLOC_START + nChunk * N_CHUNK_SIZE;
#ifdef _win_
            VirtualFree(pStart, N_CHUNK_SIZE, MEM_DECOMMIT);
#elif defined(_freebsd_)
            madvise(pStart, N_CHUNK_SIZE, MADV_FREE);
#else
            madvise(pStart, N_CHUNK_SIZE, MADV_DONTNEED);
#endif
            AddFreeChunk(nChunk);
        }
    }

    for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx)
        globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]);

    SystemFree(nFreeCount);
    return bRes;
}

static Y_FORCE_INLINE void* LFAllocFromCurrentChunk(int nSizeIdx, int blockSize, int count) {
    char* volatile* pFreeArray = &globalCurrentPtr[nSizeIdx];
    while (char* newBlock = *pFreeArray) {
        char* nextFree = newBlock + blockSize * count;

        // check if there is space in chunk
        char* globalEndPtr = ALLOC_START + ((newBlock - ALLOC_START) & ~((uintptr_t)N_CHUNK_SIZE - 1)) + N_CHUNK_SIZE;
        if (nextFree >= globalEndPtr) {
            if (nextFree > globalEndPtr)
                break;
            nextFree = nullptr; // it was the last block in the chunk
        }
        if (DoCas(pFreeArray, nextFree, newBlock) == newBlock)
            return newBlock;
    }
    return nullptr;
}

enum EDefrag {
    MEM_DEFRAG,
    NO_MEM_DEFRAG,
};

static void* SlowLFAlloc(int nSizeIdx, int blockSize, EDefrag defrag) {
    IncrementCounter(CT_SLOW_ALLOC_CNT, 1);

    TLFLockHolder ls;
    for (;;) {
        bool locked = ls.TryLock(&LFGlobalLock);
        void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1);
        if (res) {
            return res; // might happen when another thread has allocated a new current chunk
        }
        if (locked) {
            break;
        }
    }
    for (;;) {
        uintptr_t nChunk;
        if (GetFreeChunk(&nChunk)) {
            char* newPlace = ALLOC_START + nChunk * N_CHUNK_SIZE;
#ifdef _MSC_VER
            void* pTest = VirtualAlloc(newPlace, N_CHUNK_SIZE, MEM_COMMIT, PAGE_READWRITE);
            Y_ASSERT_NOBT(pTest == newPlace);
#endif
            chunkSizeIdx[nChunk] = (char)nSizeIdx;
            globalCurrentPtr[nSizeIdx] = newPlace + blockSize;
            return newPlace;
        }

        // out of luck, try to defrag
        if (defrag == MEM_DEFRAG && DefragmentMem()) {
            continue;
        }

        char* largeBlock = AllocWithMMap(N_LARGE_ALLOC_SIZE, MM_NORMAL);
        uintptr_t addr = ((largeBlock - ALLOC_START) + N_CHUNK_SIZE - 1) & (~(N_CHUNK_SIZE - 1));
        uintptr_t endAddr = ((largeBlock - ALLOC_START) + N_LARGE_ALLOC_SIZE) & (~(N_CHUNK_SIZE - 1));
        for (uintptr_t p = addr; p < endAddr; p += N_CHUNK_SIZE) {
            uintptr_t chunk = p / N_CHUNK_SIZE;
            Y_ASSERT_NOBT(chunk * N_CHUNK_SIZE == p);
            Y_ASSERT_NOBT(chunkSizeIdx[chunk] == 0);
            AddFreeChunk(chunk);
        }
    }
    return nullptr;
}

// allocate a single block
static Y_FORCE_INLINE void* LFAllocNoCache(int nSizeIdx, EDefrag defrag) {
    int blockSize = nSizeIdxToSize[nSizeIdx];
    void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1);
    if (res)
        return res;

    return SlowLFAlloc(nSizeIdx, blockSize, defrag);
}
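// Small-allocation path in order of preference: the per-thread cache in LFAllocImpl(),
// then the global free lists, then bump allocation from the current chunk, and finally
// SlowLFAlloc() above (fresh chunk, defragmentation, or a new mmap region).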
// allocate multiple blocks; returns the number of blocks allocated (max FL_GROUP_SIZE)
// buf should have space for at least FL_GROUP_SIZE elems
static Y_FORCE_INLINE int LFAllocNoCacheMultiple(int nSizeIdx, char** buf) {
    int blockSize = nSizeIdxToSize[nSizeIdx];
    void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, FL_GROUP_SIZE);
    if (res) {
        char* resPtr = (char*)res;
        for (int k = 0; k < FL_GROUP_SIZE; ++k) {
            buf[k] = resPtr;
            resPtr += blockSize;
        }
        return FL_GROUP_SIZE;
    }
    buf[0] = (char*)SlowLFAlloc(nSizeIdx, blockSize, MEM_DEFRAG);
    return 1;
}

// take several blocks from the global free list (max FL_GROUP_SIZE blocks); returns the number of blocks taken
// buf should have space for at least FL_GROUP_SIZE elems
static Y_FORCE_INLINE int TakeBlocksFromGlobalFreeList(int nSizeIdx, char** buf) {
    TLFAllocFreeList& fl = globalFreeLists[nSizeIdx];
    TFreeListGroup* g = (TFreeListGroup*)fl.Alloc();
    if (g) {
        int resCount = 0;
        for (auto& ptr : g->Ptrs) {
            if (ptr)
                buf[resCount++] = ptr;
            else
                break;
        }
        blockFreeList.Free(g);
        return resCount;
    }
    return 0;
}

// add several blocks to the global free list
static Y_FORCE_INLINE void PutBlocksToGlobalFreeList(ptrdiff_t nSizeIdx, char** buf, int count) {
    for (int startIdx = 0; startIdx < count;) {
        TFreeListGroup* g = (TFreeListGroup*)blockFreeList.Alloc();
        Y_ASSERT_NOBT(sizeof(TFreeListGroup) == nSizeIdxToSize[FREE_LIST_GROUP_SIZEIDX]);
        if (!g) {
            g = (TFreeListGroup*)LFAllocNoCache(FREE_LIST_GROUP_SIZEIDX, NO_MEM_DEFRAG);
        }

        int groupSize = count - startIdx;
        if (groupSize > FL_GROUP_SIZE)
            groupSize = FL_GROUP_SIZE;
        for (int i = 0; i < groupSize; ++i)
            g->Ptrs[i] = buf[startIdx + i];
        for (int i = groupSize; i < FL_GROUP_SIZE; ++i)
            g->Ptrs[i] = nullptr;

        // add free group to the global list
        TLFAllocFreeList& fl = globalFreeLists[nSizeIdx];
        fl.Free(g);

        startIdx += groupSize;
    }
}

//////////////////////////////////////////////////////////////////////////
static TAtomic GlobalCounters[CT_MAX];
const int MAX_LOCAL_UPDATES = 100;
const intptr_t MAX_LOCAL_DELTA = 1 * 1024 * 1024;

struct TLocalCounter {
    intptr_t Value;
    int Updates;
    TAtomic* Parent;

    Y_FORCE_INLINE void Init(TAtomic* parent) {
        Parent = parent;
        Value = 0;
        Updates = 0;
    }

    Y_FORCE_INLINE void Increment(size_t value) {
        Value += value;
        if (++Updates > MAX_LOCAL_UPDATES || Value > MAX_LOCAL_DELTA) {
            Flush();
        }
    }

    Y_FORCE_INLINE void Flush() {
        AtomicAdd(*Parent, Value);
        Value = 0;
        Updates = 0;
    }
};
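// Counter updates are batched per thread: TLocalCounter folds its deltas into the
// matching GlobalCounters[] slot only after MAX_LOCAL_UPDATES updates or once the
// accumulated delta exceeds MAX_LOCAL_DELTA, keeping shared atomics off the hot path.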
////////////////////////////////////////////////////////////////////////////////
// DBG stuff
////////////////////////////////////////////////////////////////////////////////

#if defined(LFALLOC_DBG)

struct TPerTagAllocCounter {
    TAtomic Size;
    TAtomic Count;

    Y_FORCE_INLINE void Alloc(size_t size) {
        AtomicAdd(Size, size);
        AtomicAdd(Count, 1);
    }

    Y_FORCE_INLINE void Free(size_t size) {
        AtomicSub(Size, size);
        AtomicSub(Count, 1);
    }
};

struct TLocalPerTagAllocCounter {
    intptr_t Size;
    int Count;
    int Updates;

    Y_FORCE_INLINE void Init() {
        Size = 0;
        Count = 0;
        Updates = 0;
    }

    Y_FORCE_INLINE void Alloc(TPerTagAllocCounter& parent, size_t size) {
        Size += size;
        ++Count;
        if (++Updates > MAX_LOCAL_UPDATES) {
            Flush(parent);
        }
    }

    Y_FORCE_INLINE void Free(TPerTagAllocCounter& parent, size_t size) {
        Size -= size;
        --Count;
        if (++Updates > MAX_LOCAL_UPDATES) {
            Flush(parent);
        }
    }

    Y_FORCE_INLINE void Flush(TPerTagAllocCounter& parent) {
        AtomicAdd(parent.Size, Size);
        Size = 0;
        AtomicAdd(parent.Count, Count);
        Count = 0;
        Updates = 0;
    }
};

static const int DBG_ALLOC_MAX_TAG = 1000;
static const int DBG_ALLOC_ALIGNED_TAG = 0xF0000000;
static const int DBG_ALLOC_NUM_SIZES = 30;
static TPerTagAllocCounter GlobalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES];

#endif // LFALLOC_DBG

//////////////////////////////////////////////////////////////////////////
const int THREAD_BUF = 256;
static int borderSizes[N_SIZES];
const int MAX_MEM_PER_SIZE_PER_THREAD = 512 * 1024;
struct TThreadAllocInfo {
    // FreePtrs - pointers to first free blocks in per thread block list
    // LastFreePtrs - pointers to last blocks in lists, may be invalid if FreePtr is zero
    char* FreePtrs[N_SIZES][THREAD_BUF];
    int FreePtrIndex[N_SIZES];
    TThreadAllocInfo* pNextInfo;
    TLocalCounter LocalCounters[CT_MAX];

#if defined(LFALLOC_DBG)
    TLocalPerTagAllocCounter LocalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES];
#endif
#ifdef _win_
    HANDLE hThread;
#endif

    void Init(TThreadAllocInfo** pHead) {
        memset(this, 0, sizeof(*this));
        for (auto& i : FreePtrIndex)
            i = THREAD_BUF;
#ifdef _win_
        BOOL b = DuplicateHandle(
            GetCurrentProcess(), GetCurrentThread(),
            GetCurrentProcess(), &hThread,
            0, FALSE, DUPLICATE_SAME_ACCESS);
        Y_ASSERT_NOBT(b);
#endif
        pNextInfo = *pHead;
        *pHead = this;
        for (int k = 0; k < N_SIZES; ++k) {
            int maxCount = MAX_MEM_PER_SIZE_PER_THREAD / nSizeIdxToSize[k];
            if (maxCount > THREAD_BUF)
                maxCount = THREAD_BUF;
            borderSizes[k] = THREAD_BUF - maxCount;
        }
        for (int i = 0; i < CT_MAX; ++i) {
            LocalCounters[i].Init(&GlobalCounters[i]);
        }
#if defined(LFALLOC_DBG)
        for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
            for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
                auto& local = LocalPerTagAllocCounters[tag][sizeIdx];
                local.Init();
            }
        }
#endif
    }
    void Done() {
        for (auto sizeIdx : FreePtrIndex) {
            Y_ASSERT_NOBT(sizeIdx == THREAD_BUF);
        }
        for (auto& localCounter : LocalCounters) {
            localCounter.Flush();
        }
#if defined(LFALLOC_DBG)
        for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
            for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
                auto& local = LocalPerTagAllocCounters[tag][sizeIdx];
                auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
                local.Flush(global);
            }
        }
#endif
#ifdef _win_
        if (hThread)
            CloseHandle(hThread);
#endif
    }
};
PERTHREAD TThreadAllocInfo* pThreadInfo;
static TThreadAllocInfo* pThreadInfoList;

static TLFLockData LFLockThreadInfo;

static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value) {
#ifdef LFALLOC_YT
    TThreadAllocInfo* thr = pThreadInfo;
    if (thr) {
        thr->LocalCounters[counter].Increment(value);
    } else {
        AtomicAdd(GlobalCounters[counter], value);
    }
#endif
}

extern "C" i64 GetLFAllocCounterFast(int counter) {
#ifdef LFALLOC_YT
    return GlobalCounters[counter];
#else
    return 0;
#endif
}

extern "C" i64 GetLFAllocCounterFull(int counter) {
#ifdef LFALLOC_YT
    i64 ret = GlobalCounters[counter];
    {
        TLFLockHolder ll(&LFLockThreadInfo);
        for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
            TThreadAllocInfo* pInfo = *p;
            ret += pInfo->LocalCounters[counter].Value;
            p = &pInfo->pNextInfo;
        }
    }
    return ret;
#else
    return 0;
#endif
}

static void MoveSingleThreadFreeToGlobal(TThreadAllocInfo* pInfo) {
    for (int sizeIdx = 0; sizeIdx < N_SIZES; ++sizeIdx) {
        int& freePtrIdx = pInfo->FreePtrIndex[sizeIdx];
        char** freePtrs = pInfo->FreePtrs[sizeIdx];
        PutBlocksToGlobalFreeList(sizeIdx, freePtrs + freePtrIdx, THREAD_BUF - freePtrIdx);
        freePtrIdx = THREAD_BUF;
    }
}

#ifdef _win_
static bool IsDeadThread(TThreadAllocInfo* pInfo) {
    DWORD dwExit;
    bool isDead = !GetExitCodeThread(pInfo->hThread, &dwExit) || dwExit != STILL_ACTIVE;
    return isDead;
}

static void CleanupAfterDeadThreads() {
    TLFLockHolder ll(&LFLockThreadInfo);
    for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
        TThreadAllocInfo* pInfo = *p;
        if (IsDeadThread(pInfo)) {
            MoveSingleThreadFreeToGlobal(pInfo);
            pInfo->Done();
            *p = pInfo->pNextInfo;
            SystemFree(pInfo);
        } else
            p = &pInfo->pNextInfo;
    }
}
#endif

#ifndef _win_
static pthread_key_t ThreadCacheCleaner;
static void* volatile ThreadCacheCleanerStarted; // 0 = not started, -1 = started, -2 = is starting
static PERTHREAD bool IsStoppingThread;

static void FreeThreadCache(void*) {
    TThreadAllocInfo* pToDelete = nullptr;
    {
        TLFLockHolder ll(&LFLockThreadInfo);
        pToDelete = pThreadInfo;
        if (pToDelete == nullptr)
            return;

        // remove from the list
        for (TThreadAllocInfo** p = &pThreadInfoList; *p; p = &(*p)->pNextInfo) {
            if (*p == pToDelete) {
                *p = pToDelete->pNextInfo;
                break;
            }
        }
        IsStoppingThread = true;
        pThreadInfo = nullptr;
    }

    // free per thread buf
    MoveSingleThreadFreeToGlobal(pToDelete);
    pToDelete->Done();
    SystemFree(pToDelete);
}
#endif

static void AllocThreadInfo() {
#ifndef _win_
    if (DoCas(&ThreadCacheCleanerStarted, (void*)-2, (void*)nullptr) == (void*)nullptr) {
        pthread_key_create(&ThreadCacheCleaner, FreeThreadCache);
        ThreadCacheCleanerStarted = (void*)-1;
    }
    if (ThreadCacheCleanerStarted != (void*)-1)
        return; // do not use ThreadCacheCleaner until it is constructed

    {
        if (IsStoppingThread)
            return;
        TLFLockHolder ll(&LFLockThreadInfo);
        if (IsStoppingThread) // better safe than sorry
            return;

        pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo));
        pThreadInfo->Init(&pThreadInfoList);
    }
    pthread_setspecific(ThreadCacheCleaner, (void*)-1); // the destructor is not called if the key's value is left null
#else
    CleanupAfterDeadThreads();
    {
        pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo));
        TLFLockHolder ll(&LFLockThreadInfo);
        pThreadInfo->Init(&pThreadInfoList);
    }
#endif
}

//////////////////////////////////////////////////////////////////////////
// DBG stuff
//////////////////////////////////////////////////////////////////////////

#if defined(LFALLOC_DBG)

struct TAllocHeader {
    uint64_t Size;
    int Tag;
    int Cookie;
};

// should be power of 2
static_assert(sizeof(TAllocHeader) == 16);

static inline void* GetAllocPtr(TAllocHeader* p) {
    return p + 1;
}

static inline TAllocHeader* GetAllocHeader(void* p) {
    auto* header = ((TAllocHeader*)p) - 1;
    if (header->Tag == DBG_ALLOC_ALIGNED_TAG) {
        return (TAllocHeader*)header->Size;
    }

    return header;
}

PERTHREAD int AllocationTag;
extern "C" int SetThreadAllocTag(int tag) {
    int prevTag = AllocationTag;
    if (tag < DBG_ALLOC_MAX_TAG && tag >= 0) {
        AllocationTag = tag;
    }
    return prevTag;
}

PERTHREAD bool ProfileCurrentThread;
extern "C" bool SetProfileCurrentThread(bool newVal) {
    bool prevVal = ProfileCurrentThread;
    ProfileCurrentThread = newVal;
    return prevVal;
}

static volatile bool ProfileAllThreads;
extern "C" bool SetProfileAllThreads(bool newVal) {
    bool prevVal = ProfileAllThreads;
    ProfileAllThreads = newVal;
    return prevVal;
}

static volatile bool AllocationSamplingEnabled;
extern "C" bool SetAllocationSamplingEnabled(bool newVal) {
    bool prevVal = AllocationSamplingEnabled;
    AllocationSamplingEnabled = newVal;
    return prevVal;
}

static size_t AllocationSampleRate = 1000;
extern "C" size_t SetAllocationSampleRate(size_t newVal) {
    size_t prevVal = AllocationSampleRate;
    AllocationSampleRate = newVal;
    return prevVal;
}

static size_t AllocationSampleMaxSize = N_MAX_FAST_SIZE;
extern "C" size_t SetAllocationSampleMaxSize(size_t newVal) {
    size_t prevVal = AllocationSampleMaxSize;
    AllocationSampleMaxSize = newVal;
    return prevVal;
}

using TAllocationCallback = int(int tag, size_t size, int sizeIdx);
static TAllocationCallback* AllocationCallback;
extern "C" TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) {
    TAllocationCallback* prevVal = AllocationCallback;
    AllocationCallback = newVal;
    return prevVal;
}

using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx);
static TDeallocationCallback* DeallocationCallback;
extern "C" TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) {
    TDeallocationCallback* prevVal = DeallocationCallback;
    DeallocationCallback = newVal;
    return prevVal;
}

PERTHREAD TAtomic AllocationsCount;
PERTHREAD bool InAllocationCallback;

static const int DBG_ALLOC_INVALID_COOKIE = -1;
static inline int SampleAllocation(TAllocHeader* p, int sizeIdx) {
    int cookie = DBG_ALLOC_INVALID_COOKIE;
    if (AllocationSamplingEnabled && (ProfileCurrentThread || ProfileAllThreads) && !InAllocationCallback) {
        if (p->Size > AllocationSampleMaxSize || ++AllocationsCount % AllocationSampleRate == 0) {
            if (AllocationCallback) {
                InAllocationCallback = true;
                cookie = AllocationCallback(p->Tag, p->Size, sizeIdx);
                InAllocationCallback = false;
            }
        }
    }
    return cookie;
}

static inline void SampleDeallocation(TAllocHeader* p, int sizeIdx) {
    if (p->Cookie != DBG_ALLOC_INVALID_COOKIE && !InAllocationCallback) {
        if (DeallocationCallback) {
            InAllocationCallback = true;
            DeallocationCallback(p->Cookie, p->Tag, p->Size, sizeIdx);
            InAllocationCallback = false;
        }
    }
}

static inline void TrackPerTagAllocation(TAllocHeader* p, int sizeIdx) {
    if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) {
        Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES);
        auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx];

        TThreadAllocInfo* thr = pThreadInfo;
        if (thr) {
            auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx];
            local.Alloc(global, p->Size);
        } else {
            global.Alloc(p->Size);
        }
    }
}

static inline void TrackPerTagDeallocation(TAllocHeader* p, int sizeIdx) {
    if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) {
        Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES);
        auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx];

        TThreadAllocInfo* thr = pThreadInfo;
        if (thr) {
            auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx];
            local.Free(global, p->Size);
        } else {
            global.Free(p->Size);
        }
    }
}

static void* TrackAllocation(void* ptr, size_t size, int sizeIdx) {
    TAllocHeader* p = (TAllocHeader*)ptr;
    p->Size = size;
    p->Tag = AllocationTag;
    p->Cookie = SampleAllocation(p, sizeIdx);
    TrackPerTagAllocation(p, sizeIdx);
    return GetAllocPtr(p);
}

static void TrackDeallocation(void* ptr, int sizeIdx) {
    TAllocHeader* p = (TAllocHeader*)ptr;
    SampleDeallocation(p, sizeIdx);
    TrackPerTagDeallocation(p, sizeIdx);
}
struct TPerTagAllocInfo {
    ssize_t Count;
    ssize_t Size;
};

extern "C" void GetPerTagAllocInfo(
    bool flushPerThreadCounters,
    TPerTagAllocInfo* info,
    int& maxTag,
    int& numSizes) {
    maxTag = DBG_ALLOC_MAX_TAG;
    numSizes = DBG_ALLOC_NUM_SIZES;

    if (info) {
        if (flushPerThreadCounters) {
            TLFLockHolder ll(&LFLockThreadInfo);
            for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
                TThreadAllocInfo* pInfo = *p;
                for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
                    for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
                        auto& local = pInfo->LocalPerTagAllocCounters[tag][sizeIdx];
                        auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
                        local.Flush(global);
                    }
                }
                p = &pInfo->pNextInfo;
            }
        }

        for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
            for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
                auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
                auto& res = info[tag * DBG_ALLOC_NUM_SIZES + sizeIdx];
                res.Count = global.Count;
                res.Size = global.Size;
            }
        }
    }
}

#endif // LFALLOC_DBG

//////////////////////////////////////////////////////////////////////////
static Y_FORCE_INLINE void* LFAllocImpl(size_t _nSize) {
#if defined(LFALLOC_DBG)
    size_t size = _nSize;
    _nSize += sizeof(TAllocHeader);
#endif

    IncrementCounter(CT_USER_ALLOC, _nSize);

    int nSizeIdx;
    if (_nSize > 512) {
        if (_nSize > N_MAX_FAST_SIZE) {
            void* ptr = LargeBlockAlloc(_nSize, CT_LARGE_ALLOC);
#if defined(LFALLOC_DBG)
            ptr = TrackAllocation(ptr, size, N_SIZES);
#endif
            return ptr;
        }
        nSizeIdx = size2idxArr2[(_nSize - 1) >> 8];
    } else
        nSizeIdx = size2idxArr1[1 + (((int)_nSize - 1) >> 3)];

    IncrementCounter(CT_SMALL_ALLOC, nSizeIdxToSize[nSizeIdx]);

    // check per thread buffer
    TThreadAllocInfo* thr = pThreadInfo;
    if (!thr) {
        AllocThreadInfo();
        thr = pThreadInfo;
        if (!thr) {
            void* ptr = LFAllocNoCache(nSizeIdx, MEM_DEFRAG);
#if defined(LFALLOC_DBG)
            ptr = TrackAllocation(ptr, size, nSizeIdx);
#endif
            return ptr;
        }
    }
    {
        int& freePtrIdx = thr->FreePtrIndex[nSizeIdx];
        if (freePtrIdx < THREAD_BUF) {
            void* ptr = thr->FreePtrs[nSizeIdx][freePtrIdx++];
#if defined(LFALLOC_DBG)
            ptr = TrackAllocation(ptr, size, nSizeIdx);
#endif
            return ptr;
        }

        // try to alloc from global free list
        char* buf[FL_GROUP_SIZE];
        int count = TakeBlocksFromGlobalFreeList(nSizeIdx, buf);
        if (count == 0) {
            count = LFAllocNoCacheMultiple(nSizeIdx, buf);
            if (count == 0) {
                NMalloc::AbortFromCorruptedAllocator("no way LFAllocNoCacheMultiple() can fail");
            }
        }
        char** dstBuf = thr->FreePtrs[nSizeIdx] + freePtrIdx - 1;
        for (int i = 0; i < count - 1; ++i)
            dstBuf[-i] = buf[i];
        freePtrIdx -= count - 1;
        void* ptr = buf[count - 1];
#if defined(LFALLOC_DBG)
        ptr = TrackAllocation(ptr, size, nSizeIdx);
#endif
        return ptr;
    }
}

static Y_FORCE_INLINE void* LFAlloc(size_t _nSize) {
    void* res = LFAllocImpl(_nSize);
#ifdef DBG_FILL_MEMORY
    if (FillMemoryOnAllocation && res && (_nSize <= DBG_FILL_MAX_SIZE)) {
        memset(res, 0xcf, _nSize);
    }
#endif
    return res;
}

static Y_FORCE_INLINE void LFFree(void* p) {
#if defined(LFALLOC_DBG)
    if (p == nullptr)
        return;
    p = GetAllocHeader(p);
#endif

    uintptr_t chkOffset = ((char*)p - ALLOC_START) - 1ll;
    if (chkOffset >= N_MAX_WORKSET_SIZE) {
        if (p == nullptr)
            return;
#if defined(LFALLOC_DBG)
        TrackDeallocation(p, N_SIZES);
#endif
        LargeBlockFree(p, CT_LARGE_FREE);
        return;
    }
    uintptr_t chunk = ((char*)p - ALLOC_START) / N_CHUNK_SIZE;
    ptrdiff_t nSizeIdx = chunkSizeIdx[chunk];
    if (nSizeIdx <= 0) {
#if defined(LFALLOC_DBG)
        TrackDeallocation(p, N_SIZES);
#endif
        LargeBlockFree(p, CT_LARGE_FREE);
        return;
    }

#if defined(LFALLOC_DBG)
    TrackDeallocation(p, nSizeIdx);
#endif

#ifdef DBG_FILL_MEMORY
    memset(p, 0xfe, nSizeIdxToSize[nSizeIdx]);
#endif

    IncrementCounter(CT_SMALL_FREE, nSizeIdxToSize[nSizeIdx]);

    // try to store info to per thread buf
    TThreadAllocInfo* thr = pThreadInfo;
    if (thr) {
        int& freePtrIdx = thr->FreePtrIndex[nSizeIdx];
        if (freePtrIdx > borderSizes[nSizeIdx]) {
            thr->FreePtrs[nSizeIdx][--freePtrIdx] = (char*)p;
            return;
        }

        // move several pointers to the global free list
        int freeCount = FL_GROUP_SIZE;
        if (freeCount > THREAD_BUF - freePtrIdx)
            freeCount = THREAD_BUF - freePtrIdx;
        char** freePtrs = thr->FreePtrs[nSizeIdx];
        PutBlocksToGlobalFreeList(nSizeIdx, freePtrs + freePtrIdx, freeCount);
        freePtrIdx += freeCount;

        freePtrs[--freePtrIdx] = (char*)p;

    } else {
        AllocThreadInfo();
        PutBlocksToGlobalFreeList(nSizeIdx, (char**)&p, 1);
    }
}

static size_t LFGetSize(const void* p) {
#if defined(LFALLOC_DBG)
    if (p == nullptr)
        return 0;
    return GetAllocHeader(const_cast<void*>(p))->Size;
#endif

    uintptr_t chkOffset = ((const char*)p - ALLOC_START);
    if (chkOffset >= N_MAX_WORKSET_SIZE) {
        if (p == nullptr)
            return 0;
        return TLargeBlk::As(p)->Pages * 4096ll;
    }
    uintptr_t chunk = ((const char*)p - ALLOC_START) / N_CHUNK_SIZE;
    ptrdiff_t nSizeIdx = chunkSizeIdx[chunk];
    if (nSizeIdx <= 0)
        return TLargeBlk::As(p)->Pages * 4096ll;
    return nSizeIdxToSize[nSizeIdx];
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Output mem alloc stats
const int N_PAGE_SIZE = 4096;
static void DebugTraceMMgr(const char* pszFormat, ...) // __cdecl
{
    static char buff[20000];
    va_list va;
    //
    va_start(va, pszFormat);
    vsprintf(buff, pszFormat, va);
    va_end(va);
    //
#ifdef _win_
    OutputDebugStringA(buff);
#else
    fputs(buff, stderr);
#endif
}

struct TChunkStats {
    char *Start, *Finish;
    i64 Size;
    char* Entries;
    i64 FreeCount;

    TChunkStats(size_t chunk, i64 size, char* entries)
        : Size(size)
        , Entries(entries)
        , FreeCount(0)
    {
        Start = ALLOC_START + chunk * N_CHUNK_SIZE;
        Finish = Start + N_CHUNK_SIZE;
    }
    void CheckBlock(char* pBlock) {
        if (pBlock && pBlock >= Start && pBlock < Finish) {
            ++FreeCount;
            i64 nShift = pBlock - Start;
            i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1);
            Entries[nOffsetInStep / Size] = 1;
        }
    }
    void SetGlobalFree(char* ptr) {
        i64 nShift = ptr - Start;
        i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1);
        while (nOffsetInStep + Size <= N_CHUNK_SIZE) {
            ++FreeCount;
            Entries[nOffsetInStep / Size] = 1;
            nOffsetInStep += Size;
        }
    }
};

static void DumpMemoryBlockUtilizationLocked() {
    TFreeListGroup* wholeLists[N_SIZES];
    for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) {
        wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList();
    }
    char* bfList = (char*)blockFreeList.GetWholeList();

    DebugTraceMMgr("memory blocks utilisation stats:\n");
    i64 nTotalAllocated = 0, nTotalFree = 0, nTotalBadPages = 0, nTotalPages = 0, nTotalUsed = 0, nTotalLocked = 0;
    i64 nTotalGroupBlocks = 0;
    char* entries;
    entries = (char*)SystemAlloc((N_CHUNK_SIZE / 4));
    for (size_t k = 0; k < N_CHUNKS; ++k) {
        if (chunkSizeIdx[k] <= 0) {
            if (chunkSizeIdx[k] == -1)
                nTotalLocked += N_CHUNK_SIZE;
            continue;
        }
        i64 nSizeIdx = chunkSizeIdx[k];
        i64 nSize = nSizeIdxToSize[nSizeIdx];
        TChunkStats cs(k, nSize, entries);
        int nEntriesTotal = N_CHUNK_SIZE / nSize;
        memset(entries, 0, nEntriesTotal);
        for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) {
            for (auto& ptr : g->Ptrs)
                cs.CheckBlock(ptr);
        }
        TChunkStats csGB(k, nSize, entries);
        if (nSizeIdx == FREE_LIST_GROUP_SIZEIDX) {
            for (auto g : wholeLists) {
                for (; g; g = g->Next)
                    csGB.CheckBlock((char*)g);
            }
            for (char* blk = bfList; blk; blk = *(char**)blk)
                csGB.CheckBlock(blk);
            nTotalGroupBlocks += csGB.FreeCount * nSize;
        }
        if (((globalCurrentPtr[nSizeIdx] - ALLOC_START) / N_CHUNK_SIZE) == k)
            cs.SetGlobalFree(globalCurrentPtr[nSizeIdx]);
        nTotalUsed += (nEntriesTotal - cs.FreeCount - csGB.FreeCount) * nSize;

        char pages[N_CHUNK_SIZE / N_PAGE_SIZE];
        memset(pages, 0, sizeof(pages));
        for (int i = 0, nShift = 0; i < nEntriesTotal; ++i, nShift += nSize) {
            int nBit = 0;
            if (entries[i])
                nBit = 1; // free entry
            else
                nBit = 2; // used entry
            for (i64 nDelta = nSize - 1; nDelta >= 0; nDelta -= N_PAGE_SIZE)
                pages[(nShift + nDelta) / N_PAGE_SIZE] |= nBit;
        }
        i64 nBadPages = 0;
        for (auto page : pages) {
            nBadPages += page == 3;
            nTotalPages += page != 1;
        }
        DebugTraceMMgr("entry = %lld; size = %lld; free = %lld; system %lld; utilisation = %g%%, fragmentation = %g%%\n",
                       k, nSize, cs.FreeCount * nSize, csGB.FreeCount * nSize,
                       (N_CHUNK_SIZE - cs.FreeCount * nSize) * 100.0f / N_CHUNK_SIZE, 100.0f * nBadPages / Y_ARRAY_SIZE(pages));
        nTotalAllocated += N_CHUNK_SIZE;
        nTotalFree += cs.FreeCount * nSize;
        nTotalBadPages += nBadPages;
    }
    SystemFree(entries);
    DebugTraceMMgr("Total allocated = %llu, free = %lld, system = %lld, locked for future use %lld, utilisation = %g, fragmentation = %g\n",
                   nTotalAllocated, nTotalFree, nTotalGroupBlocks, nTotalLocked,
                   100.0f * (nTotalAllocated - nTotalFree) / nTotalAllocated, 100.0f * nTotalBadPages / nTotalPages);
    DebugTraceMMgr("Total %lld bytes used, %lld bytes in used pages\n", nTotalUsed, nTotalPages * N_PAGE_SIZE);

    for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx)
        globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]);
    blockFreeList.ReturnWholeList(bfList);
}

void FlushThreadFreeList() {
    if (pThreadInfo)
        MoveSingleThreadFreeToGlobal(pThreadInfo);
}

void DumpMemoryBlockUtilization() {
    // move current thread free to global lists to get better statistics
    FlushThreadFreeList();
    {
        TLFLockHolder ls(&LFGlobalLock);
        DumpMemoryBlockUtilizationLocked();
    }
}

//////////////////////////////////////////////////////////////////////////
// malloc api

static bool LFAlloc_SetParam(const char* param, const char* value) {
    if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE")) {
        LB_LIMIT_TOTAL_SIZE = atoi(value);
        return true;
    }
    if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE_BYTES")) {
        LB_LIMIT_TOTAL_SIZE = (atoi(value) + N_PAGE_SIZE - 1) / N_PAGE_SIZE;
        return true;
    }
#ifdef DBG_FILL_MEMORY
    if (!strcmp(param, "FillMemoryOnAllocation")) {
        FillMemoryOnAllocation = !strcmp(value, "true");
        return true;
    }
#endif
    if (!strcmp(param, "TransparentHugePages")) {
        TransparentHugePages = !strcmp(value, "true");
        return true;
    }
    if (!strcmp(param, "MapHugeTLB")) {
        MapHugeTLB = !strcmp(value, "true");
        return true;
    }
    if (!strcmp(param, "EnableDefrag")) {
        EnableDefrag = !strcmp(value, "true");
        return true;
    }
    return false;
}

static const char* LFAlloc_GetParam(const char* param) {
    struct TParam {
        const char* Name;
        const char* Value;
    };

    static const TParam Params[] = {
        {"GetLFAllocCounterFast", (const char*)&GetLFAllocCounterFast},
        {"GetLFAllocCounterFull", (const char*)&GetLFAllocCounterFull},
#if defined(LFALLOC_DBG)
        {"SetThreadAllocTag", (const char*)&SetThreadAllocTag},
        {"SetProfileCurrentThread", (const char*)&SetProfileCurrentThread},
        {"SetProfileAllThreads", (const char*)&SetProfileAllThreads},
        {"SetAllocationSamplingEnabled", (const char*)&SetAllocationSamplingEnabled},
        {"SetAllocationSampleRate", (const char*)&SetAllocationSampleRate},
        {"SetAllocationSampleMaxSize", (const char*)&SetAllocationSampleMaxSize},
        {"SetAllocationCallback", (const char*)&SetAllocationCallback},
        {"SetDeallocationCallback", (const char*)&SetDeallocationCallback},
        {"GetPerTagAllocInfo", (const char*)&GetPerTagAllocInfo},
#endif // LFALLOC_DBG
    };

    for (int i = 0; i < Y_ARRAY_SIZE(Params); ++i) {
        if (strcmp(param, Params[i].Name) == 0) {
            return Params[i].Value;
        }
    }
    return nullptr;
}
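// Note: LFAlloc_GetParam doubles as a symbol table. It returns the addresses of the
// functions above cast to const char*, and callers are expected to cast the returned
// value back to the advertised function pointer type.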
static Y_FORCE_INLINE int LFPosixMemalign(void** memptr, size_t alignment, size_t size) {
    if (Y_UNLIKELY(alignment > 4096)) {
        const char* error = "Larger alignment are not guaranteed with this implementation\n";
#ifdef _win_
        OutputDebugStringA(error);
#endif
        NMalloc::AbortFromCorruptedAllocator(error);
    }
    size_t bigsize = size;
    if (bigsize <= alignment) {
        bigsize = alignment;
    } else if (bigsize < 2 * alignment) {
        bigsize = 2 * alignment;
    }
#if defined(LFALLOC_DBG)
    if (alignment > sizeof(TAllocHeader)) {
        bigsize += alignment;
    }
#endif

    *memptr = LFAlloc(bigsize);

#if defined(LFALLOC_DBG)
    if (alignment > sizeof(TAllocHeader)) {
        // memptr isn't aligned due to the alloc header
        const auto* header = GetAllocHeader(*memptr);
        *memptr = (void*)((const char*)(*memptr) + alignment - sizeof(TAllocHeader));

        // make a fake header to retrieve the original header ptr on dealloc
        auto* next = GetAllocHeader(*memptr);
        next->Tag = DBG_ALLOC_ALIGNED_TAG;
        next->Size = (uint64_t)header;
        next->Cookie = 0;
    }
#endif

    Y_ASSERT_NOBT((intptr_t)*memptr % alignment == 0);
    return 0;
}

static Y_FORCE_INLINE void* LFVAlloc(size_t size) {
    const size_t pg = N_PAGE_SIZE;
    void* p = nullptr;

#if defined(LFALLOC_DBG)
    LFPosixMemalign(&p, pg, size);
#else
    size_t bigsize = (size + pg - 1) & (~(pg - 1));
    p = LFAlloc(bigsize);
#endif

    Y_ASSERT_NOBT((intptr_t)p % N_PAGE_SIZE == 0);
    return p;
}

#endif
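For readers evaluating the new header: the entry points above (LFAlloc, LFFree, LFGetSize, LFPosixMemalign, LFAlloc_SetParam) are static and become available by including the header in a translation unit inside an Arcadia build with util/ and the malloc API on the include path. A minimal usage sketch under those assumptions; this example is not part of the commit:

    #include <library/cpp/lfalloc/lf_allocX64.h>

    int main() {
        // Optional tuning knob, matching a name handled by LFAlloc_SetParam above.
        LFAlloc_SetParam("TransparentHugePages", "true");

        void* p = LFAlloc(1000);    // rounded up to the 1024-byte size class
        size_t cls = LFGetSize(p);  // reports the class size (1024) in non-debug builds
        LFFree(p);

        void* aligned = nullptr;
        LFPosixMemalign(&aligned, 4096, 10000); // page-aligned; alignment > 4096 aborts
        LFFree(aligned);
        return cls == 1024 ? 0 : 1;
    }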