aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/lfalloc/lf_allocX64.h
blob: ae753925c36fd4fa46c76adb569b0dc40067a0b3 (plain) (tree)
1
2
3
4
5
6
7
8
9
            


                   
                                          
 
                               
                                 
                              
 
                               
                               
             
            
                   
                           



                                              
                   


                                    
                                    
 
                              
 
                                                  

                                              


                                                  
                                           
              






                                    
     
                                  



                                 
                                                 
                                



























                                                             





                     
                    
                        
                           
      

                           
      
 




                          
                    
                                                    
      
              
                       
                                          
      
                                                                                 
                                
 



                                                                      
                                                               

                                                                        





                                                                                                    

                              
                         
                                                 
 
                         
                           
                                             

                          
                                               


           
                                                         
                 

                                                                                             
      
                                                






                                                          

                       
                       
      
                                     

                                    






        
      
                                              


                                                       
  

                                     
                                     
      
                                            
                 
                                 
     
                                
      



                                                                               
  

                                         
                                         
      








                                                                                          
                 






                                                                            
      







                                                                                                                                     
                                             
                               
                                                         
                                                                             

                                             
                                          







                                                                          












                                                                                 

          
                                                                                   
                                                                          

                                         
  
 
                
                                            
                                         
                                                                                            
 
      
 


                                                                   
                         
                                          













                                                                                        
                         

                                               
                                                                     
                                      
                                                                                          
                                          
                                                    


                           
                                                                       
                     
                                                                                   
                                     
                                       
                              



                                                
                                                                        



         
                                                          
               
                                                                               
                                          
                                                              
                                                                                            
                                                                                         
     
                                                                                                 
                                 
                                                                                            
                                                                                         
     
                                                        

                                               
      
                              
                                     

                      







































                                                                             
                                       
               
                     
     
                                         
      
 
                                                                          
                               
                                                                                                                  
                                                           
                               
 
                                                                      
                                            
                                                                                            
                                
                                                              
     
     


                                                
                                           
                                        
                         
                     
                                                                    
                                                           
                                                     
                                                      
                         
                                                                                         
                                                         
                                                               
                                                      



                        
                                                                
                   
                                                  



                
                                    
                                        
                                               
                                   
                             
                         
                                                               
                                                      
                                                     
                                            




             
                                                              
                     


                                                   
                                             
 
                                         

                                                


                                              
                                                                              
                                                    



                       
                                

      
                                         
                                                                    
                                                    
 
                                 
                                       
                                      
 
                                                                          


























































































                                                                                                                   
         







                                                     
         
                     
     

                               
         

                                                                          

                        
      
                            
                                     
                       
                  
 

                                                                            



                                                        
                                    
                                           
                                        
                                                     
                                                        
                      
             

                   
                              
                   
                           
                                  
                                            



                                                       
 
       

                                          


                                       
                                  
                                                                     
                                            

                                                                                                            
                                                                                                                            
                                                                
                               

                                                                                          
                                                       
                                          

                           
                                  
                   
                          
                                           
                                                          


                      



                                                                                                            








                                                                         
                                                


                                                                                      

                              








                                                                                             
                             


                        
                                           
                                                                
                                  
                         
                                                           

                                                  
                                        
                                                                                         
                                                                        
                                        

                                                                            
                                                                    










                                                                                             
                                                                                        




                                   
                                                
                            
                                           
                            
                                            











                                                                                       
                                             





                                                                 
                                                               
















                                                                        


                                                                                             
                                           
                                                                                                                      

                                        
                                                             


                                                              
                   
 
              


                  
                                                                       
                                           
                     
                                                


                                                                                     

                     


                                    
                                                                 
               
                                                                                           
                                             









                                                              
                                                                        

                                                                                                      
                                               
                                                     

                                
                   

                        
                                                                          
                                             
                                                                






                                                                                   
                                                                            
                                             
                                                                            
              
                                  










                                                                                                       

                                                                                  
                         

                                      








                                         
                                                                                                 
                                               
                                                                   
                                                                                         








                                                                                        
                                 
                                            
                                                         





                                                                          
                                      
                                             




                      
                                               



                        
                                                 
                       
                                                                       


                    
                                 




                                  

                                                                                
 



                            
 
                                            


                              
                                           








                                 
                                



                    
                                                                         





                                            
                                                                        





                                            



                                                            



                                          
                                                    

                                                                                             
                     
 
                                                                          

                                                   
                         
                                                                                         
                                        
                              
                                
                                        

                                                                                              


                   
                                         
                                       
                                    
                                 
                                                    
                                          
                                             
                         







                                                                           

                                                      






                                                                             
     
                 
                                           
                                                 
         
                                                  
         







                                                                             




                                 
                                         

                                    
                                                                                    








                                                     
                                                   





                                   
                                                   

                                      
                                            
                                                            








                                                       
                                                                   
                                                         
                                                       




                                                                                           
                                                   



                                                                                        
                                       
                                        
                                                        











                                                
                                                                                                   
                                       
                                          

                                            
                                 

                               
                                                                                 




                                          
                              







                                            
                               
             
                                                                                         








                                                                        

                                                       












                                                                                                      

                                                                              
 
                        
 
                     
                  
            

               

                                          
                                                  

                 
                                                     




                                                                                                



                                               

                            
                                           
                                
                                              




                                    
                                                      



                                        





                                                   
                                               
                                                           




                                             
                                                          




                                                        
                                                             





                                                                   
                                                                                    





                                                                                  
                                                                                          







                                                          
                                                                  
                                          
                                                                                                            









                                                                                                  
                                                                     







                                                                         
                                                                        
                                                    
                                                     










                                                                         
                                                                          
                                                    
                                                     










                                                                         
                                                                   






                                             
                                                       












                                         
                    



                                     
                                                
                                                                












                                                                                     
                                                                      





                                         
                     
 
                                                                          
                                                        



                                   
                                            
                       





                                                                


                                                              
                                                               
                              
                                        


                          



                                                             

         
                                                      





                                                              
                                             
                                 


                                                                
                                                                                                 
             
                                                                 

                                           



                                                   
     
 
                                                    
                      
                                                                         
                                  
      

               
                                            

                        
                              
      
                                                         
                         
                   

                                      
                                         
               
 

                                                              

                                      
                                         
               
 


                                   

                                              

                                                              
                                          
                                        
              
                                                      







                                                             
                                                  









                                                                              
                                        

                        
                                                          
      
                                                         
                                          
                         
                     
                                                
     
                                                                    
                                             
                                                




                                                                                                    
                                                                 

                            
      

                                  
            
                             
     
                        
      
 
                    
                         

                  
 
                                                      





                                                   
                                   
                                                           
                                                            

                                              

                                                        






                                                      
                                                

                                                                                         
                                                       
                                                         

                                                                                                                   





                                                     
                                             

                                                 
                                                                        
                                     

                                                  
                                       
                                              
                                                             













                                                                              
                                                                            
                                                               
                          

                                     
         
                                                                                                                         
                                                                                                                                



                                           
                                                                                                                                            
                                                                                                                     
                                                                                                               




                                                                        
                            


                                                  
                                   
                                                                        
                          





                                                                          









                                                                            




                                                        


                                                      


                                            






                                                        




                                    
                                                                       
                        
                                                                           
                                                                     





                                                                                     




                                                    


                   
                                                                                         
                                       
                                                                                             
            
                                  
      
                                                    
     




                                         




                                           
                               














                                                                                      
             








                                                   
      




                                                  
#pragma once

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>

#include <library/cpp/malloc/api/malloc.h>

#include <util/system/compat.h>
#include <util/system/compiler.h>
#include <util/system/types.h>

#ifdef _MSC_VER
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#ifdef _M_X64
#define _64_
#endif
#include <intrin.h>
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#pragma intrinsic(_InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd)

#include <new>
#include <assert.h>
#include <errno.h>

#define PERTHREAD __declspec(thread)
#define _win_
#define Y_FORCE_INLINE __forceinline

using TAtomic = volatile long;

static inline long AtomicAdd(TAtomic& a, long b) {
    return _InterlockedExchangeAdd(&a, b) + b;
}

static inline long AtomicSub(TAtomic& a, long b) {
    return AtomicAdd(a, -b);
}

#pragma comment(lib, "synchronization.lib")

#ifndef NDEBUG
#define Y_ASSERT_NOBT(x)           \
    {                              \
        if (IsDebuggerPresent()) { \
            if (!(x))              \
                __debugbreak();    \
        } else                     \
            assert(x);             \
    }
#else
#define Y_ASSERT_NOBT(x) ((void)0)
#endif

#else

#include <util/system/defaults.h>
#include <library/cpp/deprecated/atomic/atomic.h>
#include <util/system/yassert.h>

#if !defined(NDEBUG) && !defined(__GCCXML__)
#define Y_ASSERT_NOBT(a)                                    \
    do {                                                    \
        try {                                               \
            if (Y_UNLIKELY(!(a))) {                         \
                if (YaIsDebuggerPresent())                  \
                    __debugbreak();                         \
                else {                                      \
                    assert(false && (a));                   \
                }                                           \
            }                                               \
        } catch (...) {                                     \
            if (YaIsDebuggerPresent())                      \
                __debugbreak();                             \
            else {                                          \
                assert(false && "Exception during assert"); \
            }                                               \
        }                                                   \
    } while (0)
#else
#define Y_ASSERT_NOBT(a)                       \
    do {                                       \
        if (false) {                           \
            bool __xxx = static_cast<bool>(a); \
            Y_UNUSED(__xxx);                   \
        }                                      \
    } while (0)
#endif

#include <pthread.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <memory.h>
#include <new>
#include <errno.h>

#if defined(_linux_)
#include <linux/futex.h>
#include <sys/syscall.h>
#if !defined(MADV_HUGEPAGE)
#define MADV_HUGEPAGE 14
#endif
#if !defined(MAP_HUGETLB)
#define MAP_HUGETLB 0x40000
#endif
#endif

#define PERTHREAD __thread

#endif

#ifndef _darwin_

#ifndef Y_ARRAY_SIZE
#define Y_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#endif

#ifndef NDEBUG
#define DBG_FILL_MEMORY
static bool FillMemoryOnAllocation = true;
#endif

static bool TransparentHugePages = false; // force MADV_HUGEPAGE for large allocs
static bool MapHugeTLB = false;           // force MAP_HUGETLB for small allocs
static bool EnableDefrag = true;

// Buffers that are larger than this size will not be filled with 0xcf
#ifndef DBG_FILL_MAX_SIZE
#define DBG_FILL_MAX_SIZE 0x01000000000000ULL
#endif

template <class T>
inline T* DoCas(T* volatile* target, T* exchange, T* compare) {
#if defined(__has_builtin) && __has_builtin(__sync_val_compare_and_swap)
    return __sync_val_compare_and_swap(target, compare, exchange);
#elif defined(_WIN32)
#ifdef _64_
    return (T*)_InterlockedCompareExchange64((__int64*)target, (__int64)exchange, (__int64)compare);
#else
    //return (T*)InterlockedCompareExchangePointer(targetVoidP, exchange, compare);
    return (T*)_InterlockedCompareExchange((LONG*)target, (LONG)exchange, (LONG)compare);
#endif
#elif defined(__i386) || defined(__x86_64__)
    union {
        T* volatile* NP;
        void* volatile* VoidP;
    } gccSucks;
    gccSucks.NP = target;
    void* volatile* targetVoidP = gccSucks.VoidP;

    __asm__ __volatile__(
        "lock\n\t"
        "cmpxchg %2,%0\n\t"
        : "+m"(*(targetVoidP)), "+a"(compare)
        : "r"(exchange)
        : "cc", "memory");
    return compare;
#else
#error inline_cas not defined for this platform
#endif
}

#ifdef _64_
const uintptr_t N_MAX_WORKSET_SIZE = 0x100000000ll * 200;
const uintptr_t N_HUGE_AREA_FINISH = 0x700000000000ll;
#ifndef _freebsd_
const uintptr_t LINUX_MMAP_AREA_START = 0x100000000ll;
static uintptr_t volatile linuxAllocPointer = LINUX_MMAP_AREA_START;
static uintptr_t volatile linuxAllocPointerHuge = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE;
#endif
#else
const uintptr_t N_MAX_WORKSET_SIZE = 0xffffffff;
#endif
#define ALLOC_START ((char*)0)

const size_t N_CHUNK_SIZE = 1024 * 1024;
const size_t N_CHUNKS = N_MAX_WORKSET_SIZE / N_CHUNK_SIZE;
const size_t N_LARGE_ALLOC_SIZE = N_CHUNK_SIZE * 128;

// map size idx to size in bytes
#ifdef LFALLOC_YT
const int N_SIZES = 27;
#else
const int N_SIZES = 25;
#endif
const int nSizeIdxToSize[N_SIZES] = {
    -1,
#if defined(_64_)
    16, 16, 32, 32, 48, 64, 96, 128,
#else
    8,
    16,
    24,
    32,
    48,
    64,
    96,
    128,
#endif
    192, 256, 384, 512, 768, 1024, 1536, 2048,
    3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768,
#ifdef LFALLOC_YT
    49152, 65536
#endif
};
#ifdef LFALLOC_YT
const size_t N_MAX_FAST_SIZE = 65536;
#else
const size_t N_MAX_FAST_SIZE = 32768;
#endif
const unsigned char size2idxArr1[64 + 1] = {
    1,
#if defined(_64_)
    2, 2, 4, 4, // 16, 16, 32, 32
#else
    1, 2, 3, 4, // 8, 16, 24, 32
#endif
    5, 5, 6, 6,                                                     // 48, 64
    7, 7, 7, 7, 8, 8, 8, 8,                                         // 96, 128
    9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10,         // 192, 256
    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // 384
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12  // 512
};
#ifdef LFALLOC_YT
const unsigned char size2idxArr2[256] = {
#else
const unsigned char size2idxArr2[128] = {
#endif
    12, 12, 13, 14,                                                 // 512, 512, 768, 1024
    15, 15, 16, 16,                                                 // 1536, 2048
    17, 17, 17, 17, 18, 18, 18, 18,                                 // 3072, 4096
    19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, // 6144, 8192
    21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, // 12288
    22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, // 16384
    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, // 24576
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, // 32768
#ifdef LFALLOC_YT
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, // 49152
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, // 65536
#endif
};

// map entry number to size idx
// special size idx's: 0 = not used, -1 = mem locked, but not allocated
static volatile char chunkSizeIdx[N_CHUNKS];
const int FREE_CHUNK_ARR_BUF = 0x20000; // this is effectively 128G of free memory (with 1M chunks), should not be exhausted actually
static volatile uintptr_t freeChunkArr[FREE_CHUNK_ARR_BUF];
static volatile int freeChunkCount;

static void AddFreeChunk(uintptr_t chunkId) {
    chunkSizeIdx[chunkId] = -1;
    if (Y_UNLIKELY(freeChunkCount == FREE_CHUNK_ARR_BUF))
        NMalloc::AbortFromCorruptedAllocator("free chunks array overflowed");
    freeChunkArr[freeChunkCount++] = chunkId;
}

static bool GetFreeChunk(uintptr_t* res) {
    if (freeChunkCount == 0) {
        *res = 0;
        return false;
    }
    *res = freeChunkArr[--freeChunkCount];
    return true;
}

//////////////////////////////////////////////////////////////////////////
enum ELFAllocCounter {
    CT_USER_ALLOC,     // accumulated size requested by user code
    CT_MMAP,           // accumulated mmapped size
    CT_MMAP_CNT,       // number of mmapped regions
    CT_MUNMAP,         // accumulated unmmapped size
    CT_MUNMAP_CNT,     // number of munmaped regions
    CT_SYSTEM_ALLOC,   // accumulated allocated size for internal lfalloc needs
    CT_SYSTEM_FREE,    // accumulated deallocated size for internal lfalloc needs
    CT_SMALL_ALLOC,    // accumulated allocated size for fixed-size blocks
    CT_SMALL_FREE,     // accumulated deallocated size for fixed-size blocks
    CT_LARGE_ALLOC,    // accumulated allocated size for large blocks
    CT_LARGE_FREE,     // accumulated deallocated size for large blocks
    CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations
    CT_DEGRAGMENT_CNT, // number of memory defragmentations
    CT_MAX
};

static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value);

//////////////////////////////////////////////////////////////////////////
enum EMMapMode {
    MM_NORMAL, // memory for small allocs
    MM_HUGE    // memory for large allocs
};

#ifndef _MSC_VER
inline void VerifyMmapResult(void* result) {
    if (Y_UNLIKELY(result == MAP_FAILED))
        NMalloc::AbortFromCorruptedAllocator("negative size requested? or just out of mem");
}
#endif

#if !defined(_MSC_VER) && !defined(_freebsd_) && defined(_64_)
static char* AllocWithMMapLinuxImpl(uintptr_t sz, EMMapMode mode) {
    char* volatile* areaPtr;
    char* areaStart;
    uintptr_t areaFinish;

    int mapProt = PROT_READ | PROT_WRITE;
    int mapFlags = MAP_PRIVATE | MAP_ANON;

    if (mode == MM_HUGE) {
        areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointerHuge);
        areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE);
        areaFinish = N_HUGE_AREA_FINISH;
    } else {
        areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointer);
        areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START);
        areaFinish = N_MAX_WORKSET_SIZE;

        if (MapHugeTLB) {
            mapFlags |= MAP_HUGETLB;
        }
    }

    bool wrapped = false;
    for (;;) {
        char* prevAllocPtr = *areaPtr;
        char* nextAllocPtr = prevAllocPtr + sz;
        if (uintptr_t(nextAllocPtr - (char*)nullptr) >= areaFinish) {
            if (Y_UNLIKELY(wrapped)) {
                NMalloc::AbortFromCorruptedAllocator("virtual memory is over fragmented");
            }
            // wrap after all area is used
            DoCas(areaPtr, areaStart, prevAllocPtr);
            wrapped = true;
            continue;
        }

        if (DoCas(areaPtr, nextAllocPtr, prevAllocPtr) != prevAllocPtr)
            continue;

        char* largeBlock = (char*)mmap(prevAllocPtr, sz, mapProt, mapFlags, -1, 0);
        VerifyMmapResult(largeBlock);
        if (largeBlock == prevAllocPtr)
            return largeBlock;
        if (largeBlock)
            munmap(largeBlock, sz);

        if (sz < 0x80000) {
            // skip utilized area with big steps
            DoCas(areaPtr, nextAllocPtr + 0x10 * 0x10000, nextAllocPtr);
        }
    }
}
#endif

static char* AllocWithMMap(uintptr_t sz, EMMapMode mode) {
    (void)mode;
#ifdef _MSC_VER
    char* largeBlock = (char*)VirtualAlloc(0, sz, MEM_RESERVE, PAGE_READWRITE);
    if (Y_UNLIKELY(largeBlock == nullptr))
        NMalloc::AbortFromCorruptedAllocator("out of memory");
    if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE))
        NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken");
#else
#if defined(_freebsd_) || !defined(_64_)
    char* largeBlock = (char*)mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
    VerifyMmapResult(largeBlock);
    if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE))
        NMalloc::AbortFromCorruptedAllocator("out of working set, something has broken");
#else
    char* largeBlock = AllocWithMMapLinuxImpl(sz, mode);
    if (TransparentHugePages) {
        madvise(largeBlock, sz, MADV_HUGEPAGE);
    }
#endif
#endif
    Y_ASSERT_NOBT(largeBlock);
    IncrementCounter(CT_MMAP, sz);
    IncrementCounter(CT_MMAP_CNT, 1);
    return largeBlock;
}

enum class ELarge : ui8 {
    Free = 0,   // block in free cache
    Alloc = 1,  // block is allocated
    Gone = 2,   // block was unmapped
};

struct TLargeBlk {

    static TLargeBlk* As(void *raw) {
        return reinterpret_cast<TLargeBlk*>((char*)raw - 4096ll);
    }

    static const TLargeBlk* As(const void *raw) {
        return reinterpret_cast<const TLargeBlk*>((const char*)raw - 4096ll);
    }

    void SetSize(size_t bytes, size_t pages) {
        Pages = pages;
        Bytes = bytes;
    }

    void Mark(ELarge state) {
        const ui64 marks[] = {
            0x8b38aa5ca4953c98, // ELarge::Free
            0xf916d33584eb5087, // ELarge::Alloc
            0xd33b0eca7651bc3f  // ELarge::Gone
        };

        Token = size_t(marks[ui8(state)]);
    }

    size_t Pages; // Total pages allocated with mmap like call
    size_t Bytes; // Actually requested bytes by user
    size_t Token; // Block state token, see ELarge enum.
};


static void LargeBlockUnmap(void* p, size_t pages) {
    const auto bytes = (pages + 1) * uintptr_t(4096);

    IncrementCounter(CT_MUNMAP, bytes);
    IncrementCounter(CT_MUNMAP_CNT, 1);
#ifdef _MSC_VER
    Y_ASSERT_NOBT(0);
#else
    TLargeBlk::As(p)->Mark(ELarge::Gone);
    munmap((char*)p - 4096ll, bytes);
#endif
}

//////////////////////////////////////////////////////////////////////////
const size_t LB_BUF_SIZE = 250;
const size_t LB_BUF_HASH = 977;
static int LB_LIMIT_TOTAL_SIZE = 500 * 1024 * 1024 / 4096; // do not keep more then this mem total in lbFreePtrs[]
static void* volatile lbFreePtrs[LB_BUF_HASH][LB_BUF_SIZE];
static TAtomic lbFreePageCount;


static void* LargeBlockAlloc(size_t _nSize, ELFAllocCounter counter) {
    size_t pgCount = (_nSize + 4095) / 4096;
#ifdef _MSC_VER
    char* pRes = (char*)VirtualAlloc(0, (pgCount + 1) * 4096ll, MEM_COMMIT, PAGE_READWRITE);
    if (Y_UNLIKELY(pRes == 0)) {
        NMalloc::AbortFromCorruptedAllocator("out of memory");
    }
#else

    IncrementCounter(counter, pgCount * 4096ll);
    IncrementCounter(CT_SYSTEM_ALLOC, 4096ll);

    int lbHash = pgCount % LB_BUF_HASH;
    for (int i = 0; i < LB_BUF_SIZE; ++i) {
        void* p = lbFreePtrs[lbHash][i];
        if (p == nullptr)
            continue;
        if (DoCas(&lbFreePtrs[lbHash][i], (void*)nullptr, p) == p) {
            size_t realPageCount = TLargeBlk::As(p)->Pages;
            if (realPageCount == pgCount) {
                AtomicAdd(lbFreePageCount, -pgCount);
                TLargeBlk::As(p)->Mark(ELarge::Alloc);
                return p;
            } else {
                if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) != (void*)nullptr) {
                    // block was freed while we were busy
                    AtomicAdd(lbFreePageCount, -realPageCount);
                    LargeBlockUnmap(p, realPageCount);
                    --i;
                }
            }
        }
    }
    char* pRes = AllocWithMMap((pgCount + 1) * 4096ll, MM_HUGE);
#endif
    pRes += 4096ll;
    TLargeBlk::As(pRes)->SetSize(_nSize, pgCount);
    TLargeBlk::As(pRes)->Mark(ELarge::Alloc);

    return pRes;
}

#ifndef _MSC_VER
static void FreeAllLargeBlockMem() {
    for (auto& lbFreePtr : lbFreePtrs) {
        for (int i = 0; i < LB_BUF_SIZE; ++i) {
            void* p = lbFreePtr[i];
            if (p == nullptr)
                continue;
            if (DoCas(&lbFreePtr[i], (void*)nullptr, p) == p) {
                int pgCount = TLargeBlk::As(p)->Pages;
                AtomicAdd(lbFreePageCount, -pgCount);
                LargeBlockUnmap(p, pgCount);
            }
        }
    }
}
#endif

static void LargeBlockFree(void* p, ELFAllocCounter counter) {
    if (p == nullptr)
        return;
#ifdef _MSC_VER
    VirtualFree((char*)p - 4096ll, 0, MEM_RELEASE);
#else
    size_t pgCount = TLargeBlk::As(p)->Pages;

    TLargeBlk::As(p)->Mark(ELarge::Free);
    IncrementCounter(counter, pgCount * 4096ll);
    IncrementCounter(CT_SYSTEM_FREE, 4096ll);

    if (lbFreePageCount > LB_LIMIT_TOTAL_SIZE)
        FreeAllLargeBlockMem();
    int lbHash = pgCount % LB_BUF_HASH;
    for (int i = 0; i < LB_BUF_SIZE; ++i) {
        if (lbFreePtrs[lbHash][i] == nullptr) {
            if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) == nullptr) {
                AtomicAdd(lbFreePageCount, pgCount);
                return;
            }
        }
    }

    LargeBlockUnmap(p, pgCount);
#endif
}

static void* SystemAlloc(size_t _nSize) {
    //HeapAlloc(GetProcessHeap(), HEAP_GENERATE_EXCEPTIONS, _nSize);
    return LargeBlockAlloc(_nSize, CT_SYSTEM_ALLOC);
}
static void SystemFree(void* p) {
    //HeapFree(GetProcessHeap(), 0, p);
    LargeBlockFree(p, CT_SYSTEM_FREE);
}


//////////////////////////////////////////////////////////////////////////
char* const LF_LOCK_FREE = ((char*)0) + 0;
char* const LF_LOCK_LOCKED = ((char*)0) + 1;
char* const LF_LOCK_FUTEX_WAIT = ((char*)0) + 2;
static bool LFHasFutex = true;
static bool LFCheckedWinVersion = false;

// TLFLockData has to be zero-initialized explicitly https://en.cppreference.com/w/cpp/language/zero_initialization
// otherwise constructor TLFLockData() for global var might be called after first use
struct TLFLockData
{
    char* Pad1[15];
    char* volatile LockVar; // = LF_LOCK_FREE; // no constructor, zero-initialize manually
    char* Pad2[15];

    bool TryLock()
    {
        return (LockVar == LF_LOCK_FREE && DoCas(&LockVar, LF_LOCK_LOCKED, LF_LOCK_FREE) == LF_LOCK_FREE);
    }

    void FutexWait()
    {
#ifdef _win_
        if (!LFCheckedWinVersion) {
            OSVERSIONINFOA ver;
            memset(&ver, 0, sizeof(ver));
            ver.dwOSVersionInfoSize = sizeof(OSVERSIONINFOA);
            GetVersionExA(&ver);
            LFHasFutex = (ver.dwMajorVersion > 6) || (ver.dwMajorVersion == 6 && ver.dwMinorVersion >= 2);
            LFCheckedWinVersion = true;
        }
        if (LFHasFutex) {
            if (LockVar == LF_LOCK_LOCKED) {
                DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED);
            }
            if (LockVar == LF_LOCK_FUTEX_WAIT) {
                char* lockedValue = LF_LOCK_FUTEX_WAIT;
                WaitOnAddress(&LockVar, &lockedValue, sizeof(LockVar), INFINITE);
            }
        } else {
            SwitchToThread();
        }
#elif defined(_linux_)
        if (LFHasFutex) {
            if (LockVar == LF_LOCK_LOCKED) {
                DoCas(&LockVar, LF_LOCK_FUTEX_WAIT, LF_LOCK_LOCKED);
            }
            if (LockVar == LF_LOCK_FUTEX_WAIT) {
                // linux allow only int variables checks, here we pretend low bits of LockVar are int
                syscall(SYS_futex, &LockVar, FUTEX_WAIT_PRIVATE, *(int*)&LF_LOCK_FUTEX_WAIT, 0, 0, 0);
            }
        } else {
            sched_yield();
        }
#else
        sched_yield();
#endif
    }

    void Unlock()
    {
        Y_ASSERT_NOBT(LockVar != LF_LOCK_FREE);
        if (DoCas(&LockVar, LF_LOCK_FREE, LF_LOCK_LOCKED) != LF_LOCK_LOCKED) {
            Y_ASSERT_NOBT(LockVar == LF_LOCK_FUTEX_WAIT && LFHasFutex);
            LockVar = LF_LOCK_FREE;
#ifdef _win_
            WakeByAddressAll((PVOID)&LockVar);
#elif defined(_linux_)
            syscall(SYS_futex, &LockVar, FUTEX_WAKE_PRIVATE, INT_MAX, 0, 0, 0);
#endif
        }
    }
};

static TLFLockData LFGlobalLock;


class TLFLockHolder {
    TLFLockData *LockData = nullptr;
    int Attempt = 0;
    int SleepMask = 0x7f;

public:
    TLFLockHolder() {}
    TLFLockHolder(TLFLockData *lk) {
        while (!TryLock(lk));
    }
    bool TryLock(TLFLockData *lk)
    {
        Y_ASSERT_NOBT(LockData == nullptr);
        if (lk->TryLock()) {
            LockData = lk;
            return true;
        }
        if ((++Attempt & SleepMask) == 0) {
            lk->FutexWait();
            SleepMask = (SleepMask * 2 + 1) & 0x7fff;
        } else {
#ifdef _MSC_VER
            _mm_pause();
#elif defined(__i386) || defined(__x86_64__)
            __asm__ __volatile__("pause");
#endif
        }
        return false;
    }
    ~TLFLockHolder() {
        if (LockData) {
            LockData->Unlock();
        }
    }
};

//////////////////////////////////////////////////////////////////////////
class TLFAllocFreeList {
    struct TNode {
        TNode* Next;
    };

    TNode* volatile Head;
    TNode* volatile Pending;
    TAtomic PendingToFreeListCounter;
    TAtomic AllocCount;
    void* Padding;

    static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) {
        for (;;) {
            TNode* volatile prevHead = *headPtr;
            n->Next = prevHead;
            if (DoCas(headPtr, n, prevHead) == prevHead)
                break;
        }
    }
    Y_FORCE_INLINE void* DoAlloc() {
        TNode* res;
        for (res = Head; res; res = Head) {
            TNode* keepNext = res->Next;
            if (DoCas(&Head, keepNext, res) == res) {
                //Y_ABORT_UNLESS(keepNext == res->Next);
                break;
            }
        }
        return res;
    }
    void FreeList(TNode* fl) {
        if (!fl)
            return;
        TNode* flTail = fl;
        while (flTail->Next)
            flTail = flTail->Next;
        for (;;) {
            TNode* volatile prevHead = Head;
            flTail->Next = prevHead;
            if (DoCas(&Head, fl, prevHead) == prevHead)
                break;
        }
    }

public:
    Y_FORCE_INLINE void Free(void* ptr) {
        TNode* newFree = (TNode*)ptr;
        if (AtomicAdd(AllocCount, 0) == 0)
            Enqueue(&Head, newFree);
        else
            Enqueue(&Pending, newFree);
    }
    Y_FORCE_INLINE void* Alloc() {
        TAtomic keepCounter = AtomicAdd(PendingToFreeListCounter, 0);
        TNode* fl = Pending;
        if (AtomicAdd(AllocCount, 1) == 1) {
            // No other allocs in progress.
            // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads.
            // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList
            if (fl && keepCounter == AtomicAdd(PendingToFreeListCounter, 0) && DoCas(&Pending, (TNode*)nullptr, fl) == fl) {
                // pick first element from Pending and return it
                void* res = fl;
                fl = fl->Next;
                // if there are other elements in Pending list, add them to main free list
                FreeList(fl);
                AtomicAdd(PendingToFreeListCounter, 1);
                AtomicAdd(AllocCount, -1);
                return res;
            }
        }
        void* res = DoAlloc();
        AtomicAdd(AllocCount, -1);
        return res;
    }
    void* GetWholeList() {
        TNode* res;
        for (res = Head; res; res = Head) {
            if (DoCas(&Head, (TNode*)nullptr, res) == res)
                break;
        }
        return res;
    }
    void ReturnWholeList(void* ptr) {
        while (AtomicAdd(AllocCount, 0) != 0) // theoretically can run into problems with parallel DoAlloc()
            ;                                 //ThreadYield();
        for (;;) {
            TNode* prevHead = Head;
            if (DoCas(&Head, (TNode*)ptr, prevHead) == prevHead) {
                FreeList(prevHead);
                break;
            }
        }
    }
};

/////////////////////////////////////////////////////////////////////////
static TLFAllocFreeList globalFreeLists[N_SIZES];
static char* volatile globalCurrentPtr[N_SIZES];
static TLFAllocFreeList blockFreeList;

// globalFreeLists[] contains TFreeListGroup, each of them points up to 15 free blocks
const int FL_GROUP_SIZE = 15;
struct TFreeListGroup {
    TFreeListGroup* Next;
    char* Ptrs[FL_GROUP_SIZE];
};
#ifdef _64_
const int FREE_LIST_GROUP_SIZEIDX = 8;
#else
const int FREE_LIST_GROUP_SIZEIDX = 6;
#endif

//////////////////////////////////////////////////////////////////////////
// find free chunks and reset chunk size so they can be reused by different sized allocations
// do not look at blockFreeList (TFreeListGroup has same size for any allocations)
static bool DefragmentMem() {
    if (!EnableDefrag) {
        return false;
    }

    IncrementCounter(CT_DEGRAGMENT_CNT, 1);

    int* nFreeCount = (int*)SystemAlloc(N_CHUNKS * sizeof(int));
    if (Y_UNLIKELY(!nFreeCount)) {
        //__debugbreak();
        NMalloc::AbortFromCorruptedAllocator("debugbreak");
    }
    memset(nFreeCount, 0, N_CHUNKS * sizeof(int));

    TFreeListGroup* wholeLists[N_SIZES];
    for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) {
        wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList();
        for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) {
            for (auto pData : g->Ptrs) {
                if (pData) {
                    uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE;
                    ++nFreeCount[nChunk];
                    Y_ASSERT_NOBT(chunkSizeIdx[nChunk] == nSizeIdx);
                }
            }
        }
    }

    bool bRes = false;
    for (size_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) {
        int fc = nFreeCount[nChunk];
        nFreeCount[nChunk] = 0;
        if (chunkSizeIdx[nChunk] <= 0)
            continue;
        int nEntries = N_CHUNK_SIZE / nSizeIdxToSize[static_cast<int>(chunkSizeIdx[nChunk])];
        Y_ASSERT_NOBT(fc <= nEntries); // can not have more free blocks then total count
        if (fc == nEntries) {
            bRes = true;
            nFreeCount[nChunk] = 1;
        }
    }
    if (bRes) {
        for (auto& wholeList : wholeLists) {
            TFreeListGroup** ppPtr = &wholeList;
            while (*ppPtr) {
                TFreeListGroup* g = *ppPtr;
                int dst = 0;
                for (auto pData : g->Ptrs) {
                    if (pData) {
                        uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE;
                        if (nFreeCount[nChunk] == 0)
                            g->Ptrs[dst++] = pData; // block is not freed, keep pointer
                    }
                }
                if (dst == 0) {
                    // no valid pointers in group, free it
                    *ppPtr = g->Next;
                    blockFreeList.Free(g);
                } else {
                    // reset invalid pointers to 0
                    for (int i = dst; i < FL_GROUP_SIZE; ++i)
                        g->Ptrs[i] = nullptr;
                    ppPtr = &g->Next;
                }
            }
        }
        for (uintptr_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) {
            if (!nFreeCount[nChunk])
                continue;
            char* pStart = ALLOC_START + nChunk * N_CHUNK_SIZE;
#ifdef _win_
            VirtualFree(pStart, N_CHUNK_SIZE, MEM_DECOMMIT);
#elif defined(_freebsd_)
            madvise(pStart, N_CHUNK_SIZE, MADV_FREE);
#else
            madvise(pStart, N_CHUNK_SIZE, MADV_DONTNEED);
#endif
            AddFreeChunk(nChunk);
        }
    }

    for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx)
        globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]);

    SystemFree(nFreeCount);
    return bRes;
}

static Y_FORCE_INLINE void* LFAllocFromCurrentChunk(int nSizeIdx, int blockSize, int count) {
    char* volatile* pFreeArray = &globalCurrentPtr[nSizeIdx];
    while (char* newBlock = *pFreeArray) {
        char* nextFree = newBlock + blockSize * count;

        // check if there is space in chunk
        char* globalEndPtr = ALLOC_START + ((newBlock - ALLOC_START) & ~((uintptr_t)N_CHUNK_SIZE - 1)) + N_CHUNK_SIZE;
        if (nextFree >= globalEndPtr) {
            if (nextFree > globalEndPtr)
                break;
            nextFree = nullptr; // it was last block in chunk
        }
        if (DoCas(pFreeArray, nextFree, newBlock) == newBlock)
            return newBlock;
    }
    return nullptr;
}

enum EDefrag {
    MEM_DEFRAG,
    NO_MEM_DEFRAG,
};

static void* SlowLFAlloc(int nSizeIdx, int blockSize, EDefrag defrag) {
    IncrementCounter(CT_SLOW_ALLOC_CNT, 1);

    TLFLockHolder ls;
    for (;;) {
        bool locked = ls.TryLock(&LFGlobalLock);
        void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1);
        if (res) {
            return res; // might happen when other thread allocated new current chunk
        }
        if (locked) {
            break;
        }
    }
    for (;;) {
        uintptr_t nChunk;
        if (GetFreeChunk(&nChunk)) {
            char* newPlace = ALLOC_START + nChunk * N_CHUNK_SIZE;
#ifdef _MSC_VER
            void* pTest = VirtualAlloc(newPlace, N_CHUNK_SIZE, MEM_COMMIT, PAGE_READWRITE);
            Y_ASSERT_NOBT(pTest == newPlace);
#endif
            chunkSizeIdx[nChunk] = (char)nSizeIdx;
            globalCurrentPtr[nSizeIdx] = newPlace + blockSize;
            return newPlace;
        }

        // out of luck, try to defrag
        if (defrag == MEM_DEFRAG && DefragmentMem()) {
            continue;
        }

        char* largeBlock = AllocWithMMap(N_LARGE_ALLOC_SIZE, MM_NORMAL);
        uintptr_t addr = ((largeBlock - ALLOC_START) + N_CHUNK_SIZE - 1) & (~(N_CHUNK_SIZE - 1));
        uintptr_t endAddr = ((largeBlock - ALLOC_START) + N_LARGE_ALLOC_SIZE) & (~(N_CHUNK_SIZE - 1));
        for (uintptr_t p = addr; p < endAddr; p += N_CHUNK_SIZE) {
            uintptr_t chunk = p / N_CHUNK_SIZE;
            Y_ASSERT_NOBT(chunk * N_CHUNK_SIZE == p);
            Y_ASSERT_NOBT(chunkSizeIdx[chunk] == 0);
            AddFreeChunk(chunk);
        }
    }
    return nullptr;
}

// allocate single block
static Y_FORCE_INLINE void* LFAllocNoCache(int nSizeIdx, EDefrag defrag) {
    int blockSize = nSizeIdxToSize[nSizeIdx];
    void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1);
    if (res)
        return res;

    return SlowLFAlloc(nSizeIdx, blockSize, defrag);
}

// allocate multiple blocks, returns number of blocks allocated (max FL_GROUP_SIZE)
// buf should have space for at least FL_GROUP_SIZE elems
static Y_FORCE_INLINE int LFAllocNoCacheMultiple(int nSizeIdx, char** buf) {
    int blockSize = nSizeIdxToSize[nSizeIdx];
    void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, FL_GROUP_SIZE);
    if (res) {
        char* resPtr = (char*)res;
        for (int k = 0; k < FL_GROUP_SIZE; ++k) {
            buf[k] = resPtr;
            resPtr += blockSize;
        }
        return FL_GROUP_SIZE;
    }
    buf[0] = (char*)SlowLFAlloc(nSizeIdx, blockSize, MEM_DEFRAG);
    return 1;
}

// take several blocks from global free list (max FL_GROUP_SIZE blocks), returns number of blocks taken
// buf should have space for at least FL_GROUP_SIZE elems
static Y_FORCE_INLINE int TakeBlocksFromGlobalFreeList(int nSizeIdx, char** buf) {
    TLFAllocFreeList& fl = globalFreeLists[nSizeIdx];
    TFreeListGroup* g = (TFreeListGroup*)fl.Alloc();
    if (g) {
        int resCount = 0;
        for (auto& ptr : g->Ptrs) {
            if (ptr)
                buf[resCount++] = ptr;
            else
                break;
        }
        blockFreeList.Free(g);
        return resCount;
    }
    return 0;
}

// add several blocks to global free list
static Y_FORCE_INLINE void PutBlocksToGlobalFreeList(ptrdiff_t nSizeIdx, char** buf, int count) {
    for (int startIdx = 0; startIdx < count;) {
        TFreeListGroup* g = (TFreeListGroup*)blockFreeList.Alloc();
        Y_ASSERT_NOBT(sizeof(TFreeListGroup) == nSizeIdxToSize[FREE_LIST_GROUP_SIZEIDX]);
        if (!g) {
            g = (TFreeListGroup*)LFAllocNoCache(FREE_LIST_GROUP_SIZEIDX, NO_MEM_DEFRAG);
        }

        int groupSize = count - startIdx;
        if (groupSize > FL_GROUP_SIZE)
            groupSize = FL_GROUP_SIZE;
        for (int i = 0; i < groupSize; ++i)
            g->Ptrs[i] = buf[startIdx + i];
        for (int i = groupSize; i < FL_GROUP_SIZE; ++i)
            g->Ptrs[i] = nullptr;

        // add free group to the global list
        TLFAllocFreeList& fl = globalFreeLists[nSizeIdx];
        fl.Free(g);

        startIdx += groupSize;
    }
}

//////////////////////////////////////////////////////////////////////////
static TAtomic GlobalCounters[CT_MAX];
const int MAX_LOCAL_UPDATES = 100;
const intptr_t MAX_LOCAL_DELTA = 1*1024*1024;

struct TLocalCounter {
    intptr_t Value;
    int Updates;
    TAtomic* Parent;

    Y_FORCE_INLINE void Init(TAtomic* parent) {
        Parent = parent;
        Value = 0;
        Updates = 0;
    }

    Y_FORCE_INLINE void Increment(size_t value) {
        Value += value;
        if (++Updates > MAX_LOCAL_UPDATES || Value > MAX_LOCAL_DELTA) {
            Flush();
        }
    }

    Y_FORCE_INLINE void Flush() {
        AtomicAdd(*Parent, Value);
        Value = 0;
        Updates = 0;
    }
};

////////////////////////////////////////////////////////////////////////////////
// DBG stuff
////////////////////////////////////////////////////////////////////////////////

#if defined(LFALLOC_DBG)

struct TPerTagAllocCounter {
    TAtomic Size;
    TAtomic Count;

    Y_FORCE_INLINE void Alloc(size_t size) {
        AtomicAdd(Size, size);
        AtomicAdd(Count, 1);
    }

    Y_FORCE_INLINE void Free(size_t size) {
        AtomicSub(Size, size);
        AtomicSub(Count, 1);
    }
};

struct TLocalPerTagAllocCounter {
    intptr_t Size;
    int Count;
    int Updates;

    Y_FORCE_INLINE void Init() {
        Size = 0;
        Count = 0;
        Updates = 0;
    }

    Y_FORCE_INLINE void Alloc(TPerTagAllocCounter& parent, size_t size) {
        Size += size;
        ++Count;
        if (++Updates > MAX_LOCAL_UPDATES) {
            Flush(parent);
        }
    }

    Y_FORCE_INLINE void Free(TPerTagAllocCounter& parent, size_t size) {
        Size -= size;
        --Count;
        if (++Updates > MAX_LOCAL_UPDATES) {
            Flush(parent);
        }
    }

    Y_FORCE_INLINE void Flush(TPerTagAllocCounter& parent) {
        AtomicAdd(parent.Size, Size);
        Size = 0;
        AtomicAdd(parent.Count, Count);
        Count = 0;
        Updates = 0;
    }
};

static const int DBG_ALLOC_MAX_TAG = 1000;
static const int DBG_ALLOC_ALIGNED_TAG = 0xF0000000;
static const int DBG_ALLOC_NUM_SIZES = 30;
static TPerTagAllocCounter GlobalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES];

#endif // LFALLOC_DBG

//////////////////////////////////////////////////////////////////////////
const int THREAD_BUF = 256;
static int borderSizes[N_SIZES];
const int MAX_MEM_PER_SIZE_PER_THREAD = 512 * 1024;
struct TThreadAllocInfo {
    // FreePtrs - pointers to first free blocks in per thread block list
    // LastFreePtrs - pointers to last blocks in lists, may be invalid if FreePtr is zero
    char* FreePtrs[N_SIZES][THREAD_BUF];
    int FreePtrIndex[N_SIZES];
    TThreadAllocInfo* pNextInfo;
    TLocalCounter LocalCounters[CT_MAX];

#if defined(LFALLOC_DBG)
    TLocalPerTagAllocCounter LocalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES];
#endif
#ifdef _win_
    HANDLE hThread;
#endif

    void Init(TThreadAllocInfo** pHead) {
        memset(this, 0, sizeof(*this));
        for (auto& i : FreePtrIndex)
            i = THREAD_BUF;
#ifdef _win_
        BOOL b = DuplicateHandle(
            GetCurrentProcess(), GetCurrentThread(),
            GetCurrentProcess(), &hThread,
            0, FALSE, DUPLICATE_SAME_ACCESS);
        Y_ASSERT_NOBT(b);
#endif
        pNextInfo = *pHead;
        *pHead = this;
        for (int k = 0; k < N_SIZES; ++k) {
            int maxCount = MAX_MEM_PER_SIZE_PER_THREAD / nSizeIdxToSize[k];
            if (maxCount > THREAD_BUF)
                maxCount = THREAD_BUF;
            borderSizes[k] = THREAD_BUF - maxCount;
        }
        for (int i = 0; i < CT_MAX; ++i) {
            LocalCounters[i].Init(&GlobalCounters[i]);
        }
#if defined(LFALLOC_DBG)
        for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
            for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
                auto& local = LocalPerTagAllocCounters[tag][sizeIdx];
                local.Init();
            }
        }
#endif
    }
    void Done() {
        for (auto sizeIdx : FreePtrIndex) {
            Y_ASSERT_NOBT(sizeIdx == THREAD_BUF);
        }
        for (auto& localCounter : LocalCounters) {
            localCounter.Flush();
        }
#if defined(LFALLOC_DBG)
        for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
            for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
                auto& local = LocalPerTagAllocCounters[tag][sizeIdx];
                auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
                local.Flush(global);
            }
        }
#endif
#ifdef _win_
        if (hThread)
            CloseHandle(hThread);
#endif
    }
};
PERTHREAD TThreadAllocInfo* pThreadInfo;
static TThreadAllocInfo* pThreadInfoList;

static TLFLockData LFLockThreadInfo;

static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value) {
#ifdef LFALLOC_YT
    TThreadAllocInfo* thr = pThreadInfo;
    if (thr) {
        thr->LocalCounters[counter].Increment(value);
    } else {
        AtomicAdd(GlobalCounters[counter], value);
    }
#endif
}

extern "C" i64 GetLFAllocCounterFast(int counter) {
#ifdef LFALLOC_YT
    return GlobalCounters[counter];
#else
    return 0;
#endif
}

extern "C" i64 GetLFAllocCounterFull(int counter) {
#ifdef LFALLOC_YT
    i64 ret = GlobalCounters[counter];
    {
        TLFLockHolder ll(&LFLockThreadInfo);
        for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
            TThreadAllocInfo* pInfo = *p;
            ret += pInfo->LocalCounters[counter].Value;
            p = &pInfo->pNextInfo;
        }
    }
    return ret;
#else
    return 0;
#endif
}

static void MoveSingleThreadFreeToGlobal(TThreadAllocInfo* pInfo) {
    for (int sizeIdx = 0; sizeIdx < N_SIZES; ++sizeIdx) {
        int& freePtrIdx = pInfo->FreePtrIndex[sizeIdx];
        char** freePtrs = pInfo->FreePtrs[sizeIdx];
        PutBlocksToGlobalFreeList(sizeIdx, freePtrs + freePtrIdx, THREAD_BUF - freePtrIdx);
        freePtrIdx = THREAD_BUF;
    }
}

#ifdef _win_
static bool IsDeadThread(TThreadAllocInfo* pInfo) {
    DWORD dwExit;
    bool isDead = !GetExitCodeThread(pInfo->hThread, &dwExit) || dwExit != STILL_ACTIVE;
    return isDead;
}

static void CleanupAfterDeadThreads() {
    TLFLockHolder ll(&LFLockThreadInfo);
    for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
        TThreadAllocInfo* pInfo = *p;
        if (IsDeadThread(pInfo)) {
            MoveSingleThreadFreeToGlobal(pInfo);
            pInfo->Done();
            *p = pInfo->pNextInfo;
            SystemFree(pInfo);
        } else
            p = &pInfo->pNextInfo;
    }
}
#endif

#ifndef _win_
static pthread_key_t ThreadCacheCleaner;
static void* volatile ThreadCacheCleanerStarted; // 0 = not started, -1 = started, -2 = is starting
static PERTHREAD bool IsStoppingThread;

static void FreeThreadCache(void*) {
    TThreadAllocInfo* pToDelete = nullptr;
    {
        TLFLockHolder ll(&LFLockThreadInfo);
        pToDelete = pThreadInfo;
        if (pToDelete == nullptr)
            return;

        // remove from the list
        for (TThreadAllocInfo** p = &pThreadInfoList; *p; p = &(*p)->pNextInfo) {
            if (*p == pToDelete) {
                *p = pToDelete->pNextInfo;
                break;
            }
        }
        IsStoppingThread = true;
        pThreadInfo = nullptr;
    }

    // free per thread buf
    MoveSingleThreadFreeToGlobal(pToDelete);
    pToDelete->Done();
    SystemFree(pToDelete);
}
#endif

static void AllocThreadInfo() {
#ifndef _win_
    if (DoCas(&ThreadCacheCleanerStarted, (void*)-2, (void*)nullptr) == (void*)nullptr) {
        pthread_key_create(&ThreadCacheCleaner, FreeThreadCache);
        ThreadCacheCleanerStarted = (void*)-1;
    }
    if (ThreadCacheCleanerStarted != (void*)-1)
        return; // do not use ThreadCacheCleaner until it is constructed

    {
        if (IsStoppingThread)
            return;
        TLFLockHolder ll(&LFLockThreadInfo);
        if (IsStoppingThread) // better safe than sorry
            return;

        pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo));
        pThreadInfo->Init(&pThreadInfoList);
    }
    pthread_setspecific(ThreadCacheCleaner, (void*)-1); // without value destructor will not be called
#else
    CleanupAfterDeadThreads();
    {
        pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo));
        TLFLockHolder ll(&LFLockThreadInfo);
        pThreadInfo->Init(&pThreadInfoList);
    }
#endif
}

    //////////////////////////////////////////////////////////////////////////
    // DBG stuff
    //////////////////////////////////////////////////////////////////////////

#if defined(LFALLOC_DBG)

struct TAllocHeader {
    uint64_t Size;
    int Tag;
    int Cookie;
};

// should be power of 2
static_assert(sizeof(TAllocHeader) == 16);

static inline void* GetAllocPtr(TAllocHeader* p) {
    return p + 1;
}

static inline TAllocHeader* GetAllocHeader(void* p) {
    return ((TAllocHeader*)p) - 1;
}

// if present, uses the fake header stored by LFPosixMemalign() to retrieve the original header.
static inline TAllocHeader* GetOrigAllocHeader(void* p) {
    auto* header = GetAllocHeader(p);
    if (header->Tag == DBG_ALLOC_ALIGNED_TAG) {
        return (TAllocHeader*)header->Size;
    }

    return header;
}

PERTHREAD int AllocationTag;
extern "C" int SetThreadAllocTag(int tag) {
    int prevTag = AllocationTag;
    if (tag < DBG_ALLOC_MAX_TAG && tag >= 0) {
        AllocationTag = tag;
    }
    return prevTag;
}

PERTHREAD bool ProfileCurrentThread;
extern "C" bool SetProfileCurrentThread(bool newVal) {
    bool prevVal = ProfileCurrentThread;
    ProfileCurrentThread = newVal;
    return prevVal;
}

static volatile bool ProfileAllThreads;
extern "C" bool SetProfileAllThreads(bool newVal) {
    bool prevVal = ProfileAllThreads;
    ProfileAllThreads = newVal;
    return prevVal;
}

static volatile bool AllocationSamplingEnabled;
extern "C" bool SetAllocationSamplingEnabled(bool newVal) {
    bool prevVal = AllocationSamplingEnabled;
    AllocationSamplingEnabled = newVal;
    return prevVal;
}

static size_t AllocationSampleRate = 1000;
extern "C" size_t SetAllocationSampleRate(size_t newVal) {
    size_t prevVal = AllocationSampleRate;
    AllocationSampleRate = newVal;
    return prevVal;
}

static size_t AllocationSampleMaxSize = N_MAX_FAST_SIZE;
extern "C" size_t SetAllocationSampleMaxSize(size_t newVal) {
    size_t prevVal = AllocationSampleMaxSize;
    AllocationSampleMaxSize = newVal;
    return prevVal;
}

using TAllocationCallback = int(int tag, size_t size, int sizeIdx);
static TAllocationCallback* AllocationCallback;
extern "C" TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) {
    TAllocationCallback* prevVal = AllocationCallback;
    AllocationCallback = newVal;
    return prevVal;
}

using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx);
static TDeallocationCallback* DeallocationCallback;
extern "C" TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) {
    TDeallocationCallback* prevVal = DeallocationCallback;
    DeallocationCallback = newVal;
    return prevVal;
}

PERTHREAD TAtomic AllocationsCount;
PERTHREAD bool InAllocationCallback;

static const int DBG_ALLOC_INVALID_COOKIE = -1;
static inline int SampleAllocation(TAllocHeader* p, int sizeIdx) {
    int cookie = DBG_ALLOC_INVALID_COOKIE;
    if (AllocationSamplingEnabled && (ProfileCurrentThread || ProfileAllThreads) && !InAllocationCallback) {
        if (p->Size > AllocationSampleMaxSize || ++AllocationsCount % AllocationSampleRate == 0) {
            if (AllocationCallback) {
                InAllocationCallback = true;
                cookie = AllocationCallback(p->Tag, p->Size, sizeIdx);
                InAllocationCallback = false;
            }
        }
    }
    return cookie;
}

static inline void SampleDeallocation(TAllocHeader* p, int sizeIdx) {
    if (p->Cookie != DBG_ALLOC_INVALID_COOKIE && !InAllocationCallback) {
        if (DeallocationCallback) {
            InAllocationCallback = true;
            DeallocationCallback(p->Cookie, p->Tag, p->Size, sizeIdx);
            InAllocationCallback = false;
        }
    }
}

static inline void TrackPerTagAllocation(TAllocHeader* p, int sizeIdx) {
    if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) {
        Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES);
        auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx];

        TThreadAllocInfo* thr = pThreadInfo;
        if (thr) {
            auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx];
            local.Alloc(global, p->Size);
        } else {
            global.Alloc(p->Size);
        }
    }
}

static inline void TrackPerTagDeallocation(TAllocHeader* p, int sizeIdx) {
    if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) {
        Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES);
        auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx];

        TThreadAllocInfo* thr = pThreadInfo;
        if (thr) {
            auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx];
            local.Free(global, p->Size);
        } else {
            global.Free(p->Size);
        }
    }
}

static void* TrackAllocation(void* ptr, size_t size, int sizeIdx) {
    TAllocHeader* p = (TAllocHeader*)ptr;
    p->Size = size;
    p->Tag = AllocationTag;
    p->Cookie = SampleAllocation(p, sizeIdx);
    TrackPerTagAllocation(p, sizeIdx);
    return GetAllocPtr(p);
}

static void TrackDeallocation(void* ptr, int sizeIdx) {
    TAllocHeader* p = (TAllocHeader*)ptr;
    SampleDeallocation(p, sizeIdx);
    TrackPerTagDeallocation(p, sizeIdx);
}

struct TPerTagAllocInfo {
    ssize_t Count;
    ssize_t Size;
};

extern "C" void GetPerTagAllocInfo(
    bool flushPerThreadCounters,
    TPerTagAllocInfo* info,
    int& maxTag,
    int& numSizes) {
    maxTag = DBG_ALLOC_MAX_TAG;
    numSizes = DBG_ALLOC_NUM_SIZES;

    if (info) {
        if (flushPerThreadCounters) {
            TLFLockHolder ll(&LFLockThreadInfo);
            for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
                TThreadAllocInfo* pInfo = *p;
                for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
                    for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
                        auto& local = pInfo->LocalPerTagAllocCounters[tag][sizeIdx];
                        auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
                        local.Flush(global);
                    }
                }
                p = &pInfo->pNextInfo;
            }
        }

        for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
            for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
                auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
                auto& res = info[tag * DBG_ALLOC_NUM_SIZES + sizeIdx];
                res.Count = global.Count;
                res.Size = global.Size;
            }
        }
    }
}

#endif // LFALLOC_DBG

//////////////////////////////////////////////////////////////////////////
static Y_FORCE_INLINE void* LFAllocImpl(size_t _nSize) {
#if defined(LFALLOC_DBG)
    size_t size = _nSize;
    _nSize += sizeof(TAllocHeader);
#endif

    IncrementCounter(CT_USER_ALLOC, _nSize);

    int nSizeIdx;
    if (_nSize > 512) {
        if (_nSize > N_MAX_FAST_SIZE) {
            void* ptr = LargeBlockAlloc(_nSize, CT_LARGE_ALLOC);
#if defined(LFALLOC_DBG)
            ptr = TrackAllocation(ptr, size, N_SIZES);
#endif
            return ptr;
        }
        nSizeIdx = size2idxArr2[(_nSize - 1) >> 8];
    } else
        nSizeIdx = size2idxArr1[1 + (((int)_nSize - 1) >> 3)];

    IncrementCounter(CT_SMALL_ALLOC, nSizeIdxToSize[nSizeIdx]);

    // check per thread buffer
    TThreadAllocInfo* thr = pThreadInfo;
    if (!thr) {
        AllocThreadInfo();
        thr = pThreadInfo;
        if (!thr) {
            void* ptr = LFAllocNoCache(nSizeIdx, MEM_DEFRAG);
#if defined(LFALLOC_DBG)
            ptr = TrackAllocation(ptr, size, nSizeIdx);
#endif
            return ptr;
        }
    }
    {
        int& freePtrIdx = thr->FreePtrIndex[nSizeIdx];
        if (freePtrIdx < THREAD_BUF) {
            void* ptr = thr->FreePtrs[nSizeIdx][freePtrIdx++];
#if defined(LFALLOC_DBG)
            ptr = TrackAllocation(ptr, size, nSizeIdx);
#endif
            return ptr;
        }

        // try to alloc from global free list
        char* buf[FL_GROUP_SIZE];
        int count = TakeBlocksFromGlobalFreeList(nSizeIdx, buf);
        if (count == 0) {
            count = LFAllocNoCacheMultiple(nSizeIdx, buf);
            if (count == 0) {
                NMalloc::AbortFromCorruptedAllocator("no way LFAllocNoCacheMultiple() can fail");
            }
        }
        char** dstBuf = thr->FreePtrs[nSizeIdx] + freePtrIdx - 1;
        for (int i = 0; i < count - 1; ++i)
            dstBuf[-i] = buf[i];
        freePtrIdx -= count - 1;
        void* ptr = buf[count - 1];
#if defined(LFALLOC_DBG)
        ptr = TrackAllocation(ptr, size, nSizeIdx);
#endif
        return ptr;
    }
}

static Y_FORCE_INLINE void* LFAlloc(size_t _nSize) {
    void* res = LFAllocImpl(_nSize);
#ifdef DBG_FILL_MEMORY
    if (FillMemoryOnAllocation && res && (_nSize <= DBG_FILL_MAX_SIZE)) {
        memset(res, 0xcf, _nSize);
    }
#endif
    return res;
}

static Y_FORCE_INLINE void LFFree(void* p) {
#if defined(LFALLOC_DBG)
    if (p == nullptr)
        return;
    p = GetOrigAllocHeader(p);
#endif

    uintptr_t chkOffset = ((char*)p - ALLOC_START) - 1ll;
    if (chkOffset >= N_MAX_WORKSET_SIZE) {
        if (p == nullptr)
            return;
#if defined(LFALLOC_DBG)
        TrackDeallocation(p, N_SIZES);
#endif
        LargeBlockFree(p, CT_LARGE_FREE);
        return;
    }

    uintptr_t chunk = ((char*)p - ALLOC_START) / N_CHUNK_SIZE;
    ptrdiff_t nSizeIdx = chunkSizeIdx[chunk];
    if (nSizeIdx <= 0) {
#if defined(LFALLOC_DBG)
        TrackDeallocation(p, N_SIZES);
#endif
        LargeBlockFree(p, CT_LARGE_FREE);
        return;
    }

#if defined(LFALLOC_DBG)
    TrackDeallocation(p, nSizeIdx);
#endif

#ifdef DBG_FILL_MEMORY
    memset(p, 0xfe, nSizeIdxToSize[nSizeIdx]);
#endif

    IncrementCounter(CT_SMALL_FREE, nSizeIdxToSize[nSizeIdx]);

    // try to store info to per thread buf
    TThreadAllocInfo* thr = pThreadInfo;
    if (thr) {
        int& freePtrIdx = thr->FreePtrIndex[nSizeIdx];
        if (freePtrIdx > borderSizes[nSizeIdx]) {
            thr->FreePtrs[nSizeIdx][--freePtrIdx] = (char*)p;
            return;
        }

        // move several pointers to global free list
        int freeCount = FL_GROUP_SIZE;
        if (freeCount > THREAD_BUF - freePtrIdx)
            freeCount = THREAD_BUF - freePtrIdx;
        char** freePtrs = thr->FreePtrs[nSizeIdx];
        PutBlocksToGlobalFreeList(nSizeIdx, freePtrs + freePtrIdx, freeCount);
        freePtrIdx += freeCount;

        freePtrs[--freePtrIdx] = (char*)p;

    } else {
        AllocThreadInfo();
        PutBlocksToGlobalFreeList(nSizeIdx, (char**)&p, 1);
    }
}

static size_t LFGetSize(const void* p) {
#if defined(LFALLOC_DBG)
    if (p == nullptr)
        return 0;
    return GetOrigAllocHeader(const_cast<void*>(p))->Size;
#endif

    uintptr_t chkOffset = ((const char*)p - ALLOC_START);
    if (chkOffset >= N_MAX_WORKSET_SIZE) {
        if (p == nullptr)
            return 0;
        return TLargeBlk::As(p)->Pages * 4096ll;
    }
    uintptr_t chunk = ((const char*)p - ALLOC_START) / N_CHUNK_SIZE;
    ptrdiff_t nSizeIdx = chunkSizeIdx[chunk];
    if (nSizeIdx <= 0)
        return TLargeBlk::As(p)->Pages * 4096ll;
    return nSizeIdxToSize[nSizeIdx];
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Output mem alloc stats
const int N_PAGE_SIZE = 4096;
static void DebugTraceMMgr(const char* pszFormat, ...) // __cdecl
{
    static char buff[20000];
    va_list va;
    //
    va_start(va, pszFormat);
    vsprintf(buff, pszFormat, va);
    va_end(va);
//
#ifdef _win_
    OutputDebugStringA(buff);
#else
    fputs(buff, stderr);
#endif
}

struct TChunkStats {
    char *Start, *Finish;
    i64 Size;
    char* Entries;
    i64 FreeCount;

    TChunkStats(size_t chunk, i64 size, char* entries)
        : Size(size)
        , Entries(entries)
        , FreeCount(0)
    {
        Start = ALLOC_START + chunk * N_CHUNK_SIZE;
        Finish = Start + N_CHUNK_SIZE;
    }
    void CheckBlock(char* pBlock) {
        if (pBlock && pBlock >= Start && pBlock < Finish) {
            ++FreeCount;
            i64 nShift = pBlock - Start;
            i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1);
            Entries[nOffsetInStep / Size] = 1;
        }
    }
    void SetGlobalFree(char* ptr) {
        i64 nShift = ptr - Start;
        i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1);
        while (nOffsetInStep + Size <= N_CHUNK_SIZE) {
            ++FreeCount;
            Entries[nOffsetInStep / Size] = 1;
            nOffsetInStep += Size;
        }
    }
};

static void DumpMemoryBlockUtilizationLocked() {
    TFreeListGroup* wholeLists[N_SIZES];
    for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) {
        wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList();
    }
    char* bfList = (char*)blockFreeList.GetWholeList();

    DebugTraceMMgr("memory blocks utilisation stats:\n");
    i64 nTotalAllocated = 0, nTotalFree = 0, nTotalBadPages = 0, nTotalPages = 0, nTotalUsed = 0, nTotalLocked = 0;
    i64 nTotalGroupBlocks = 0;
    char* entries;
    entries = (char*)SystemAlloc((N_CHUNK_SIZE / 4));
    for (size_t k = 0; k < N_CHUNKS; ++k) {
        if (chunkSizeIdx[k] <= 0) {
            if (chunkSizeIdx[k] == -1)
                nTotalLocked += N_CHUNK_SIZE;
            continue;
        }
        i64 nSizeIdx = chunkSizeIdx[k];
        i64 nSize = nSizeIdxToSize[nSizeIdx];
        TChunkStats cs(k, nSize, entries);
        int nEntriesTotal = N_CHUNK_SIZE / nSize;
        memset(entries, 0, nEntriesTotal);
        for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) {
            for (auto& ptr : g->Ptrs)
                cs.CheckBlock(ptr);
        }
        TChunkStats csGB(k, nSize, entries);
        if (nSizeIdx == FREE_LIST_GROUP_SIZEIDX) {
            for (auto g : wholeLists) {
                for (; g; g = g->Next)
                    csGB.CheckBlock((char*)g);
            }
            for (char* blk = bfList; blk; blk = *(char**)blk)
                csGB.CheckBlock(blk);
            nTotalGroupBlocks += csGB.FreeCount * nSize;
        }
        if (((globalCurrentPtr[nSizeIdx] - ALLOC_START) / N_CHUNK_SIZE) == k)
            cs.SetGlobalFree(globalCurrentPtr[nSizeIdx]);
        nTotalUsed += (nEntriesTotal - cs.FreeCount - csGB.FreeCount) * nSize;

        char pages[N_CHUNK_SIZE / N_PAGE_SIZE];
        memset(pages, 0, sizeof(pages));
        for (int i = 0, nShift = 0; i < nEntriesTotal; ++i, nShift += nSize) {
            int nBit = 0;
            if (entries[i])
                nBit = 1; // free entry
            else
                nBit = 2; // used entry
            for (i64 nDelta = nSize - 1; nDelta >= 0; nDelta -= N_PAGE_SIZE)
                pages[(nShift + nDelta) / N_PAGE_SIZE] |= nBit;
        }
        i64 nBadPages = 0;
        for (auto page : pages) {
            nBadPages += page == 3;
            nTotalPages += page != 1;
        }
        DebugTraceMMgr("entry = %lld; size = %lld; free = %lld; system %lld; utilisation = %g%%, fragmentation = %g%%\n",
                       k, nSize, cs.FreeCount * nSize, csGB.FreeCount * nSize,
                       (N_CHUNK_SIZE - cs.FreeCount * nSize) * 100.0f / N_CHUNK_SIZE, 100.0f * nBadPages / Y_ARRAY_SIZE(pages));
        nTotalAllocated += N_CHUNK_SIZE;
        nTotalFree += cs.FreeCount * nSize;
        nTotalBadPages += nBadPages;
    }
    SystemFree(entries);
    DebugTraceMMgr("Total allocated = %llu, free = %lld, system = %lld, locked for future use %lld, utilisation = %g, fragmentation = %g\n",
                   nTotalAllocated, nTotalFree, nTotalGroupBlocks, nTotalLocked,
                   100.0f * (nTotalAllocated - nTotalFree) / nTotalAllocated, 100.0f * nTotalBadPages / nTotalPages);
    DebugTraceMMgr("Total %lld bytes used, %lld bytes in used pages\n", nTotalUsed, nTotalPages * N_PAGE_SIZE);

    for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx)
        globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]);
    blockFreeList.ReturnWholeList(bfList);
}

void FlushThreadFreeList() {
    if (pThreadInfo)
        MoveSingleThreadFreeToGlobal(pThreadInfo);
}

void DumpMemoryBlockUtilization() {
    // move current thread free to global lists to get better statistics
    FlushThreadFreeList();
    {
        TLFLockHolder ls(&LFGlobalLock);
        DumpMemoryBlockUtilizationLocked();
    }
}

//////////////////////////////////////////////////////////////////////////
// malloc api

static bool LFAlloc_SetParam(const char* param, const char* value) {
    if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE")) {
        LB_LIMIT_TOTAL_SIZE = atoi(value);
        return true;
    }
    if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE_BYTES")) {
        LB_LIMIT_TOTAL_SIZE = (atoi(value) + N_PAGE_SIZE - 1) / N_PAGE_SIZE;
        return true;
    }
#ifdef DBG_FILL_MEMORY
    if (!strcmp(param, "FillMemoryOnAllocation")) {
        FillMemoryOnAllocation = !strcmp(value, "true");
        return true;
    }
#endif
    if (!strcmp(param, "TransparentHugePages")) {
        TransparentHugePages = !strcmp(value, "true");
        return true;
    }
    if (!strcmp(param, "MapHugeTLB")) {
        MapHugeTLB = !strcmp(value, "true");
        return true;
    }
    if (!strcmp(param, "EnableDefrag")) {
        EnableDefrag = !strcmp(value, "true");
        return true;
    }
    return false;
};

static const char* LFAlloc_GetParam(const char* param) {
    struct TParam {
        const char* Name;
        const char* Value;
    };

    static const TParam Params[] = {
        {"GetLFAllocCounterFast", (const char*)&GetLFAllocCounterFast},
        {"GetLFAllocCounterFull", (const char*)&GetLFAllocCounterFull},
#if defined(LFALLOC_DBG)
        {"SetThreadAllocTag", (const char*)&SetThreadAllocTag},
        {"SetProfileCurrentThread", (const char*)&SetProfileCurrentThread},
        {"SetProfileAllThreads", (const char*)&SetProfileAllThreads},
        {"SetAllocationSamplingEnabled", (const char*)&SetAllocationSamplingEnabled},
        {"SetAllocationSampleRate", (const char*)&SetAllocationSampleRate},
        {"SetAllocationSampleMaxSize", (const char*)&SetAllocationSampleMaxSize},
        {"SetAllocationCallback", (const char*)&SetAllocationCallback},
        {"SetDeallocationCallback", (const char*)&SetDeallocationCallback},
        {"GetPerTagAllocInfo", (const char*)&GetPerTagAllocInfo},
#endif // LFALLOC_DBG
    };

    for (int i = 0; i < Y_ARRAY_SIZE(Params); ++i) {
        if (strcmp(param, Params[i].Name) == 0) {
            return Params[i].Value;
        }
    }
    return nullptr;
}

static Y_FORCE_INLINE int LFPosixMemalign(void** memptr, size_t alignment, size_t size) {
    if (Y_UNLIKELY(alignment > 4096)) {
        const char* error = "Larger alignment are not guaranteed with this implementation\n";
#ifdef _win_
        OutputDebugStringA(error);
#endif
        NMalloc::AbortFromCorruptedAllocator(error);
    }
    size_t bigsize = size;
    if (bigsize <= alignment) {
        bigsize = alignment;
    } else if (bigsize < 2 * alignment) {
        bigsize = 2 * alignment;
    }
#if defined(LFALLOC_DBG)
    if (alignment > sizeof(TAllocHeader)) {
        bigsize += alignment;
    }
#endif

    *memptr = LFAlloc(bigsize);

#if defined(LFALLOC_DBG)
    if (alignment > sizeof(TAllocHeader)) {
        // memptr isn't aligned due to alloc header
        const auto* header = GetAllocHeader(*memptr);
        *memptr = (void*)((const char*) (*memptr) + alignment - sizeof(TAllocHeader));

        // make fake header to retrieve original header ptr on dealloc
        auto* next = GetAllocHeader(*memptr);
        next->Tag = DBG_ALLOC_ALIGNED_TAG;
        next->Size = (uint64_t)header;
        next->Cookie = 0;
    }
#endif

    Y_ASSERT_NOBT((intptr_t)*memptr % alignment == 0);
    return 0;
}

static Y_FORCE_INLINE void* LFVAlloc(size_t size) {
    const size_t pg = N_PAGE_SIZE;
    void* p = nullptr;

#if defined(LFALLOC_DBG)
    LFPosixMemalign(&p, pg, size);
#else
    size_t bigsize = (size + pg - 1) & (~(pg - 1));
    p = LFAlloc(bigsize);
#endif

    Y_ASSERT_NOBT((intptr_t)p % N_PAGE_SIZE == 0);
    return p;
}

#endif