author | udovichenko-r <udovichenko-r@yandex-team.ru> | 2022-07-04 14:16:38 +0300
---|---|---
committer | udovichenko-r <udovichenko-r@yandex-team.ru> | 2022-07-04 14:16:38 +0300
commit | 5fe1c2b2d90b4ddbd7d1683191a48851363cf53d (patch) |
tree | c413b43fb69611ff1185ead7813a8e0973dca3dc | /library/cpp/balloc/lib/balloc.h
parent | f9651ab5ad67347bf06d6d0789b5d6eb31a7b2cc (diff) |
download | ydb-5fe1c2b2d90b4ddbd7d1683191a48851363cf53d.tar.gz |
[KIKIMR-15108] Add perf programs to build
ref:8f081efde09627da76e52231d32a83e34b78c1e4
Diffstat (limited to 'library/cpp/balloc/lib/balloc.h')
-rw-r--r-- | library/cpp/balloc/lib/balloc.h | 589 |
1 file changed, 589 insertions, 0 deletions
```diff
diff --git a/library/cpp/balloc/lib/balloc.h b/library/cpp/balloc/lib/balloc.h
new file mode 100644
index 0000000000..019c9cb7de
--- /dev/null
+++ b/library/cpp/balloc/lib/balloc.h
@@ -0,0 +1,589 @@
+#pragma once
+
+#include <sys/mman.h>
+#include <pthread.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <memory.h>
+#include <new>
+#include <util/system/defaults.h>
+#include <library/cpp/malloc/api/malloc.h>
+#include <library/cpp/balloc/lib/alloc_stats.h>
+#include <library/cpp/balloc/setup/alloc.h>
+
+#ifndef NDEBUG
+#define DBG_FILL_MEMORY
+#endif
+
+#if defined(Y_COVER_PTR)
+#define DBG_FILL_MEMORY
+#endif
+
+#if (defined(_i386_) || defined(_x86_64_)) && defined(_linux_)
+#define HAVE_VDSO_GETCPU 1
+
+#include <contrib/libs/linuxvdso/interface.h>
+#endif
+
+namespace NBalloc {
+#if HAVE_VDSO_GETCPU
+    // glibc does not provide a wrapper around getcpu, we'll have to load it manually
+    static int (*getcpu)(unsigned* cpu, unsigned* node, void* unused) = nullptr;
+#endif
+
+    static Y_FORCE_INLINE void* Advance(void* block, size_t size) {
+        return (void*)((char*)block + size);
+    }
+
+    static constexpr size_t PAGE_CACHE = 16;
+#if defined(_ppc64_) || defined(_arm64_)
+    static constexpr size_t PAGE_ELEM = 65536;
+#else
+    static constexpr size_t PAGE_ELEM = 4096;
+#endif
+    static constexpr size_t SINGLE_ALLOC = (PAGE_ELEM / 2);
+    static constexpr size_t ORDERS = 1024;
+    static constexpr size_t DUMP_STAT = 0;
+
+    static void* (*LibcMalloc)(size_t) = nullptr;
+    static void (*LibcFree)(void*) = nullptr;
+
+    static size_t Y_FORCE_INLINE Align(size_t value, size_t align) {
+        return (value + align - 1) & ~(align - 1);
+    }
+
+#define RDTSC(eax, edx) __asm__ __volatile__("rdtsc" \
+                                             : "=a"(eax), "=d"(edx));
+#define CPUID(func, eax, ebx, ecx, edx) __asm__ __volatile__("cpuid" \
+                                                             : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) \
+                                                             : "a"(func));
+
+    static int GetNumaNode() {
+#if HAVE_VDSO_GETCPU
+        if (Y_LIKELY(getcpu)) {
+            unsigned node = 0;
+            if (getcpu(nullptr, &node, nullptr)) {
+                return 0;
+            }
+            return node;
+        }
+#endif
+#if defined(_i386_) or defined(_x86_64_)
+        int a = 0, b = 0, c = 0, d = 0;
+        CPUID(0x1, a, b, c, d);
+        int acpiID = (b >> 24);
+        int numCPU = (b >> 16) & 255;
+        if (numCPU == 0)
+            return 0;
+        int ret = acpiID / numCPU;
+        return ret;
+#else
+        return 0;
+#endif
+    }
+
+    static void AbortFromSystemError() {
+        char buf[512] = {0};
+#if defined(_freebsd_) or defined(_darwin_) or defined(_musl_) or defined(_bionic_)
+        strerror_r(errno, buf, sizeof(buf));
+        const char* msg = buf;
+#elif defined(_linux_) or defined(_cygwin_)
+        const char* msg = strerror_r(errno, buf, sizeof(buf));
+#endif
+        NMalloc::AbortFromCorruptedAllocator(msg);
+    }
+
+    static pthread_key_t key;
+    static volatile long init = 0;
+    static unsigned long long counter = 0;
+
+    static void Destructor(void* data);
+
+    template <class T>
+    Y_FORCE_INLINE bool DoCas(T* volatile* target, T* exchange, T* compare) {
+        return __sync_bool_compare_and_swap(target, compare, exchange);
+    }
+
+    class TLFAllocFreeList {
+        struct TNode {
+            TNode* Next;
+        };
+
+        TNode* volatile Head;
+        TNode* volatile Pending;
+        long long volatile PendingToFreeListCounter;
+        TNode* volatile Destroyed;
+        long long AllocCount;
+
+        static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) {
+            for (;;) {
+                TNode* volatile prevHead = *headPtr;
+                n->Next = prevHead;
+                if (DoCas(headPtr, n, prevHead))
+                    break;
+            }
+        }
+        Y_FORCE_INLINE void* DoAlloc() {
+            TNode* res;
+            for (res = Head; res; res = Head) {
+                TNode* keepNext = res->Next;
+                if (DoCas(&Head, keepNext, res)) {
+                    //Y_VERIFY(keepNext == res->Next);
+                    break;
+                }
+            }
+            return res;
+        }
+        void FreeList(TNode* fl) {
+            if (!fl)
+                return;
+            TNode* flTail = fl;
+            while (flTail->Next)
+                flTail = flTail->Next;
+            for (;;) {
+                TNode* volatile prevHead = Head;
+                flTail->Next = prevHead;
+                if (DoCas(&Head, fl, prevHead))
+                    break;
+            }
+        }
+
+    public:
+        Y_FORCE_INLINE void Free(void* ptr) {
+            TNode* newFree = (TNode*)ptr;
+            if (__sync_add_and_fetch(&AllocCount, 0) == 0)
+                Enqueue(&Head, newFree);
+            else
+                Enqueue(&Pending, newFree);
+        }
+        Y_FORCE_INLINE void Destroy(void* ptr, size_t length) {
+            TNode* newFree = (TNode*)ptr;
+            TNode* fl = nullptr;
+            if (__sync_add_and_fetch(&AllocCount, 1) == 1) {
+                fl = Destroyed;
+                if (fl && !DoCas(&Destroyed, (TNode*)nullptr, fl)) {
+                    fl = nullptr;
+                }
+                Enqueue(&fl, newFree);
+            } else {
+                Enqueue(&Destroyed, newFree);
+            }
+            __sync_sub_and_fetch(&AllocCount, 1);
+
+            // TODO try to merge blocks to minimize number of syscalls
+            while (nullptr != fl) {
+                TNode* next = fl->Next;
+                if (-1 == munmap(fl, length)) {
+                    AbortFromSystemError();
+                }
+                fl = next;
+            }
+        }
+        Y_FORCE_INLINE void* Alloc() {
+            long long volatile keepCounter = __sync_add_and_fetch(&PendingToFreeListCounter, 0);
+            TNode* fl = Pending;
+            if (__sync_add_and_fetch(&AllocCount, 1) == 1) {
+                // No other allocs in progress.
+                // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads.
+                // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList
+                if (fl &&
+                    keepCounter == __sync_add_and_fetch(&PendingToFreeListCounter, 0) &&
+                    DoCas(&Pending, (TNode*)nullptr, fl))
+                {
+                    // pick first element from Pending and return it
+                    void* res = fl;
+                    fl = fl->Next;
+                    // if there are other elements in Pending list, add them to main free list
+                    FreeList(fl);
+                    __sync_sub_and_fetch(&PendingToFreeListCounter, 1);
+                    __sync_sub_and_fetch(&AllocCount, 1);
+                    return res;
+                }
+            }
+            void* res = DoAlloc();
+            if (!res && __sync_add_and_fetch(&Pending, 0)) {
+                // live-lock situation: there are no free items in the "Head"
+                // list and there are free items in the "Pending" list
+                // but the items are forbidden to allocate to prevent ABA
+                NAllocStats::IncLiveLockCounter();
+            }
+            __sync_sub_and_fetch(&AllocCount, 1);
+            return res;
+        }
+    };
+
+    TLFAllocFreeList nodes[2][ORDERS];
+    unsigned long long sizesGC[2][16];
+    unsigned long long sizeOS, totalOS;
+
+    struct TBlockHeader {
+        size_t Size;
+        int RefCount;
+        unsigned short AllCount;
+        unsigned short NumaNode;
+    };
+
+    static bool PushPage(void* page, size_t order) {
+        if (order < ORDERS) {
+            int node = ((TBlockHeader*)page)->NumaNode;
+            __sync_add_and_fetch(&sizesGC[node][order % 16], order);
+            TBlockHeader* blockHeader = (TBlockHeader*)page;
+            if (!__sync_bool_compare_and_swap(&blockHeader->RefCount, 0, -1)) {
+                NMalloc::AbortFromCorruptedAllocator();
+            }
+            nodes[node][order].Free(page);
+            return true;
+        }
+        return false;
+    }
+
+    static void* PopPage(size_t order) {
+        if (order < ORDERS) {
+            int numaNode = GetNumaNode() & 1;
+            void* alloc = nodes[numaNode][order].Alloc();
+            if (alloc == nullptr) {
+                alloc = nodes[1 - numaNode][order].Alloc();
+                if (alloc) {
+                    __sync_sub_and_fetch(&sizesGC[1 - numaNode][order % 16], order);
+                }
+            } else {
+                __sync_sub_and_fetch(&sizesGC[numaNode][order % 16], order);
+            }
+            if (alloc) {
+                TBlockHeader* blockHeader = (TBlockHeader*)alloc;
+                if (!__sync_bool_compare_and_swap(&blockHeader->RefCount, -1, 0)) {
+                    NMalloc::AbortFromCorruptedAllocator();
+                }
+            }
+            return alloc;
+        }
+        return nullptr;
+    }
+
+#if DUMP_STAT
+    static unsigned long long TickCounter() {
+        int lo = 0, hi = 0;
+        RDTSC(lo, hi);
+        return (((unsigned long long)hi) << 32) + (unsigned long long)lo;
+    }
+
+    struct TTimeHold {
+        unsigned long long Start;
+        unsigned long long Finish;
+        const char* Name;
+        TTimeHold(const char* name)
+            : Start(TickCounter())
+            , Name(name)
+        {
+        }
+        ~TTimeHold() {
+            Finish = TickCounter();
+            double diff = Finish > Start ? (Finish - Start) / 1000000.0 : 0.0;
+            if (diff > 20.0) {
+                fprintf(stderr, "%s %f mticks\n", Name, diff);
+            }
+        }
+    };
+#endif
+
+    long long allocs[ORDERS];
+
+    static void Map(size_t size, void* pages[], size_t num) {
+#if DUMP_STAT
+        TTimeHold hold("mmap");
+        size_t order = size / PAGE_ELEM;
+        if (order < ORDERS) {
+            __sync_add_and_fetch(&allocs[order], num);
+        }
+#endif
+        if (!NAllocSetup::CanAlloc(__sync_add_and_fetch(&sizeOS, size * num), totalOS)) {
+            NMalloc::AbortFromCorruptedAllocator();
+        }
+        void* map = mmap(nullptr, size * num, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+        if (map == MAP_FAILED) {
+            AbortFromSystemError();
+        }
+        unsigned short numaNode = (GetNumaNode() & 1);
+        NAllocStats::IncMmapCounter(size * num / PAGE_ELEM);
+        for (size_t i = 0; i < num; ++i) {
+            TBlockHeader* blockHeader = static_cast<TBlockHeader*>(map);
+            blockHeader->NumaNode = numaNode;
+            pages[i] = map;
+            map = Advance(map, size);
+        }
+    }
+
+    static void* SysAlloc(size_t& size) {
+        size = Align(size, PAGE_ELEM);
+        size_t order = size / PAGE_ELEM;
+        void* result = PopPage(order);
+        if (result) {
+            return result;
+        }
+        void* pages[1] = {nullptr};
+        Map(size, pages, 1);
+        return pages[0];
+    }
+
+    static void UnMap(void* block, size_t order) {
+#if DUMP_STAT
+        TTimeHold hold("munmap");
+        if (order < ORDERS) {
+            __sync_sub_and_fetch(&allocs[order], 1);
+        }
+#endif
+        size_t size = order * PAGE_ELEM;
+        __sync_sub_and_fetch(&sizeOS, size);
+        TBlockHeader* blockHeader = (TBlockHeader*)block;
+        if (!__sync_bool_compare_and_swap(&blockHeader->RefCount, 0, -1)) {
+            NMalloc::AbortFromCorruptedAllocator();
+        }
+        if (order < ORDERS) {
+            int node = blockHeader->NumaNode;
+            nodes[node][order].Destroy(block, size);
+        } else {
+            if (-1 == munmap(block, size)) {
+                AbortFromSystemError();
+            }
+        }
+    }
+
+    static void SysClear(size_t order) {
+        void* page = PopPage(order);
+        if (page) {
+            UnMap(page, order);
+        }
+    }
+
+    static void Y_FORCE_INLINE GlobalInit() {
+        if (__sync_bool_compare_and_swap(&init, 0, 1)) {
+#if HAVE_VDSO_GETCPU
+            getcpu = (int (*)(unsigned*, unsigned*, void*))NVdso::Function("__vdso_getcpu", "LINUX_2.6");
+#endif
+            LibcMalloc = (void* (*)(size_t))dlsym(RTLD_NEXT, "malloc");
+            LibcFree = (void (*)(void*))dlsym(RTLD_NEXT, "free");
+            pthread_key_create(&key, Destructor);
+            __sync_bool_compare_and_swap(&init, 1, 2);
+        }
+        while (init < 2) {
+        };
+    }
+
+    enum EMode {
+        Empty = 0,
+        Born,
+        Alive,
+        Disabled,
+        Dead,
+        ToBeEnabled
+    };
+
+    struct TLS {
+        void* PageCache[PAGE_CACHE];
+        size_t Cached;
+        void* Chunk;
+        size_t Ptr;
+        void* Block;
+        int Counter;
+        EMode Mode;
+        unsigned char Count0;
+        unsigned long Count1;
+        bool NeedGC() {
+            if (Count0++ != 0)
+                return false;
+            __sync_add_and_fetch(&totalOS, 1);
+            unsigned long long count = 0;
+            for (size_t i = 0; i < 16; ++i) {
+                count += sizesGC[0][i];
+                count += sizesGC[1][i];
+            }
+            return NAllocSetup::NeedReclaim(count * PAGE_ELEM, ++Count1);
+        }
+        void ClearCount() {
+            Count1 = 0;
+        }
+    };
+
+#if defined(_darwin_)
+
+    static Y_FORCE_INLINE TLS* PthreadTls() {
+        GlobalInit();
+        TLS* ptls = (TLS*)pthread_getspecific(key);
+        if (!ptls) {
+            ptls = (TLS*)LibcMalloc(sizeof(TLS));
+            if (!ptls) {
+                NMalloc::AbortFromCorruptedAllocator(); // what do we do here?
+            }
+            memset(ptls, 0, sizeof(TLS));
+            pthread_setspecific(key, ptls);
+        }
+        return ptls;
+    }
+
+#define tls (*PthreadTls())
+
+#else
+
+    __thread TLS tls;
+
+#endif
+
+    static void UnRefHard(void* block, int add, TLS& ltls) {
+        TBlockHeader* blockHeader = (TBlockHeader*)block;
+        if ((blockHeader->RefCount == add ? (blockHeader->RefCount = 0, true) : false) || __sync_sub_and_fetch(&blockHeader->RefCount, add) == 0) {
+            size_t order = blockHeader->Size / PAGE_ELEM;
+            if (ltls.Mode == Alive) {
+                // page cache has first priority
+                if (order == 1 && ltls.Cached < PAGE_CACHE) {
+                    ltls.PageCache[ltls.Cached] = block;
+                    ++ltls.Cached;
+                    return;
+                }
+                if (ltls.NeedGC()) {
+                    ltls.ClearCount();
+                    size_t index = __sync_add_and_fetch(&counter, 1);
+                    SysClear(index % ORDERS);
+                    UnMap(block, order);
+                    return;
+                }
+            }
+            if (!PushPage(block, order)) {
+                UnMap(block, order);
+            }
+        }
+    }
+
+    static void Init(TLS& ltls) {
+        bool ShouldEnable = (NAllocSetup::IsEnabledByDefault() || ltls.Mode == ToBeEnabled);
+        ltls.Mode = Born;
+        GlobalInit();
+        pthread_setspecific(key, (void*)&ltls);
+        if (ShouldEnable) {
+            ltls.Mode = Alive;
+        } else {
+            ltls.Mode = Disabled;
+        }
+    }
+
+    static void Y_FORCE_INLINE UnRef(void* block, int counter, TLS& ltls) {
+        if (ltls.Mode != Alive) {
+            UnRefHard(block, counter, ltls);
+            return;
+        }
+        if (ltls.Block == block) {
+            ltls.Counter += counter;
+        } else {
+            if (ltls.Block) {
+                UnRefHard(ltls.Block, ltls.Counter, ltls);
+            }
+            ltls.Block = block;
+            ltls.Counter = counter;
+        }
+    }
+
+    static void Destructor(void* data) {
+        TLS& ltls = *(TLS*)data;
+        ltls.Mode = Dead;
+        if (ltls.Chunk) {
+            TBlockHeader* blockHeader = (TBlockHeader*)ltls.Chunk;
+            UnRef(ltls.Chunk, PAGE_ELEM - blockHeader->AllCount, ltls);
+        }
+        if (ltls.Block) {
+            UnRef(ltls.Block, ltls.Counter, ltls);
+        }
+        for (size_t i = 0; i < ltls.Cached; ++i) {
+            PushPage(ltls.PageCache[i], 1);
+        }
+#if defined(_darwin_)
+        LibcFree(data);
+#endif
+    }
+
+    using TAllocHeader = NMalloc::TAllocHeader;
+
+    static Y_FORCE_INLINE TAllocHeader* AllocateRaw(size_t size, size_t signature) {
+        TLS& ltls = tls;
+        size = Align(size, sizeof(TAllocHeader));
+        if (Y_UNLIKELY(ltls.Mode == Empty || ltls.Mode == ToBeEnabled)) {
+            Init(ltls);
+        }
+        size_t extsize = size + sizeof(TAllocHeader) + sizeof(TBlockHeader);
+        if (extsize > SINGLE_ALLOC || ltls.Mode != Alive) {
+            // The dlsym() function in GlobalInit() may call malloc() resulting in recursive call
+            // of the NBalloc::Malloc(). We have to serve such allocation request via balloc even
+            // when (IsEnabledByDefault() == false) because at this point we don't know where the
+            // libc malloc is.
+            if (extsize > 64 * PAGE_ELEM) {
+                extsize = Align(extsize, 16 * PAGE_ELEM);
+            }
+            NAllocSetup::ThrowOnError(extsize);
+            void* block = SysAlloc(extsize);
+            TBlockHeader* blockHeader = (TBlockHeader*)block;
+            blockHeader->RefCount = 1;
+            blockHeader->Size = extsize;
+            blockHeader->AllCount = 0;
+            TAllocHeader* allocHeader = (TAllocHeader*)Advance(block, sizeof(TBlockHeader));
+            allocHeader->Encode(blockHeader, size, signature);
+            if (NAllocStats::IsEnabled()) {
+                NAllocStats::IncThreadAllocStats(size);
+            }
+#ifdef DBG_FILL_MEMORY
+            memset(allocHeader + 1, 0xec, size);
+#endif
+            return allocHeader;
+        }
+
+        size_t ptr = ltls.Ptr;
+        void* chunk = ltls.Chunk;
+
+        if (ptr < extsize) {
+            NAllocSetup::ThrowOnError(PAGE_ELEM);
+            if (chunk) {
+                TBlockHeader* blockHeader = (TBlockHeader*)chunk;
+                UnRef(chunk, PAGE_ELEM - blockHeader->AllCount, ltls);
+            }
+            void* block = nullptr;
+            while (1) {
+                if (ltls.Cached > 0) {
+                    --ltls.Cached;
+                    block = ltls.PageCache[ltls.Cached];
+                    break;
+                }
+                block = PopPage(1);
+                if (block) {
+                    break;
+                }
+                Map(PAGE_ELEM, ltls.PageCache, PAGE_CACHE);
+                ltls.Cached = PAGE_CACHE;
+            }
+            TBlockHeader* blockHeader = (TBlockHeader*)block;
+            blockHeader->RefCount = PAGE_ELEM;
+            blockHeader->Size = PAGE_ELEM;
+            blockHeader->AllCount = 0;
+            ltls.Ptr = PAGE_ELEM;
+            ltls.Chunk = block;
+            ptr = ltls.Ptr;
+            chunk = ltls.Chunk;
+        }
+        ptr = ptr - size - sizeof(TAllocHeader);
+        TAllocHeader* allocHeader = (TAllocHeader*)Advance(chunk, ptr);
+        allocHeader->Encode(chunk, size, signature);
+        TBlockHeader* blockHeader = (TBlockHeader*)chunk;
+        ++blockHeader->AllCount;
+        ltls.Ptr = ptr;
+        if (NAllocStats::IsEnabled()) {
+            NAllocStats::IncThreadAllocStats(size);
+        }
+#ifdef DBG_FILL_MEMORY
+        memset(allocHeader + 1, 0xec, size);
+#endif
+        return allocHeader;
+    }
+
+    static void Y_FORCE_INLINE FreeRaw(void* ptr) {
+        UnRef(ptr, 1, tls);
+    }
+}
```
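For context (not part of the commit): the core building block of this header is `TLFAllocFreeList`, a lock-free free list of mmap'ed pages. `Enqueue` pushes a node with a compare-and-swap retry loop, `DoAlloc` pops the head the same way, and the `Pending` list together with the `AllocCount` guard keeps freed nodes from being recycled while a concurrent `Alloc()` might still be reading them. The sketch below is a minimal standalone illustration of just that CAS push/pop pattern, using the same `__sync` builtins; the names (`TSimpleLockFreeStack`, `Push`, `Pop`) are assumptions of the sketch, not identifiers from the source.

```cpp
// Standalone sketch (not from the commit): the CAS push/pop pattern that
// TLFAllocFreeList::Enqueue/DoAlloc in balloc.h are built on.
#include <cstdio>

struct TNode {
    TNode* Next;
};

struct TSimpleLockFreeStack {
    TNode* volatile Head = nullptr;

    // Push a node with a CAS retry loop (same shape as Enqueue in balloc.h).
    void Push(TNode* n) {
        for (;;) {
            TNode* prevHead = Head;
            n->Next = prevHead;
            if (__sync_bool_compare_and_swap(&Head, prevHead, n))
                break;
        }
    }

    // Pop the current head, retrying while another thread wins the CAS
    // (same shape as DoAlloc). A bare pop like this is ABA-prone; balloc.h
    // mitigates that by parking concurrent frees in a separate Pending list
    // while any Alloc() is in flight, tracked via AllocCount.
    TNode* Pop() {
        TNode* res;
        for (res = Head; res; res = Head) {
            TNode* keepNext = res->Next;
            if (__sync_bool_compare_and_swap(&Head, res, keepNext))
                break;
        }
        return res;
    }
};

int main() {
    TSimpleLockFreeStack stack;
    TNode nodes[3];
    for (TNode& n : nodes)
        stack.Push(&n);
    while (TNode* n = stack.Pop())
        printf("popped %p\n", (void*)n);
    return 0;
}
```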