diff options
author | udovichenko-r <udovichenko-r@yandex-team.ru> | 2022-07-04 14:16:38 +0300 |
---|---|---|
committer | udovichenko-r <udovichenko-r@yandex-team.ru> | 2022-07-04 14:16:38 +0300 |
commit | 5fe1c2b2d90b4ddbd7d1683191a48851363cf53d (patch) | |
tree | c413b43fb69611ff1185ead7813a8e0973dca3dc /library/cpp/balloc | |
parent | f9651ab5ad67347bf06d6d0789b5d6eb31a7b2cc (diff) | |
download | ydb-5fe1c2b2d90b4ddbd7d1683191a48851363cf53d.tar.gz |
[KIKIMR-15108] Add perf programs to build
ref:8f081efde09627da76e52231d32a83e34b78c1e4
Diffstat (limited to 'library/cpp/balloc')
-rw-r--r-- | library/cpp/balloc/CMakeLists.txt | 21 | ||||
-rw-r--r-- | library/cpp/balloc/aba_agri_test/balloc_aba_ut.cpp | 221 | ||||
-rw-r--r-- | library/cpp/balloc/lib/CMakeLists.darwin.txt | 21 | ||||
-rw-r--r-- | library/cpp/balloc/lib/CMakeLists.linux.txt | 22 | ||||
-rw-r--r-- | library/cpp/balloc/lib/CMakeLists.txt | 13 | ||||
-rw-r--r-- | library/cpp/balloc/lib/alloc_stats.cpp | 106 | ||||
-rw-r--r-- | library/cpp/balloc/lib/alloc_stats.h | 18 | ||||
-rw-r--r-- | library/cpp/balloc/lib/balloc.h | 589 | ||||
-rw-r--r-- | library/cpp/balloc/setup/CMakeLists.txt | 17 | ||||
-rw-r--r-- | library/cpp/balloc/setup/alloc.cpp | 102 | ||||
-rw-r--r-- | library/cpp/balloc/setup/alloc.h | 35 | ||||
-rw-r--r-- | library/cpp/balloc/setup/disable_by_default/disable.cpp | 9 | ||||
-rw-r--r-- | library/cpp/balloc/setup/enable.cpp | 9 | ||||
-rw-r--r-- | library/cpp/balloc/setup/enable.h | 11 | ||||
-rw-r--r-- | library/cpp/balloc/test/do_with_disabled/main.cpp | 24 | ||||
-rw-r--r-- | library/cpp/balloc/test/do_with_enabled/main.cpp | 24 |
16 files changed, 1242 insertions, 0 deletions
diff --git a/library/cpp/balloc/CMakeLists.txt b/library/cpp/balloc/CMakeLists.txt new file mode 100644 index 0000000000..d4ed3b53d2 --- /dev/null +++ b/library/cpp/balloc/CMakeLists.txt @@ -0,0 +1,21 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-balloc) +target_compile_options(library-cpp-balloc PRIVATE + -Wno-everything +) +target_link_libraries(library-cpp-balloc PUBLIC + contrib-libs-cxxsupp + cpp-balloc-lib +) +target_sources(library-cpp-balloc PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/balloc/balloc.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/balloc/malloc-info.cpp +) diff --git a/library/cpp/balloc/aba_agri_test/balloc_aba_ut.cpp b/library/cpp/balloc/aba_agri_test/balloc_aba_ut.cpp new file mode 100644 index 0000000000..0d12dc6764 --- /dev/null +++ b/library/cpp/balloc/aba_agri_test/balloc_aba_ut.cpp @@ -0,0 +1,221 @@ +#include <util/generic/algorithm.h> +#include <util/generic/noncopyable.h> +#include <util/generic/ptr.h> +#include <util/generic/vector.h> +#include <library/cpp/deprecated/atomic/atomic.h> +#include <util/system/info.h> +#include <util/system/spinlock.h> +#include <util/system/thread.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <utility> + +#define PLATFORM_CACHE_LINE 64 + +template <typename T> +struct TNode: private TNonCopyable { + TNode* Next; + T Item; + + TNode(const T& item) + : Next(nullptr) + , Item(item) + { + } + + TNode(T&& item) + : Next(nullptr) + , Item(std::move(item)) + { + } + + char Padding[4000]; +}; + +template <typename TNode> +inline void DeleteList(TNode* node) { + while (node != nullptr) { + TNode* next = 
node->Next; + delete node; + node = next; + } +} + +typedef void* TMessageLink; + +//////////////////////////////////////////////////////////////////////////////// +// http://www.1024cores.net/home/lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue +// Very fast (as there is no CAS operation), but not exactly lock-free: +// one blocked producer could prevent consumer from obtaining new nodes. + +template <typename T = TMessageLink> +class TFastLF_M1_Queue: private TNonCopyable { +private: + union { + TNode<T>* Head; + char Pad1[PLATFORM_CACHE_LINE]; + }; + union { + TNode<T>* Tail; + char Pad2[PLATFORM_CACHE_LINE]; + }; + +public: + using TItem = T; + + TFastLF_M1_Queue() { + Head = Tail = new TNode<T>(T()); + } + + ~TFastLF_M1_Queue() { + DeleteList(Head); + } + + template <typename TT> + void Push(TT&& item) { + Enqueue(new TNode<T>(std::forward<TT>(item))); + } + + void Enqueue(TNode<T>* node) { + // our goal is to avoid expensive CAS here, + // but now consumer will be blocked until new tail linked. + // fortunately 'window of inconsistency' is extremely small. 
+ TNode<T>* prev = AtomicSwap(&Tail, node); + AtomicSet(prev->Next, node); + } + + T Pop() { + TNode<T>* next = AtomicGet(Head->Next); + if (next) { + auto item = std::move(next->Item); + std::swap(Head, next); // no need atomic here + delete next; + return item; + } + return nullptr; + } + + bool IsEmpty() const { + TNode<T>* next = AtomicGet(Head->Next); + return (next == nullptr); + } +}; + +const size_t NUMBER_OF_PUSHERS = NSystemInfo::NumberOfCpus() + 4; +const size_t NUMBER_OF_QUEUES = NSystemInfo::NumberOfCpus() / 4; + +template <typename TQueueType> +class TQueueTestProcs: public TTestBase { +private: + UNIT_TEST_SUITE_DEMANGLE(TQueueTestProcs<TQueueType>); + UNIT_TEST(RndPush1M_Queues) + UNIT_TEST_SUITE_END(); + +public: + void RndPush1M_Queues() { + TQueueType* queue = new TQueueType[NUMBER_OF_QUEUES]; + + class TPusherThread: public ISimpleThread { + public: + TPusherThread(TQueueType* queues, char* start) + : Queues(queues) + , Arg(start) + { + } + + TQueueType* Queues; + char* Arg; + + void* ThreadProc() override { + auto counters = new int[NUMBER_OF_QUEUES]; + for (size_t i = 0; i < NUMBER_OF_QUEUES; ++i) + counters[i] = 0; +#if defined(_msan_enabled_) || defined(_asan_enabled_) + int limit = 100000; +#else + int limit = 1000000; +#endif + for (int i = 0; i < limit; ++i) { + size_t rnd = GetCycleCount() % NUMBER_OF_QUEUES; + int cookie = counters[rnd]++; + Queues[rnd].Push(Arg + cookie); + } + + for (size_t i = 0; i < NUMBER_OF_QUEUES; ++i) { + Queues[i].Push((void*)1ULL); + } + + delete[] counters; + return nullptr; + } + }; + + class TPopperThread: public ISimpleThread { + public: + TPopperThread(TQueueType* queue, char* base) + : Queue(queue) + , Base(base) + { + } + + TQueueType* Queue; + char* Base; + + void* ThreadProc() override { + auto counters = new int[NUMBER_OF_PUSHERS]; + for (size_t i = 0; i < NUMBER_OF_PUSHERS; ++i) + counters[i] = 0; + + for (size_t fin = 0; fin < NUMBER_OF_PUSHERS;) { + auto msg = Queue->Pop(); + if (msg == nullptr) + 
continue; + if (msg == (void*)1ULL) { + ++fin; + continue; + } + auto shift = (char*)msg - Base; + auto pusherNum = shift / 20000000; + auto msgNum = shift % 20000000; + + if (counters[pusherNum] != msgNum) { + Cerr << counters[pusherNum] << " " << msgNum << Endl; + } + + UNIT_ASSERT_EQUAL(counters[pusherNum], msgNum); + ++counters[pusherNum]; + } + + delete[] counters; + return nullptr; + } + }; + + TVector<TAutoPtr<TPopperThread>> poppers; + TVector<TAutoPtr<TPusherThread>> pushers; + + for (size_t i = 0; i < NUMBER_OF_QUEUES; ++i) { + poppers.emplace_back(new TPopperThread(&queue[i], (char*)queue)); + poppers.back()->Start(); + } + + for (size_t i = 0; i < NUMBER_OF_PUSHERS; ++i) { + pushers.emplace_back( + new TPusherThread(queue, (char*)queue + 20000000 * i)); + pushers.back()->Start(); + } + + for (size_t i = 0; i < NUMBER_OF_QUEUES; ++i) { + poppers[i]->Join(); + } + + for (size_t i = 0; i < NUMBER_OF_PUSHERS; ++i) { + pushers[i]->Join(); + } + + delete[] queue; + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TQueueTestProcs<TFastLF_M1_Queue<>>); diff --git a/library/cpp/balloc/lib/CMakeLists.darwin.txt b/library/cpp/balloc/lib/CMakeLists.darwin.txt new file mode 100644 index 0000000000..6ce3b4e072 --- /dev/null +++ b/library/cpp/balloc/lib/CMakeLists.darwin.txt @@ -0,0 +1,21 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-balloc-lib) +target_compile_options(cpp-balloc-lib PRIVATE + -Wno-everything +) +target_link_libraries(cpp-balloc-lib PUBLIC + contrib-libs-cxxsupp + cpp-balloc-setup + cpp-malloc-api +) +target_sources(cpp-balloc-lib PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/balloc/lib/alloc_stats.cpp +) diff --git a/library/cpp/balloc/lib/CMakeLists.linux.txt b/library/cpp/balloc/lib/CMakeLists.linux.txt new file mode 100644 index 0000000000..7cd6c1e33b --- /dev/null +++ b/library/cpp/balloc/lib/CMakeLists.linux.txt @@ -0,0 +1,22 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-balloc-lib) +target_compile_options(cpp-balloc-lib PRIVATE + -Wno-everything +) +target_link_libraries(cpp-balloc-lib PUBLIC + contrib-libs-cxxsupp + contrib-libs-linuxvdso + cpp-balloc-setup + cpp-malloc-api +) +target_sources(cpp-balloc-lib PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/balloc/lib/alloc_stats.cpp +) diff --git a/library/cpp/balloc/lib/CMakeLists.txt b/library/cpp/balloc/lib/CMakeLists.txt new file mode 100644 index 0000000000..fc7b1ee73c --- /dev/null +++ b/library/cpp/balloc/lib/CMakeLists.txt @@ -0,0 +1,13 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +if (APPLE) + include(CMakeLists.darwin.txt) +elseif (UNIX AND NOT APPLE) + include(CMakeLists.linux.txt) +endif() diff --git a/library/cpp/balloc/lib/alloc_stats.cpp b/library/cpp/balloc/lib/alloc_stats.cpp new file mode 100644 index 0000000000..3481cad56c --- /dev/null +++ b/library/cpp/balloc/lib/alloc_stats.cpp @@ -0,0 +1,106 @@ +#include <library/cpp/balloc/lib/alloc_stats.h> + +#include <util/system/compiler.h> +#include <atomic> + + +namespace NAllocStats { + +struct TThreadAllocStats { + i64 CurrSize = 0; + i64 MaxSize = 0; +}; + +struct TGlobalAllocStats { + std::atomic<ui64> LiveLock = {0}; + std::atomic<ui64> Mmap = {0}; +}; + +#if defined(_unix_) && !defined(_darwin_) + +__thread bool isEnabled = false; + +bool IsEnabled() noexcept { + return isEnabled; +} + +void EnableAllocStats(bool enable) noexcept { + isEnabled = enable; +} + +__thread TThreadAllocStats threadAllocStats; + +void IncThreadAllocStats(i64 size) noexcept { + threadAllocStats.CurrSize += size; + if (Y_UNLIKELY(threadAllocStats.CurrSize > threadAllocStats.MaxSize)) { + threadAllocStats.MaxSize = threadAllocStats.CurrSize; + } +} + +void DecThreadAllocStats(i64 size) noexcept { + threadAllocStats.CurrSize -= size; +} + +void ResetThreadAllocStats() noexcept { + threadAllocStats.CurrSize = 0; + threadAllocStats.MaxSize = 0; +} + +i64 GetThreadAllocMax() noexcept { + return threadAllocStats.MaxSize; +} + +#else // _unix_ && ! _darwin_ + +bool IsEnabled() noexcept { + return false; +} +void EnableAllocStats(bool /*enable*/) noexcept { +} +void IncThreadAllocStats(i64 /*size*/) noexcept { +} +void DecThreadAllocStats(i64 /*size*/) noexcept { +} +void ResetThreadAllocStats() noexcept { +} +i64 GetThreadAllocMax() noexcept { + return 0; +} + +#endif // _unix_ && ! 
_darwin_ + + +#if defined(_x86_64_) || defined(_i386_) + static constexpr size_t CACHE_LINE_SIZE = 64; +#elif defined(_arm64_) || defined(_ppc64_) + static constexpr size_t CACHE_LINE_SIZE = 128; +#else + static constexpr size_t CACHE_LINE_SIZE = 256; // default large enough +#endif + +template <typename T> +struct alignas(sizeof(T)) TCacheLineDoublePaddedAtomic { + char Prefix[CACHE_LINE_SIZE - sizeof(T)]; + T Value; + char Postfix[CACHE_LINE_SIZE - sizeof(T)]; +}; + +TCacheLineDoublePaddedAtomic<TGlobalAllocStats> GlobalCounters; + +void IncLiveLockCounter() noexcept { + GlobalCounters.Value.LiveLock.fetch_add(1, std::memory_order_seq_cst); +} + +ui64 GetLiveLockCounter() noexcept { + return GlobalCounters.Value.LiveLock.load(std::memory_order_acquire); +} + +void IncMmapCounter(ui64 amount) noexcept { + GlobalCounters.Value.Mmap.fetch_add(amount, std::memory_order_seq_cst); +} + +ui64 GetMmapCounter() noexcept { + return GlobalCounters.Value.Mmap.load(std::memory_order_acquire); +} + +} // namespace NAllocStats diff --git a/library/cpp/balloc/lib/alloc_stats.h b/library/cpp/balloc/lib/alloc_stats.h new file mode 100644 index 0000000000..a36686cc85 --- /dev/null +++ b/library/cpp/balloc/lib/alloc_stats.h @@ -0,0 +1,18 @@ +#pragma once + +#include <util/system/types.h> + +namespace NAllocStats { + +bool IsEnabled() noexcept; +void EnableAllocStats(bool enable) noexcept; +void IncThreadAllocStats(i64 size) noexcept; +void DecThreadAllocStats(i64 size) noexcept; +void ResetThreadAllocStats() noexcept; +i64 GetThreadAllocMax() noexcept; +void IncLiveLockCounter() noexcept; +ui64 GetLiveLockCounter() noexcept; +void IncMmapCounter(ui64 amount) noexcept; +ui64 GetMmapCounter() noexcept; + +} // namespace NAllocStats diff --git a/library/cpp/balloc/lib/balloc.h b/library/cpp/balloc/lib/balloc.h new file mode 100644 index 0000000000..019c9cb7de --- /dev/null +++ b/library/cpp/balloc/lib/balloc.h @@ -0,0 +1,589 @@ +#pragma once + +#include <sys/mman.h> +#include 
<pthread.h> +#include <dlfcn.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <memory.h> +#include <new> +#include <util/system/defaults.h> +#include <library/cpp/malloc/api/malloc.h> +#include <library/cpp/balloc/lib/alloc_stats.h> +#include <library/cpp/balloc/setup/alloc.h> + +#ifndef NDEBUG +#define DBG_FILL_MEMORY +#endif + +#if defined(Y_COVER_PTR) +#define DBG_FILL_MEMORY +#endif + +#if (defined(_i386_) || defined(_x86_64_)) && defined(_linux_) +#define HAVE_VDSO_GETCPU 1 + +#include <contrib/libs/linuxvdso/interface.h> +#endif + +namespace NBalloc { +#if HAVE_VDSO_GETCPU + // glibc does not provide a wrapper around getcpu, we'll have to load it manually + static int (*getcpu)(unsigned* cpu, unsigned* node, void* unused) = nullptr; +#endif + + static Y_FORCE_INLINE void* Advance(void* block, size_t size) { + return (void*)((char*)block + size); + } + + static constexpr size_t PAGE_CACHE = 16; +#if defined(_ppc64_) || defined(_arm64_) + static constexpr size_t PAGE_ELEM = 65536; +#else + static constexpr size_t PAGE_ELEM = 4096; +#endif + static constexpr size_t SINGLE_ALLOC = (PAGE_ELEM / 2); + static constexpr size_t ORDERS = 1024; + static constexpr size_t DUMP_STAT = 0; + + static void* (*LibcMalloc)(size_t) = nullptr; + static void (*LibcFree)(void*) = nullptr; + + static size_t Y_FORCE_INLINE Align(size_t value, size_t align) { + return (value + align - 1) & ~(align - 1); + } + +#define RDTSC(eax, edx) __asm__ __volatile__("rdtsc" \ + : "=a"(eax), "=d"(edx)); +#define CPUID(func, eax, ebx, ecx, edx) __asm__ __volatile__("cpuid" \ + : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) \ + : "a"(func)); + + static int GetNumaNode() { +#if HAVE_VDSO_GETCPU + if (Y_LIKELY(getcpu)) { + unsigned node = 0; + if (getcpu(nullptr, &node, nullptr)) { + return 0; + } + return node; + } +#endif +#if defined(_i386_) or defined(_x86_64_) + int a = 0, b = 0, c = 0, d = 0; + CPUID(0x1, a, b, c, d); + int acpiID = (b >> 24); + int 
numCPU = (b >> 16) & 255; + if (numCPU == 0) + return 0; + int ret = acpiID / numCPU; + return ret; +#else + return 0; +#endif + } + + static void AbortFromSystemError() { + char buf[512] = {0}; +#if defined(_freebsd_) or defined(_darwin_) or defined(_musl_) or defined(_bionic_) + strerror_r(errno, buf, sizeof(buf)); + const char* msg = buf; +#elif defined(_linux_) or defined(_cygwin_) + const char* msg = strerror_r(errno, buf, sizeof(buf)); +#endif + NMalloc::AbortFromCorruptedAllocator(msg); + } + + static pthread_key_t key; + static volatile long init = 0; + static unsigned long long counter = 0; + + static void Destructor(void* data); + + template <class T> + Y_FORCE_INLINE bool DoCas(T* volatile* target, T* exchange, T* compare) { + return __sync_bool_compare_and_swap(target, compare, exchange); + } + + class TLFAllocFreeList { + struct TNode { + TNode* Next; + }; + + TNode* volatile Head; + TNode* volatile Pending; + long long volatile PendingToFreeListCounter; + TNode* volatile Destroyed; + long long AllocCount; + + static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) { + for (;;) { + TNode* volatile prevHead = *headPtr; + n->Next = prevHead; + if (DoCas(headPtr, n, prevHead)) + break; + } + } + Y_FORCE_INLINE void* DoAlloc() { + TNode* res; + for (res = Head; res; res = Head) { + TNode* keepNext = res->Next; + if (DoCas(&Head, keepNext, res)) { + //Y_VERIFY(keepNext == res->Next); + break; + } + } + return res; + } + void FreeList(TNode* fl) { + if (!fl) + return; + TNode* flTail = fl; + while (flTail->Next) + flTail = flTail->Next; + for (;;) { + TNode* volatile prevHead = Head; + flTail->Next = prevHead; + if (DoCas(&Head, fl, prevHead)) + break; + } + } + + public: + Y_FORCE_INLINE void Free(void* ptr) { + TNode* newFree = (TNode*)ptr; + if (__sync_add_and_fetch(&AllocCount, 0) == 0) + Enqueue(&Head, newFree); + else + Enqueue(&Pending, newFree); + } + Y_FORCE_INLINE void Destroy(void* ptr, size_t length) { + TNode* newFree = 
(TNode*)ptr; + TNode* fl = nullptr; + if (__sync_add_and_fetch(&AllocCount, 1) == 1) { + fl = Destroyed; + if (fl && !DoCas(&Destroyed, (TNode*)nullptr, fl)) { + fl = nullptr; + } + Enqueue(&fl, newFree); + } else { + Enqueue(&Destroyed, newFree); + } + __sync_sub_and_fetch(&AllocCount, 1); + + // TODO try to merge blocks to minimize number of syscalls + while (nullptr != fl) { + TNode* next = fl->Next; + if (-1 == munmap(fl, length)) { + AbortFromSystemError(); + } + fl = next; + } + } + Y_FORCE_INLINE void* Alloc() { + long long volatile keepCounter = __sync_add_and_fetch(&PendingToFreeListCounter, 0); + TNode* fl = Pending; + if (__sync_add_and_fetch(&AllocCount, 1) == 1) { + // No other allocs in progress. + // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads. + // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList + if (fl && + keepCounter == __sync_add_and_fetch(&PendingToFreeListCounter, 0) && + DoCas(&Pending, (TNode*)nullptr, fl)) + { + // pick first element from Pending and return it + void* res = fl; + fl = fl->Next; + // if there are other elements in Pending list, add them to main free list + FreeList(fl); + __sync_sub_and_fetch(&PendingToFreeListCounter, 1); + __sync_sub_and_fetch(&AllocCount, 1); + return res; + } + } + void* res = DoAlloc(); + if (!res && __sync_add_and_fetch(&Pending, 0)) { + // live-lock situation: there are no free items in the "Head" + // list and there are free items in the "Pending" list + // but the items are forbidden to allocate to prevent ABA + NAllocStats::IncLiveLockCounter(); + } + __sync_sub_and_fetch(&AllocCount, 1); + return res; + } + }; + + TLFAllocFreeList nodes[2][ORDERS]; + unsigned long long sizesGC[2][16]; + unsigned long long sizeOS, totalOS; + + struct TBlockHeader { + size_t Size; + int RefCount; + unsigned short AllCount; + unsigned short NumaNode; + }; + + static bool PushPage(void* page, size_t order) { + if (order < 
ORDERS) { + int node = ((TBlockHeader*)page)->NumaNode; + __sync_add_and_fetch(&sizesGC[node][order % 16], order); + TBlockHeader* blockHeader = (TBlockHeader*)page; + if (!__sync_bool_compare_and_swap(&blockHeader->RefCount, 0, -1)) { + NMalloc::AbortFromCorruptedAllocator(); + } + nodes[node][order].Free(page); + return true; + } + return false; + } + + static void* PopPage(size_t order) { + if (order < ORDERS) { + int numaNode = GetNumaNode() & 1; + void* alloc = nodes[numaNode][order].Alloc(); + if (alloc == nullptr) { + alloc = nodes[1 - numaNode][order].Alloc(); + if (alloc) { + __sync_sub_and_fetch(&sizesGC[1 - numaNode][order % 16], order); + } + } else { + __sync_sub_and_fetch(&sizesGC[numaNode][order % 16], order); + } + if (alloc) { + TBlockHeader* blockHeader = (TBlockHeader*)alloc; + if (!__sync_bool_compare_and_swap(&blockHeader->RefCount, -1, 0)) { + NMalloc::AbortFromCorruptedAllocator(); + } + } + return alloc; + } + return nullptr; + } + +#if DUMP_STAT + static unsigned long long TickCounter() { + int lo = 0, hi = 0; + RDTSC(lo, hi); + return (((unsigned long long)hi) << 32) + (unsigned long long)lo; + } + + struct TTimeHold { + unsigned long long Start; + unsigned long long Finish; + const char* Name; + TTimeHold(const char* name) + : Start(TickCounter()) + , Name(name) + { + } + ~TTimeHold() { + Finish = TickCounter(); + double diff = Finish > Start ? 
(Finish - Start) / 1000000.0 : 0.0; + if (diff > 20.0) { + fprintf(stderr, "%s %f mticks\n", diff, Name); + } + } + }; +#endif + + long long allocs[ORDERS]; + + static void Map(size_t size, void* pages[], size_t num) { +#if DUMP_STAT + TTimeHold hold("mmap"); + size_t order = size / PAGE_ELEM; + if (order < ORDERS) { + __sync_add_and_fetch(&allocs[order], num); + } +#endif + if (!NAllocSetup::CanAlloc(__sync_add_and_fetch(&sizeOS, size * num), totalOS)) { + NMalloc::AbortFromCorruptedAllocator(); + } + void* map = mmap(nullptr, size * num, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (map == MAP_FAILED) { + AbortFromSystemError(); + } + unsigned short numaNode = (GetNumaNode() & 1); + NAllocStats::IncMmapCounter(size * num / PAGE_ELEM); + for (size_t i = 0; i < num; ++i) { + TBlockHeader* blockHeader = static_cast<TBlockHeader*>(map); + blockHeader->NumaNode = numaNode; + pages[i] = map; + map = Advance(map, size); + } + } + + static void* SysAlloc(size_t& size) { + size = Align(size, PAGE_ELEM); + size_t order = size / PAGE_ELEM; + void* result = PopPage(order); + if (result) { + return result; + } + void* pages[1] = {nullptr}; + Map(size, pages, 1); + return pages[0]; + } + + static void UnMap(void* block, size_t order) { +#if DUMP_STAT + TTimeHold hold("munmap"); + if (order < ORDERS) { + __sync_sub_and_fetch(&allocs[order], 1); + } +#endif + size_t size = order * PAGE_ELEM; + __sync_sub_and_fetch(&sizeOS, size); + TBlockHeader* blockHeader = (TBlockHeader*)block; + if (!__sync_bool_compare_and_swap(&blockHeader->RefCount, 0, -1)) { + NMalloc::AbortFromCorruptedAllocator(); + } + if (order < ORDERS) { + int node = blockHeader->NumaNode; + nodes[node][order].Destroy(block, size); + } else { + if (-1 == munmap(block, size)) { + AbortFromSystemError(); + } + } + } + + static void SysClear(size_t order) { + void* page = PopPage(order); + if (page) { + UnMap(page, order); + } + } + + static void Y_FORCE_INLINE GlobalInit() { + if 
(__sync_bool_compare_and_swap(&init, 0, 1)) { +#if HAVE_VDSO_GETCPU + getcpu = (int (*)(unsigned*, unsigned*, void*))NVdso::Function("__vdso_getcpu", "LINUX_2.6"); +#endif + LibcMalloc = (void* (*)(size_t))dlsym(RTLD_NEXT, "malloc"); + LibcFree = (void (*)(void*))dlsym(RTLD_NEXT, "free"); + pthread_key_create(&key, Destructor); + __sync_bool_compare_and_swap(&init, 1, 2); + } + while (init < 2) { + }; + } + + enum EMode { + Empty = 0, + Born, + Alive, + Disabled, + Dead, + ToBeEnabled + }; + + struct TLS { + void* PageCache[PAGE_CACHE]; + size_t Cached; + void* Chunk; + size_t Ptr; + void* Block; + int Counter; + EMode Mode; + unsigned char Count0; + unsigned long Count1; + bool NeedGC() { + if (Count0++ != 0) + return false; + __sync_add_and_fetch(&totalOS, 1); + unsigned long long count = 0; + for (size_t i = 0; i < 16; ++i) { + count += sizesGC[0][i]; + count += sizesGC[1][i]; + } + return NAllocSetup::NeedReclaim(count * PAGE_ELEM, ++Count1); + } + void ClearCount() { + Count1 = 0; + } + }; + +#if defined(_darwin_) + + static Y_FORCE_INLINE TLS* PthreadTls() { + GlobalInit(); + TLS* ptls = (TLS*)pthread_getspecific(key); + if (!ptls) { + ptls = (TLS*)LibcMalloc(sizeof(TLS)); + if (!ptls) { + NMalloc::AbortFromCorruptedAllocator(); // what do we do here? + } + memset(ptls, 0, sizeof(TLS)); + pthread_setspecific(key, ptls); + } + return ptls; + } + +#define tls (*PthreadTls()) + +#else + + __thread TLS tls; + +#endif + + static void UnRefHard(void* block, int add, TLS& ltls) { + TBlockHeader* blockHeader = (TBlockHeader*)block; + if ((blockHeader->RefCount == add ? 
(blockHeader->RefCount = 0, true) : false) || __sync_sub_and_fetch(&blockHeader->RefCount, add) == 0) { + size_t order = blockHeader->Size / PAGE_ELEM; + if (ltls.Mode == Alive) { + // page cache has first priority + if (order == 1 && ltls.Cached < PAGE_CACHE) { + ltls.PageCache[ltls.Cached] = block; + ++ltls.Cached; + return; + } + if (ltls.NeedGC()) { + ltls.ClearCount(); + size_t index = __sync_add_and_fetch(&counter, 1); + SysClear(index % ORDERS); + UnMap(block, order); + return; + } + } + if (!PushPage(block, order)) { + UnMap(block, order); + } + } + } + + static void Init(TLS& ltls) { + bool ShouldEnable = (NAllocSetup::IsEnabledByDefault() || ltls.Mode == ToBeEnabled); + ltls.Mode = Born; + GlobalInit(); + pthread_setspecific(key, (void*)<ls); + if (ShouldEnable) { + ltls.Mode = Alive; + } else { + ltls.Mode = Disabled; + } + } + + static void Y_FORCE_INLINE UnRef(void* block, int counter, TLS& ltls) { + if (ltls.Mode != Alive) { + UnRefHard(block, counter, ltls); + return; + } + if (ltls.Block == block) { + ltls.Counter += counter; + } else { + if (ltls.Block) { + UnRefHard(ltls.Block, ltls.Counter, ltls); + } + ltls.Block = block; + ltls.Counter = counter; + } + } + + static void Destructor(void* data) { + TLS& ltls = *(TLS*)data; + ltls.Mode = Dead; + if (ltls.Chunk) { + TBlockHeader* blockHeader = (TBlockHeader*)ltls.Chunk; + UnRef(ltls.Chunk, PAGE_ELEM - blockHeader->AllCount, ltls); + } + if (ltls.Block) { + UnRef(ltls.Block, ltls.Counter, ltls); + } + for (size_t i = 0; i < ltls.Cached; ++i) { + PushPage(ltls.PageCache[i], 1); + } +#if defined(_darwin_) + LibcFree(data); +#endif + } + + using TAllocHeader = NMalloc::TAllocHeader; + + static Y_FORCE_INLINE TAllocHeader* AllocateRaw(size_t size, size_t signature) { + TLS& ltls = tls; + size = Align(size, sizeof(TAllocHeader)); + if (Y_UNLIKELY(ltls.Mode == Empty || ltls.Mode == ToBeEnabled)) { + Init(ltls); + } + size_t extsize = size + sizeof(TAllocHeader) + sizeof(TBlockHeader); + if (extsize > 
SINGLE_ALLOC || ltls.Mode != Alive) { + // The dlsym() function in GlobalInit() may call malloc() resulting in recursive call + // of the NBalloc::Malloc(). We have to serve such allocation request via balloc even + // when (IsEnabledByDefault() == false) because at this point we don't know where the + // libc malloc is. + if (extsize > 64 * PAGE_ELEM) { + extsize = Align(extsize, 16 * PAGE_ELEM); + } + NAllocSetup::ThrowOnError(extsize); + void* block = SysAlloc(extsize); + TBlockHeader* blockHeader = (TBlockHeader*)block; + blockHeader->RefCount = 1; + blockHeader->Size = extsize; + blockHeader->AllCount = 0; + TAllocHeader* allocHeader = (TAllocHeader*)Advance(block, sizeof(TBlockHeader)); + allocHeader->Encode(blockHeader, size, signature); + if (NAllocStats::IsEnabled()) { + NAllocStats::IncThreadAllocStats(size); + } +#ifdef DBG_FILL_MEMORY + memset(allocHeader + 1, 0xec, size); +#endif + return allocHeader; + } + + size_t ptr = ltls.Ptr; + void* chunk = ltls.Chunk; + + if (ptr < extsize) { + NAllocSetup::ThrowOnError(PAGE_ELEM); + if (chunk) { + TBlockHeader* blockHeader = (TBlockHeader*)chunk; + UnRef(chunk, PAGE_ELEM - blockHeader->AllCount, ltls); + } + void* block = nullptr; + while (1) { + if (ltls.Cached > 0) { + --ltls.Cached; + block = ltls.PageCache[ltls.Cached]; + break; + } + block = PopPage(1); + if (block) { + break; + } + Map(PAGE_ELEM, ltls.PageCache, PAGE_CACHE); + ltls.Cached = PAGE_CACHE; + } + TBlockHeader* blockHeader = (TBlockHeader*)block; + blockHeader->RefCount = PAGE_ELEM; + blockHeader->Size = PAGE_ELEM; + blockHeader->AllCount = 0; + ltls.Ptr = PAGE_ELEM; + ltls.Chunk = block; + ptr = ltls.Ptr; + chunk = ltls.Chunk; + } + ptr = ptr - size - sizeof(TAllocHeader); + TAllocHeader* allocHeader = (TAllocHeader*)Advance(chunk, ptr); + allocHeader->Encode(chunk, size, signature); + TBlockHeader* blockHeader = (TBlockHeader*)chunk; + ++blockHeader->AllCount; + ltls.Ptr = ptr; + if (NAllocStats::IsEnabled()) { + 
NAllocStats::IncThreadAllocStats(size); + } +#ifdef DBG_FILL_MEMORY + memset(allocHeader + 1, 0xec, size); +#endif + return allocHeader; + } + + static void Y_FORCE_INLINE FreeRaw(void* ptr) { + UnRef(ptr, 1, tls); + } +} diff --git a/library/cpp/balloc/setup/CMakeLists.txt b/library/cpp/balloc/setup/CMakeLists.txt new file mode 100644 index 0000000000..82c9d69c0c --- /dev/null +++ b/library/cpp/balloc/setup/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-balloc-setup) +target_link_libraries(cpp-balloc-setup PUBLIC + contrib-libs-cxxsupp +) +target_sources(cpp-balloc-setup PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/balloc/setup/alloc.cpp + ${CMAKE_SOURCE_DIR}/library/cpp/balloc/setup/enable.cpp +) diff --git a/library/cpp/balloc/setup/alloc.cpp b/library/cpp/balloc/setup/alloc.cpp new file mode 100644 index 0000000000..f32b15df39 --- /dev/null +++ b/library/cpp/balloc/setup/alloc.cpp @@ -0,0 +1,102 @@ +#include <new> +#include <stdio.h> +#include <stdlib.h> + +#include "alloc.h" +#include "enable.h" +#include <util/system/platform.h> + +namespace NAllocSetup { + size_t softLimit = size_t(-1); + size_t hardLimit = size_t(-1); + size_t allocationThreshold = size_t(-1); + size_t softReclaimDivisor = 100; + size_t angryReclaimDivisor = 100; + + struct TThrowInfo { + size_t CurrSize; + size_t MaxSize; + }; +#if defined(_unix_) && !defined(_darwin_) + __thread TThrowInfo info; + void ThrowOnError(size_t allocSize) { + info.CurrSize += allocSize; + if (info.MaxSize && info.MaxSize < info.CurrSize) { +#ifndef NDEBUG + __builtin_trap(); +#endif + info.CurrSize = 0; 
+ throw std::bad_alloc(); + } + } + void SetThrowConditions(size_t currSize, size_t maxSize) { + info.CurrSize = currSize; + info.MaxSize = maxSize; + } +#else // _unix_ && ! _darwin_ + void ThrowOnError(size_t /*allocSize*/) { + } + void SetThrowConditions(size_t /*currSize*/, size_t /*maxSize*/) { + } +#endif // _unix_ && ! _darwin_ + + void SetSoftLimit(size_t softLimit_) { + softLimit = softLimit_; + } + void SetHardLimit(size_t hardLimit_) { + hardLimit = hardLimit_; + } + void SetAllocationThreshold(size_t allocationThreshold_) { + allocationThreshold = allocationThreshold_; + } + void SetSoftReclaimDivisor(size_t softReclaimDivisor_) { + softReclaimDivisor = softReclaimDivisor_; + } + void SetAngryReclaimDivisor(size_t angryReclaimDivisor_) { + angryReclaimDivisor = angryReclaimDivisor_; + } + size_t GetSoftLimit() { + return softLimit; + } + size_t GetHardLimit() { + return hardLimit; + } + size_t GetAllocationThreshold() { + return allocationThreshold; + } + size_t GetSoftReclaimDivisor() { + return softReclaimDivisor; + } + size_t GetAngryReclaimDivisor() { + return angryReclaimDivisor; + } + + size_t allocSize; + size_t totalAllocSize; + size_t gcSize; + + size_t GetTotalAllocSize() { + return totalAllocSize; + } + size_t GetCurSize() { + return allocSize; + } + size_t GetGCSize() { + return gcSize; + } + + bool CanAlloc(size_t allocSize_, size_t totalAllocSize_) { + allocSize = allocSize_; + totalAllocSize = totalAllocSize_; + return allocSize_ < hardLimit || totalAllocSize_ < allocationThreshold; + } + bool NeedReclaim(size_t gcSize_, size_t counter) { + gcSize = gcSize_; + size_t limit = gcSize_ < softLimit ? 
softReclaimDivisor : angryReclaimDivisor; + return counter > limit; + } + + bool IsEnabledByDefault() { + return EnableByDefault; + } +} diff --git a/library/cpp/balloc/setup/alloc.h b/library/cpp/balloc/setup/alloc.h new file mode 100644 index 0000000000..89fee3e3e7 --- /dev/null +++ b/library/cpp/balloc/setup/alloc.h @@ -0,0 +1,35 @@ +#pragma once + +#include <stddef.h> + +namespace NAllocSetup { + void ThrowOnError(size_t allocSize); + void SetThrowConditions(size_t currSize, size_t maxSize); + void SetSoftLimit(size_t softLimit); + void SetHardLimit(size_t hardLimit); + void SetAllocationThreshold(size_t allocationThreshold); + void SetSoftReclaimDivisor(size_t softReclaimDivisor); + void SetAngryReclaimDivisor(size_t angryReclaimDivisor); + bool CanAlloc(size_t allocSize, size_t totalAllocSize); + bool NeedReclaim(size_t gcSize_, size_t counter); + size_t GetTotalAllocSize(); + size_t GetCurSize(); + size_t GetGCSize(); + + size_t GetSoftLimit(); + size_t GetHardLimit(); + size_t GetAllocationThreshold(); + size_t GetSoftReclaimDivisor(); + size_t GetAngryReclaimDivisor(); + + bool IsEnabledByDefault(); + + struct TAllocGuard { + TAllocGuard(size_t maxSize) { + SetThrowConditions(0, maxSize); + } + ~TAllocGuard() { + SetThrowConditions(0, 0); + } + }; +} diff --git a/library/cpp/balloc/setup/disable_by_default/disable.cpp b/library/cpp/balloc/setup/disable_by_default/disable.cpp new file mode 100644 index 0000000000..fe39d5c559 --- /dev/null +++ b/library/cpp/balloc/setup/disable_by_default/disable.cpp @@ -0,0 +1,9 @@ +#include <library/cpp/balloc/setup/enable.h> + +#include <util/system/compiler.h> + +namespace NAllocSetup { + // Overriding a weak symbol defined in library/cpp/balloc/setup/enable.cpp. + // Don't link with this object if your platform doesn't support weak linkage. 
+ extern const bool EnableByDefault = false; +} diff --git a/library/cpp/balloc/setup/enable.cpp b/library/cpp/balloc/setup/enable.cpp new file mode 100644 index 0000000000..9eba81e7a7 --- /dev/null +++ b/library/cpp/balloc/setup/enable.cpp @@ -0,0 +1,9 @@ +#include "enable.h" + +#include <util/system/compiler.h> + +namespace NAllocSetup { + // This constant can be overridden on platforms that support weak linkage. + // See library/cpp/balloc/setup/disable_by_default/disable.cpp + extern const bool EnableByDefault Y_WEAK = true; +} diff --git a/library/cpp/balloc/setup/enable.h b/library/cpp/balloc/setup/enable.h new file mode 100644 index 0000000000..78449c1000 --- /dev/null +++ b/library/cpp/balloc/setup/enable.h @@ -0,0 +1,11 @@ +#pragma once + +namespace NAllocSetup { + // The EnableByDefault variable should always have static initialization. It is safe to use it in initialization + // of global and thread-local objects because the standard guarantees that static initialization always takes place + // before dynamic initialization: + // * C++11 3.6.2.2: "Static initialization shall be performed before any dynamic initialization takes place." + // * C++17 6.6.2.2: "All static initialization strongly happens before any dynamic initialization." + // In practice a constant value is just baked into the executable during linking. 
+ extern const bool EnableByDefault; +} diff --git a/library/cpp/balloc/test/do_with_disabled/main.cpp b/library/cpp/balloc/test/do_with_disabled/main.cpp new file mode 100644 index 0000000000..245887cb66 --- /dev/null +++ b/library/cpp/balloc/test/do_with_disabled/main.cpp @@ -0,0 +1,24 @@ +#include <library/cpp/balloc/optional/operators.h> + +#undef NDEBUG + +#include <cstdlib> +#include <cstring> +#include <cassert> + +// internal hook for testing +extern "C" bool IsOwnedByBalloc(void* ptr); + +int main() { + char* buf = (char*)malloc(100); + assert(false == IsOwnedByBalloc(buf)); + + ThreadEnableBalloc(); + char* buf2 = (char*)malloc(100); + assert(true == IsOwnedByBalloc(buf2)); + + free(buf); + free(buf2); + + return 0; +} diff --git a/library/cpp/balloc/test/do_with_enabled/main.cpp b/library/cpp/balloc/test/do_with_enabled/main.cpp new file mode 100644 index 0000000000..02c165ccc3 --- /dev/null +++ b/library/cpp/balloc/test/do_with_enabled/main.cpp @@ -0,0 +1,24 @@ +#include <library/cpp/balloc/optional/operators.h> + +#undef NDEBUG + +#include <cstdlib> +#include <cstring> +#include <cassert> + +// internal hook for testing
extern "C" bool IsOwnedByBalloc(void* ptr); + +int main() { + char* buf = (char*)malloc(100); + assert(true == IsOwnedByBalloc(buf)); + + ThreadDisableBalloc(); + char* buf2 = (char*)malloc(100); + assert(false == IsOwnedByBalloc(buf2)); + + free(buf); + free(buf2); + + return 0; +} |