diff options
Diffstat (limited to 'contrib/tools/python3/Objects/obmalloc.c')
| -rw-r--r-- | contrib/tools/python3/Objects/obmalloc.c | 1013 |
1 files changed, 931 insertions, 82 deletions
diff --git a/contrib/tools/python3/Objects/obmalloc.c b/contrib/tools/python3/Objects/obmalloc.c index 9620a8fbb44..3fdc9d01f2d 100644 --- a/contrib/tools/python3/Objects/obmalloc.c +++ b/contrib/tools/python3/Objects/obmalloc.c @@ -2,13 +2,30 @@ #include "Python.h" #include "pycore_code.h" // stats -#include "pycore_pystate.h" // _PyInterpreterState_GET - +#include "pycore_object.h" // _PyDebugAllocatorStats() definition #include "pycore_obmalloc.h" +#include "pycore_pyerrors.h" // _Py_FatalErrorFormat() #include "pycore_pymem.h" +#include "pycore_pystate.h" // _PyInterpreterState_GET +#include "pycore_obmalloc_init.h" #include <stdlib.h> // malloc() #include <stdbool.h> +#ifdef WITH_MIMALLOC +// Forward declarations of functions used in our mimalloc modifications +static void _PyMem_mi_page_clear_qsbr(mi_page_t *page); +static bool _PyMem_mi_page_is_safe_to_free(mi_page_t *page); +static bool _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force); +static void _PyMem_mi_page_reclaimed(mi_page_t *page); +static void _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap); +# include "pycore_mimalloc.h" +# include "mimalloc/static.c" +# include "mimalloc/internal.h" // for stats +#endif + +#if defined(Py_GIL_DISABLED) && !defined(WITH_MIMALLOC) +# error "Py_GIL_DISABLED requires WITH_MIMALLOC" +#endif #undef uint #define uint pymem_uint @@ -73,25 +90,272 @@ _PyMem_RawFree(void *Py_UNUSED(ctx), void *ptr) free(ptr); } +#ifdef WITH_MIMALLOC + +static void +_PyMem_mi_page_clear_qsbr(mi_page_t *page) +{ +#ifdef Py_GIL_DISABLED + // Clear the QSBR goal and remove the page from the QSBR linked list. + page->qsbr_goal = 0; + if (page->qsbr_node.next != NULL) { + llist_remove(&page->qsbr_node); + } +#endif +} + +// Check if an empty, newly reclaimed page is safe to free now. 
+static bool +_PyMem_mi_page_is_safe_to_free(mi_page_t *page) +{ + assert(mi_page_all_free(page)); +#ifdef Py_GIL_DISABLED + assert(page->qsbr_node.next == NULL); + if (page->use_qsbr && page->qsbr_goal != 0) { + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + if (tstate == NULL) { + return false; + } + return _Py_qbsr_goal_reached(tstate->qsbr, page->qsbr_goal); + } +#endif + return true; + +} + +#ifdef Py_GIL_DISABLED + +// If we are deferring collection of more than this amount of memory for +// mimalloc pages, advance the write sequence. Advancing allows these +// pages to be re-used in a different thread or for a different size class. +#define QSBR_PAGE_MEM_LIMIT 4096*20 + +// Return true if the global write sequence should be advanced for a mimalloc +// page that is deferred from collection. +static bool +should_advance_qsbr_for_page(struct _qsbr_thread_state *qsbr, mi_page_t *page) +{ + size_t bsize = mi_page_block_size(page); + size_t page_size = page->capacity*bsize; + if (page_size > QSBR_PAGE_MEM_LIMIT) { + qsbr->deferred_page_memory = 0; + return true; + } + qsbr->deferred_page_memory += page_size; + if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) { + qsbr->deferred_page_memory = 0; + return true; + } + return false; +} +#endif + +static bool +_PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force) +{ +#ifdef Py_GIL_DISABLED + assert(mi_page_all_free(page)); + if (page->use_qsbr) { + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)PyThreadState_GET(); + if (page->qsbr_goal != 0 && _Py_qbsr_goal_reached(tstate->qsbr, page->qsbr_goal)) { + _PyMem_mi_page_clear_qsbr(page); + _mi_page_free(page, pq, force); + return true; + } + + _PyMem_mi_page_clear_qsbr(page); + page->retire_expire = 0; + + if (should_advance_qsbr_for_page(tstate->qsbr, page)) { + page->qsbr_goal = _Py_qsbr_advance(tstate->qsbr->shared); + } + else { + page->qsbr_goal = _Py_qsbr_shared_next(tstate->qsbr->shared); + } + + 
llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node); + return false; + } +#endif + _mi_page_free(page, pq, force); + return true; +} + +static void +_PyMem_mi_page_reclaimed(mi_page_t *page) +{ +#ifdef Py_GIL_DISABLED + assert(page->qsbr_node.next == NULL); + if (page->qsbr_goal != 0) { + if (mi_page_all_free(page)) { + assert(page->qsbr_node.next == NULL); + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)PyThreadState_GET(); + page->retire_expire = 0; + llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node); + } + else { + page->qsbr_goal = 0; + } + } +#endif +} + +static void +_PyMem_mi_heap_collect_qsbr(mi_heap_t *heap) +{ +#ifdef Py_GIL_DISABLED + if (!heap->page_use_qsbr) { + return; + } + + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + struct llist_node *head = &tstate->mimalloc.page_list; + if (llist_empty(head)) { + return; + } + + struct llist_node *node; + llist_for_each_safe(node, head) { + mi_page_t *page = llist_data(node, mi_page_t, qsbr_node); + if (!mi_page_all_free(page)) { + // We allocated from this page some point after the delayed free + _PyMem_mi_page_clear_qsbr(page); + continue; + } + + if (!_Py_qsbr_poll(tstate->qsbr, page->qsbr_goal)) { + return; + } + + _PyMem_mi_page_clear_qsbr(page); + _mi_page_free(page, mi_page_queue_of(page), false); + } +#endif +} + +void * +_PyMem_MiMalloc(void *ctx, size_t size) +{ +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM]; + return mi_heap_malloc(heap, size); +#else + return mi_malloc(size); +#endif +} + +void * +_PyMem_MiCalloc(void *ctx, size_t nelem, size_t elsize) +{ +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM]; + return mi_heap_calloc(heap, nelem, elsize); +#else + return mi_calloc(nelem, elsize); +#endif +} + 
+void * +_PyMem_MiRealloc(void *ctx, void *ptr, size_t size) +{ +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM]; + return mi_heap_realloc(heap, ptr, size); +#else + return mi_realloc(ptr, size); +#endif +} + +void +_PyMem_MiFree(void *ctx, void *ptr) +{ + mi_free(ptr); +} + +void * +_PyObject_MiMalloc(void *ctx, size_t nbytes) +{ +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = tstate->mimalloc.current_object_heap; + return mi_heap_malloc(heap, nbytes); +#else + return mi_malloc(nbytes); +#endif +} + +void * +_PyObject_MiCalloc(void *ctx, size_t nelem, size_t elsize) +{ +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = tstate->mimalloc.current_object_heap; + return mi_heap_calloc(heap, nelem, elsize); +#else + return mi_calloc(nelem, elsize); +#endif +} + + +void * +_PyObject_MiRealloc(void *ctx, void *ptr, size_t nbytes) +{ +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = tstate->mimalloc.current_object_heap; + return mi_heap_realloc(heap, ptr, nbytes); +#else + return mi_realloc(ptr, nbytes); +#endif +} + +void +_PyObject_MiFree(void *ctx, void *ptr) +{ + mi_free(ptr); +} + +#endif // WITH_MIMALLOC + + #define MALLOC_ALLOC {NULL, _PyMem_RawMalloc, _PyMem_RawCalloc, _PyMem_RawRealloc, _PyMem_RawFree} -#define PYRAW_ALLOC MALLOC_ALLOC -/* the default object allocator */ + +#ifdef WITH_MIMALLOC +# define MIMALLOC_ALLOC {NULL, _PyMem_MiMalloc, _PyMem_MiCalloc, _PyMem_MiRealloc, _PyMem_MiFree} +# define MIMALLOC_OBJALLOC {NULL, _PyObject_MiMalloc, _PyObject_MiCalloc, _PyObject_MiRealloc, _PyObject_MiFree} +#endif + +/* the pymalloc allocator */ // The actual implementation is further down. 
-#ifdef WITH_PYMALLOC +#if defined(WITH_PYMALLOC) void* _PyObject_Malloc(void *ctx, size_t size); void* _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize); void _PyObject_Free(void *ctx, void *p); void* _PyObject_Realloc(void *ctx, void *ptr, size_t size); # define PYMALLOC_ALLOC {NULL, _PyObject_Malloc, _PyObject_Calloc, _PyObject_Realloc, _PyObject_Free} +#endif // WITH_PYMALLOC + +#if defined(Py_GIL_DISABLED) +// Py_GIL_DISABLED requires using mimalloc for "mem" and "obj" domains. +# define PYRAW_ALLOC MALLOC_ALLOC +# define PYMEM_ALLOC MIMALLOC_ALLOC +# define PYOBJ_ALLOC MIMALLOC_OBJALLOC +#elif defined(WITH_PYMALLOC) +# define PYRAW_ALLOC MALLOC_ALLOC +# define PYMEM_ALLOC PYMALLOC_ALLOC # define PYOBJ_ALLOC PYMALLOC_ALLOC #else +# define PYRAW_ALLOC MALLOC_ALLOC +# define PYMEM_ALLOC MALLOC_ALLOC # define PYOBJ_ALLOC MALLOC_ALLOC -#endif // WITH_PYMALLOC +#endif -#define PYMEM_ALLOC PYOBJ_ALLOC /* the default debug allocators */ @@ -156,8 +420,16 @@ _PyMem_ArenaFree(void *Py_UNUSED(ctx), void *ptr, ) { #ifdef MS_WINDOWS + /* Unlike free(), VirtualFree() does not special-case NULL to noop. */ + if (ptr == NULL) { + return; + } VirtualFree(ptr, 0, MEM_RELEASE); #elif defined(ARENAS_USE_MMAP) + /* Unlike free(), munmap() does not special-case NULL to noop. */ + if (ptr == NULL) { + return; + } munmap(ptr, size); #else free(ptr); @@ -258,13 +530,9 @@ int _PyMem_SetDefaultAllocator(PyMemAllocatorDomain domain, PyMemAllocatorEx *old_alloc) { - if (ALLOCATORS_MUTEX == NULL) { - /* The runtime must be initializing. 
*/ - return set_default_allocator_unlocked(domain, pydebug, old_alloc); - } - PyThread_acquire_lock(ALLOCATORS_MUTEX, WAIT_LOCK); + PyMutex_Lock(&ALLOCATORS_MUTEX); int res = set_default_allocator_unlocked(domain, pydebug, old_alloc); - PyThread_release_lock(ALLOCATORS_MUTEX); + PyMutex_Unlock(&ALLOCATORS_MUTEX); return res; } @@ -283,7 +551,7 @@ _PyMem_GetAllocatorName(const char *name, PyMemAllocatorName *allocator) else if (strcmp(name, "debug") == 0) { *allocator = PYMEM_ALLOCATOR_DEBUG; } -#ifdef WITH_PYMALLOC +#if defined(WITH_PYMALLOC) && !defined(Py_GIL_DISABLED) else if (strcmp(name, "pymalloc") == 0) { *allocator = PYMEM_ALLOCATOR_PYMALLOC; } @@ -291,12 +559,22 @@ _PyMem_GetAllocatorName(const char *name, PyMemAllocatorName *allocator) *allocator = PYMEM_ALLOCATOR_PYMALLOC_DEBUG; } #endif +#ifdef WITH_MIMALLOC + else if (strcmp(name, "mimalloc") == 0) { + *allocator = PYMEM_ALLOCATOR_MIMALLOC; + } + else if (strcmp(name, "mimalloc_debug") == 0) { + *allocator = PYMEM_ALLOCATOR_MIMALLOC_DEBUG; + } +#endif +#ifndef Py_GIL_DISABLED else if (strcmp(name, "malloc") == 0) { *allocator = PYMEM_ALLOCATOR_MALLOC; } else if (strcmp(name, "malloc_debug") == 0) { *allocator = PYMEM_ALLOCATOR_MALLOC_DEBUG; } +#endif else { /* unknown allocator */ return -1; @@ -317,12 +595,14 @@ set_up_allocators_unlocked(PyMemAllocatorName allocator) (void)set_default_allocator_unlocked(PYMEM_DOMAIN_RAW, pydebug, NULL); (void)set_default_allocator_unlocked(PYMEM_DOMAIN_MEM, pydebug, NULL); (void)set_default_allocator_unlocked(PYMEM_DOMAIN_OBJ, pydebug, NULL); + _PyRuntime.allocators.is_debug_enabled = pydebug; break; case PYMEM_ALLOCATOR_DEBUG: (void)set_default_allocator_unlocked(PYMEM_DOMAIN_RAW, 1, NULL); (void)set_default_allocator_unlocked(PYMEM_DOMAIN_MEM, 1, NULL); (void)set_default_allocator_unlocked(PYMEM_DOMAIN_OBJ, 1, NULL); + _PyRuntime.allocators.is_debug_enabled = 1; break; #ifdef WITH_PYMALLOC @@ -336,9 +616,33 @@ set_up_allocators_unlocked(PyMemAllocatorName 
allocator) set_allocator_unlocked(PYMEM_DOMAIN_MEM, &pymalloc); set_allocator_unlocked(PYMEM_DOMAIN_OBJ, &pymalloc); - if (allocator == PYMEM_ALLOCATOR_PYMALLOC_DEBUG) { + int is_debug = (allocator == PYMEM_ALLOCATOR_PYMALLOC_DEBUG); + _PyRuntime.allocators.is_debug_enabled = is_debug; + if (is_debug) { + set_up_debug_hooks_unlocked(); + } + break; + } +#endif +#ifdef WITH_MIMALLOC + case PYMEM_ALLOCATOR_MIMALLOC: + case PYMEM_ALLOCATOR_MIMALLOC_DEBUG: + { + PyMemAllocatorEx malloc_alloc = MALLOC_ALLOC; + set_allocator_unlocked(PYMEM_DOMAIN_RAW, &malloc_alloc); + + PyMemAllocatorEx pymalloc = MIMALLOC_ALLOC; + set_allocator_unlocked(PYMEM_DOMAIN_MEM, &pymalloc); + + PyMemAllocatorEx objmalloc = MIMALLOC_OBJALLOC; + set_allocator_unlocked(PYMEM_DOMAIN_OBJ, &objmalloc); + + int is_debug = (allocator == PYMEM_ALLOCATOR_MIMALLOC_DEBUG); + _PyRuntime.allocators.is_debug_enabled = is_debug; + if (is_debug) { set_up_debug_hooks_unlocked(); } + break; } #endif @@ -351,7 +655,9 @@ set_up_allocators_unlocked(PyMemAllocatorName allocator) set_allocator_unlocked(PYMEM_DOMAIN_MEM, &malloc_alloc); set_allocator_unlocked(PYMEM_DOMAIN_OBJ, &malloc_alloc); - if (allocator == PYMEM_ALLOCATOR_MALLOC_DEBUG) { + int is_debug = (allocator == PYMEM_ALLOCATOR_MALLOC_DEBUG); + _PyRuntime.allocators.is_debug_enabled = is_debug; + if (is_debug) { set_up_debug_hooks_unlocked(); } break; @@ -368,9 +674,9 @@ set_up_allocators_unlocked(PyMemAllocatorName allocator) int _PyMem_SetupAllocators(PyMemAllocatorName allocator) { - PyThread_acquire_lock(ALLOCATORS_MUTEX, WAIT_LOCK); + PyMutex_Lock(&ALLOCATORS_MUTEX); int res = set_up_allocators_unlocked(allocator); - PyThread_release_lock(ALLOCATORS_MUTEX); + PyMutex_Unlock(&ALLOCATORS_MUTEX); return res; } @@ -389,6 +695,10 @@ get_current_allocator_name_unlocked(void) #ifdef WITH_PYMALLOC PyMemAllocatorEx pymalloc = PYMALLOC_ALLOC; #endif +#ifdef WITH_MIMALLOC + PyMemAllocatorEx mimalloc = MIMALLOC_ALLOC; + PyMemAllocatorEx mimalloc_obj = 
MIMALLOC_OBJALLOC; +#endif if (pymemallocator_eq(&_PyMem_Raw, &malloc_alloc) && pymemallocator_eq(&_PyMem, &malloc_alloc) && @@ -404,6 +714,14 @@ get_current_allocator_name_unlocked(void) return "pymalloc"; } #endif +#ifdef WITH_MIMALLOC + if (pymemallocator_eq(&_PyMem_Raw, &malloc_alloc) && + pymemallocator_eq(&_PyMem, &mimalloc) && + pymemallocator_eq(&_PyObject, &mimalloc_obj)) + { + return "mimalloc"; + } +#endif PyMemAllocatorEx dbg_raw = PYDBGRAW_ALLOC; PyMemAllocatorEx dbg_mem = PYDBGMEM_ALLOC; @@ -428,6 +746,14 @@ get_current_allocator_name_unlocked(void) return "pymalloc_debug"; } #endif +#ifdef WITH_MIMALLOC + if (pymemallocator_eq(&_PyMem_Debug.raw.alloc, &malloc_alloc) && + pymemallocator_eq(&_PyMem_Debug.mem.alloc, &mimalloc) && + pymemallocator_eq(&_PyMem_Debug.obj.alloc, &mimalloc_obj)) + { + return "mimalloc_debug"; + } +#endif } return NULL; } @@ -435,20 +761,20 @@ get_current_allocator_name_unlocked(void) const char* _PyMem_GetCurrentAllocatorName(void) { - PyThread_acquire_lock(ALLOCATORS_MUTEX, WAIT_LOCK); + PyMutex_Lock(&ALLOCATORS_MUTEX); const char *name = get_current_allocator_name_unlocked(); - PyThread_release_lock(ALLOCATORS_MUTEX); + PyMutex_Unlock(&ALLOCATORS_MUTEX); return name; } -#ifdef WITH_PYMALLOC -static int +int _PyMem_DebugEnabled(void) { - return (_PyObject.malloc == _PyMem_DebugMalloc); + return _PyRuntime.allocators.is_debug_enabled; } +#ifdef WITH_PYMALLOC static int _PyMem_PymallocEnabled(void) { @@ -459,7 +785,25 @@ _PyMem_PymallocEnabled(void) return (_PyObject.malloc == _PyObject_Malloc); } } + +#ifdef WITH_MIMALLOC +static int +_PyMem_MimallocEnabled(void) +{ +#ifdef Py_GIL_DISABLED + return 1; +#else + if (_PyMem_DebugEnabled()) { + return (_PyMem_Debug.obj.alloc.malloc == _PyObject_MiMalloc); + } + else { + return (_PyObject.malloc == _PyObject_MiMalloc); + } #endif +} +#endif // WITH_MIMALLOC + +#endif // WITH_PYMALLOC static void @@ -515,19 +859,15 @@ set_up_debug_hooks_unlocked(void) 
set_up_debug_hooks_domain_unlocked(PYMEM_DOMAIN_RAW); set_up_debug_hooks_domain_unlocked(PYMEM_DOMAIN_MEM); set_up_debug_hooks_domain_unlocked(PYMEM_DOMAIN_OBJ); + _PyRuntime.allocators.is_debug_enabled = 1; } void PyMem_SetupDebugHooks(void) { - if (ALLOCATORS_MUTEX == NULL) { - /* The runtime must not be completely initialized yet. */ - set_up_debug_hooks_unlocked(); - return; - } - PyThread_acquire_lock(ALLOCATORS_MUTEX, WAIT_LOCK); + PyMutex_Lock(&ALLOCATORS_MUTEX); set_up_debug_hooks_unlocked(); - PyThread_release_lock(ALLOCATORS_MUTEX); + PyMutex_Unlock(&ALLOCATORS_MUTEX); } static void @@ -563,53 +903,33 @@ set_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator) void PyMem_GetAllocator(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator) { - if (ALLOCATORS_MUTEX == NULL) { - /* The runtime must not be completely initialized yet. */ - get_allocator_unlocked(domain, allocator); - return; - } - PyThread_acquire_lock(ALLOCATORS_MUTEX, WAIT_LOCK); + PyMutex_Lock(&ALLOCATORS_MUTEX); get_allocator_unlocked(domain, allocator); - PyThread_release_lock(ALLOCATORS_MUTEX); + PyMutex_Unlock(&ALLOCATORS_MUTEX); } void PyMem_SetAllocator(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator) { - if (ALLOCATORS_MUTEX == NULL) { - /* The runtime must not be completely initialized yet. */ - set_allocator_unlocked(domain, allocator); - return; - } - PyThread_acquire_lock(ALLOCATORS_MUTEX, WAIT_LOCK); + PyMutex_Lock(&ALLOCATORS_MUTEX); set_allocator_unlocked(domain, allocator); - PyThread_release_lock(ALLOCATORS_MUTEX); + PyMutex_Unlock(&ALLOCATORS_MUTEX); } void PyObject_GetArenaAllocator(PyObjectArenaAllocator *allocator) { - if (ALLOCATORS_MUTEX == NULL) { - /* The runtime must not be completely initialized yet. 
*/ - *allocator = _PyObject_Arena; - return; - } - PyThread_acquire_lock(ALLOCATORS_MUTEX, WAIT_LOCK); + PyMutex_Lock(&ALLOCATORS_MUTEX); *allocator = _PyObject_Arena; - PyThread_release_lock(ALLOCATORS_MUTEX); + PyMutex_Unlock(&ALLOCATORS_MUTEX); } void PyObject_SetArenaAllocator(PyObjectArenaAllocator *allocator) { - if (ALLOCATORS_MUTEX == NULL) { - /* The runtime must not be completely initialized yet. */ - _PyObject_Arena = *allocator; - return; - } - PyThread_acquire_lock(ALLOCATORS_MUTEX, WAIT_LOCK); + PyMutex_Lock(&ALLOCATORS_MUTEX); _PyObject_Arena = *allocator; - PyThread_release_lock(ALLOCATORS_MUTEX); + PyMutex_Unlock(&ALLOCATORS_MUTEX); } @@ -783,6 +1103,285 @@ _PyMem_Strdup(const char *str) return copy; } +/***********************************************/ +/* Delayed freeing support for Py_GIL_DISABLED */ +/***********************************************/ + +// So that sizeof(struct _mem_work_chunk) is 4096 bytes on 64-bit platforms. +#define WORK_ITEMS_PER_CHUNK 254 + +// A pointer to be freed once the QSBR read sequence reaches qsbr_goal. +struct _mem_work_item { + uintptr_t ptr; // lowest bit tagged 1 for objects freed with PyObject_Free + uint64_t qsbr_goal; +}; + +// A fixed-size buffer of pointers to be freed +struct _mem_work_chunk { + // Linked list node of chunks in queue + struct llist_node node; + + Py_ssize_t rd_idx; // index of next item to read + Py_ssize_t wr_idx; // index of next item to write + struct _mem_work_item array[WORK_ITEMS_PER_CHUNK]; +}; + +static void +free_work_item(uintptr_t ptr) +{ + if (ptr & 0x01) { + PyObject_Free((char *)(ptr - 1)); + } + else { + PyMem_Free((void *)ptr); + } +} + + +#ifdef Py_GIL_DISABLED + +// For deferred advance on free: the number of deferred items before advancing +// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally +// want to process a chunk before it overflows. +#define QSBR_DEFERRED_LIMIT 127 + +// If the deferred memory exceeds 1 MiB, advance the write sequence. 
This +// helps limit memory usage due to QSBR delaying frees too long. +#define QSBR_FREE_MEM_LIMIT 1024*1024 + +// Return true if the global write sequence should be advanced for a deferred +// memory free. +static bool +should_advance_qsbr_for_free(struct _qsbr_thread_state *qsbr, size_t size) +{ + if (size > QSBR_FREE_MEM_LIMIT) { + qsbr->deferred_count = 0; + qsbr->deferred_memory = 0; + qsbr->should_process = true; + return true; + } + qsbr->deferred_count++; + qsbr->deferred_memory += size; + if (qsbr->deferred_count > QSBR_DEFERRED_LIMIT || + qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) { + qsbr->deferred_count = 0; + qsbr->deferred_memory = 0; + qsbr->should_process = true; + return true; + } + return false; +} +#endif + +static void +free_delayed(uintptr_t ptr, size_t size) +{ +#ifndef Py_GIL_DISABLED + free_work_item(ptr); +#else + if (_PyRuntime.stoptheworld.world_stopped) { + // Free immediately if the world is stopped, including during + // interpreter shutdown. + free_work_item(ptr); + return; + } + + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + struct llist_node *head = &tstate->mem_free_queue; + + struct _mem_work_chunk *buf = NULL; + if (!llist_empty(head)) { + // Try to re-use the last buffer + buf = llist_data(head->prev, struct _mem_work_chunk, node); + if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) { + // already full + buf = NULL; + } + } + + if (buf == NULL) { + buf = PyMem_Calloc(1, sizeof(*buf)); + if (buf != NULL) { + llist_insert_tail(head, &buf->node); + } + } + + if (buf == NULL) { + // failed to allocate a buffer, free immediately + _PyEval_StopTheWorld(tstate->base.interp); + free_work_item(ptr); + _PyEval_StartTheWorld(tstate->base.interp); + return; + } + + assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK); + uint64_t seq; + if (should_advance_qsbr_for_free(tstate->qsbr, size)) { + seq = _Py_qsbr_advance(tstate->qsbr->shared); + } + else { + seq = _Py_qsbr_shared_next(tstate->qsbr->shared); + } + 
buf->array[buf->wr_idx].ptr = ptr; + buf->array[buf->wr_idx].qsbr_goal = seq; + buf->wr_idx++; + + if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) { + // Normally the processing of delayed items is done from the eval + // breaker. Processing here is a safety measure to ensure too much + // work does not accumulate. + _PyMem_ProcessDelayed((PyThreadState *)tstate); + } +#endif +} + +void +_PyMem_FreeDelayed(void *ptr, size_t size) +{ + assert(!((uintptr_t)ptr & 0x01)); + if (ptr != NULL) { + free_delayed((uintptr_t)ptr, size); + } +} + +void +_PyObject_FreeDelayed(void *ptr) +{ + assert(!((uintptr_t)ptr & 0x01)); + // We use 0 as the size since we don't have an easy way to know the + // actual size. If we are freeing many objects, the write sequence + // will be advanced due to QSBR_DEFERRED_LIMIT. + free_delayed(((uintptr_t)ptr)|0x01, 0); +} + +static struct _mem_work_chunk * +work_queue_first(struct llist_node *head) +{ + return llist_data(head->next, struct _mem_work_chunk, node); +} + +static void +process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr, + bool keep_empty) +{ + while (!llist_empty(head)) { + struct _mem_work_chunk *buf = work_queue_first(head); + + while (buf->rd_idx < buf->wr_idx) { + struct _mem_work_item *item = &buf->array[buf->rd_idx]; + if (!_Py_qsbr_poll(qsbr, item->qsbr_goal)) { + return; + } + + free_work_item(item->ptr); + buf->rd_idx++; + } + + assert(buf->rd_idx == buf->wr_idx); + if (keep_empty && buf->node.next == head) { + // Keep the last buffer in the queue to reduce re-allocations + buf->rd_idx = buf->wr_idx = 0; + return; + } + + llist_remove(&buf->node); + PyMem_Free(buf); + } +} + +static void +process_interp_queue(struct _Py_mem_interp_free_queue *queue, + struct _qsbr_thread_state *qsbr) +{ + assert(PyMutex_IsLocked(&queue->mutex)); + process_queue(&queue->head, qsbr, false); + + int more_work = !llist_empty(&queue->head); + _Py_atomic_store_int_relaxed(&queue->has_work, more_work); +} + +static void 
+maybe_process_interp_queue(struct _Py_mem_interp_free_queue *queue, + struct _qsbr_thread_state *qsbr) +{ + if (!_Py_atomic_load_int_relaxed(&queue->has_work)) { + return; + } + + // Try to acquire the lock, but don't block if it's already held. + if (_PyMutex_LockTimed(&queue->mutex, 0, 0) == PY_LOCK_ACQUIRED) { + process_interp_queue(queue, qsbr); + PyMutex_Unlock(&queue->mutex); + } +} + +void +_PyMem_ProcessDelayed(PyThreadState *tstate) +{ + PyInterpreterState *interp = tstate->interp; + _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate; + + tstate_impl->qsbr->should_process = false; + + // Process thread-local work + process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true); + + // Process shared interpreter work + maybe_process_interp_queue(&interp->mem_free_queue, tstate_impl->qsbr); +} + +void +_PyMem_AbandonDelayed(PyThreadState *tstate) +{ + PyInterpreterState *interp = tstate->interp; + struct llist_node *queue = &((_PyThreadStateImpl *)tstate)->mem_free_queue; + + if (llist_empty(queue)) { + return; + } + + // Check if the queue contains one empty buffer + struct _mem_work_chunk *buf = work_queue_first(queue); + if (buf->rd_idx == buf->wr_idx) { + llist_remove(&buf->node); + PyMem_Free(buf); + assert(llist_empty(queue)); + return; + } + + PyMutex_Lock(&interp->mem_free_queue.mutex); + + // Merge the thread's work queue into the interpreter's work queue. + llist_concat(&interp->mem_free_queue.head, queue); + + // Process the merged queue now (see gh-130794). 
+ _PyThreadStateImpl *this_tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + process_interp_queue(&interp->mem_free_queue, this_tstate->qsbr); + + PyMutex_Unlock(&interp->mem_free_queue.mutex); + + assert(llist_empty(queue)); // the thread's queue is now empty +} + +void +_PyMem_FiniDelayed(PyInterpreterState *interp) +{ + struct llist_node *head = &interp->mem_free_queue.head; + while (!llist_empty(head)) { + struct _mem_work_chunk *buf = work_queue_first(head); + + while (buf->rd_idx < buf->wr_idx) { + // Free the remaining items immediately. There should be no other + // threads accessing the memory at this point during shutdown. + struct _mem_work_item *item = &buf->array[buf->rd_idx]; + free_work_item(item->ptr); + buf->rd_idx++; + } + + llist_remove(&buf->node); + PyMem_Free(buf); + } +} /**************************/ /* the "object" allocator */ @@ -852,6 +1451,13 @@ static int running_on_valgrind = -1; typedef struct _obmalloc_state OMState; +/* obmalloc state for main interpreter and shared by all interpreters without + * their own obmalloc state. By not explicitly initalizing this structure, it + * will be allocated in the BSS which is a small performance win. The radix + * tree arrays are fairly large but are sparsely used. */ +static struct _obmalloc_state obmalloc_state_main; +static bool obmalloc_state_initialized; + static inline int has_own_state(PyInterpreterState *interp) { @@ -864,10 +1470,8 @@ static inline OMState * get_state(void) { PyInterpreterState *interp = _PyInterpreterState_GET(); - if (!has_own_state(interp)) { - interp = _PyInterpreterState_Main(); - } - return &interp->obmalloc; + assert(interp->obmalloc != NULL); // otherwise not initialized or freed + return interp->obmalloc; } // These macros all rely on a local "state" variable. 
@@ -882,9 +1486,51 @@ get_state(void) #define narenas_highwater (state->mgmt.narenas_highwater) #define raw_allocated_blocks (state->mgmt.raw_allocated_blocks) +#ifdef WITH_MIMALLOC +static bool count_blocks( + const mi_heap_t* heap, const mi_heap_area_t* area, + void* block, size_t block_size, void* allocated_blocks) +{ + *(size_t *)allocated_blocks += area->used; + return 1; +} + +static Py_ssize_t +get_mimalloc_allocated_blocks(PyInterpreterState *interp) +{ + size_t allocated_blocks = 0; +#ifdef Py_GIL_DISABLED + for (PyThreadState *t = interp->threads.head; t != NULL; t = t->next) { + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)t; + for (int i = 0; i < _Py_MIMALLOC_HEAP_COUNT; i++) { + mi_heap_t *heap = &tstate->mimalloc.heaps[i]; + mi_heap_visit_blocks(heap, false, &count_blocks, &allocated_blocks); + } + } + + mi_abandoned_pool_t *pool = &interp->mimalloc.abandoned_pool; + for (uint8_t tag = 0; tag < _Py_MIMALLOC_HEAP_COUNT; tag++) { + _mi_abandoned_pool_visit_blocks(pool, tag, false, &count_blocks, + &allocated_blocks); + } +#else + // TODO(sgross): this only counts the current thread's blocks. 
+ mi_heap_t *heap = mi_heap_get_default(); + mi_heap_visit_blocks(heap, false, &count_blocks, &allocated_blocks); +#endif + return allocated_blocks; +} +#endif + Py_ssize_t _PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *interp) { +#ifdef WITH_MIMALLOC + if (_PyMem_MimallocEnabled()) { + return get_mimalloc_allocated_blocks(interp); + } +#endif + #ifdef Py_DEBUG assert(has_own_state(interp)); #else @@ -893,7 +1539,11 @@ _PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *interp) "the interpreter doesn't have its own allocator"); } #endif - OMState *state = &interp->obmalloc; + OMState *state = interp->obmalloc; + + if (state == NULL) { + return 0; + } Py_ssize_t n = raw_allocated_blocks; /* add up allocated blocks for used pools */ @@ -915,19 +1565,36 @@ _PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *interp) return n; } +static void free_obmalloc_arenas(PyInterpreterState *interp); + void _PyInterpreterState_FinalizeAllocatedBlocks(PyInterpreterState *interp) { - if (has_own_state(interp)) { +#ifdef WITH_MIMALLOC + if (_PyMem_MimallocEnabled()) { + return; + } +#endif + if (has_own_state(interp) && interp->obmalloc != NULL) { Py_ssize_t leaked = _PyInterpreterState_GetAllocatedBlocks(interp); assert(has_own_state(interp) || leaked == 0); interp->runtime->obmalloc.interpreter_leaks += leaked; + if (_PyMem_obmalloc_state_on_heap(interp) && leaked == 0) { + // free the obmalloc arenas and radix tree nodes. If leaked > 0 + // then some of the memory allocated by obmalloc has not been + // freed. It might be safe to free the arenas in that case but + // it's possible that extension modules are still using that + // memory. So, it is safer to not free and to leak. Perhaps there + // should be warning when this happens. It should be possible to + // use a tool like "-fsanitize=address" to track down these leaks. 
+ free_obmalloc_arenas(interp); + } } } static Py_ssize_t get_num_global_allocated_blocks(_PyRuntimeState *); -/* We preserve the number of blockss leaked during runtime finalization, +/* We preserve the number of blocks leaked during runtime finalization, so they can be reported if the runtime is initialized again. */ // XXX We don't lose any information by dropping this, // so we should consider doing so. @@ -961,6 +1628,7 @@ get_num_global_allocated_blocks(_PyRuntimeState *runtime) } } else { + _PyEval_StopTheWorldAll(&_PyRuntime); HEAD_LOCK(runtime); PyInterpreterState *interp = PyInterpreterState_Head(); assert(interp != NULL); @@ -980,6 +1648,7 @@ get_num_global_allocated_blocks(_PyRuntimeState *runtime) } } HEAD_UNLOCK(runtime); + _PyEval_StartTheWorldAll(&_PyRuntime); #ifdef Py_DEBUG assert(got_main); #endif @@ -2034,6 +2703,33 @@ write_size_t(void *p, size_t n) } } +static void +fill_mem_debug(debug_alloc_api_t *api, void *data, int c, size_t nbytes, + bool is_alloc) +{ +#ifdef Py_GIL_DISABLED + if (api->api_id == 'o') { + // Don't overwrite the first few bytes of a PyObject allocation in the + // free-threaded build + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + size_t debug_offset; + if (is_alloc) { + debug_offset = tstate->mimalloc.current_object_heap->debug_offset; + } + else { + char *alloc = (char *)data - 2*SST; // start of the allocation + debug_offset = _mi_ptr_page(alloc)->debug_offset; + } + debug_offset -= 2*SST; // account for pymalloc extra bytes + if (debug_offset < nbytes) { + memset((char *)data + debug_offset, c, nbytes - debug_offset); + } + return; + } +#endif + memset(data, c, nbytes); +} + /* Let S = sizeof(size_t). 
The debug malloc asks for 4 * S extra bytes and fills them with useful stuff, here calling the underlying malloc's result p: @@ -2110,7 +2806,7 @@ _PyMem_DebugRawAlloc(int use_calloc, void *ctx, size_t nbytes) memset(p + SST + 1, PYMEM_FORBIDDENBYTE, SST-1); if (nbytes > 0 && !use_calloc) { - memset(data, PYMEM_CLEANBYTE, nbytes); + fill_mem_debug(api, data, PYMEM_CLEANBYTE, nbytes, true); } /* at tail, write pad (SST bytes) and serialno (SST bytes) */ @@ -2158,8 +2854,9 @@ _PyMem_DebugRawFree(void *ctx, void *p) _PyMem_DebugCheckAddress(__func__, api->api_id, p); nbytes = read_size_t(q); - nbytes += PYMEM_DEBUG_EXTRA_BYTES; - memset(q, PYMEM_DEADBYTE, nbytes); + nbytes += PYMEM_DEBUG_EXTRA_BYTES - 2*SST; + memset(q, PYMEM_DEADBYTE, 2*SST); + fill_mem_debug(api, p, PYMEM_DEADBYTE, nbytes, false); api->alloc.free(api->alloc.ctx, q); } @@ -2179,7 +2876,6 @@ _PyMem_DebugRawRealloc(void *ctx, void *p, size_t nbytes) size_t total; /* 2 * SST + nbytes + 2 * SST */ size_t original_nbytes; #define ERASED_SIZE 64 - uint8_t save[2*ERASED_SIZE]; /* A copy of erased bytes. */ _PyMem_DebugCheckAddress(__func__, api->api_id, p); @@ -2196,9 +2892,11 @@ _PyMem_DebugRawRealloc(void *ctx, void *p, size_t nbytes) #ifdef PYMEM_DEBUG_SERIALNO size_t block_serialno = read_size_t(tail + SST); #endif +#ifndef Py_GIL_DISABLED /* Mark the header, the trailer, ERASED_SIZE bytes at the begin and ERASED_SIZE bytes at the end as dead and save the copy of erased bytes. */ + uint8_t save[2*ERASED_SIZE]; /* A copy of erased bytes. */ if (original_nbytes <= sizeof(save)) { memcpy(save, data, original_nbytes); memset(data - 2 * SST, PYMEM_DEADBYTE, @@ -2211,6 +2909,7 @@ _PyMem_DebugRawRealloc(void *ctx, void *p, size_t nbytes) memset(tail - ERASED_SIZE, PYMEM_DEADBYTE, ERASED_SIZE + PYMEM_DEBUG_EXTRA_BYTES - 2 * SST); } +#endif /* Resize and add decorations. 
*/ r = (uint8_t *)api->alloc.realloc(api->alloc.ctx, head, total); @@ -2238,6 +2937,7 @@ _PyMem_DebugRawRealloc(void *ctx, void *p, size_t nbytes) write_size_t(tail + SST, block_serialno); #endif +#ifndef Py_GIL_DISABLED /* Restore saved bytes. */ if (original_nbytes <= sizeof(save)) { memcpy(data, save, Py_MIN(nbytes, original_nbytes)); @@ -2250,6 +2950,7 @@ _PyMem_DebugRawRealloc(void *ctx, void *p, size_t nbytes) Py_MIN(nbytes - i, ERASED_SIZE)); } } +#endif if (r == NULL) { return NULL; @@ -2511,9 +3212,96 @@ _PyDebugAllocatorStats(FILE *out, (void)printone(out, buf2, num_blocks * sizeof_block); } +// Return true if the obmalloc state structure is heap allocated, +// by PyMem_RawCalloc(). For the main interpreter, this structure +// is allocated in the BSS. Allocating that way gives some memory savings +// and a small performance win (at least on a demand paged OS). On +// 64-bit platforms, the obmalloc structure is 256 kB. Most of that +// memory is for the arena_map_top array. Since normally only one entry +// of that array is used, only one page of resident memory is actually +// used, rather than the full 256 kB. +bool _PyMem_obmalloc_state_on_heap(PyInterpreterState *interp) +{ +#if WITH_PYMALLOC + return interp->obmalloc && interp->obmalloc != &obmalloc_state_main; +#else + return false; +#endif +} + +#ifdef WITH_PYMALLOC +static void +init_obmalloc_pools(PyInterpreterState *interp) +{ + // initialize the obmalloc->pools structure. This must be done + // before the obmalloc alloc/free functions can be called. + poolp temp[OBMALLOC_USED_POOLS_SIZE] = + _obmalloc_pools_INIT(interp->obmalloc->pools); + memcpy(&interp->obmalloc->pools.used, temp, sizeof(temp)); +} +#endif /* WITH_PYMALLOC */ + +int _PyMem_init_obmalloc(PyInterpreterState *interp) +{ +#ifdef WITH_PYMALLOC + /* Initialize obmalloc, but only for subinterpreters, + since the main interpreter is initialized statically.
*/ + if (_Py_IsMainInterpreter(interp) + || _PyInterpreterState_HasFeature(interp, + Py_RTFLAGS_USE_MAIN_OBMALLOC)) { + interp->obmalloc = &obmalloc_state_main; + if (!obmalloc_state_initialized) { + init_obmalloc_pools(interp); + obmalloc_state_initialized = true; + } + } else { + interp->obmalloc = PyMem_RawCalloc(1, sizeof(struct _obmalloc_state)); + if (interp->obmalloc == NULL) { + return -1; + } + init_obmalloc_pools(interp); + } +#endif /* WITH_PYMALLOC */ + return 0; // success +} + #ifdef WITH_PYMALLOC +static void +free_obmalloc_arenas(PyInterpreterState *interp) +{ + OMState *state = interp->obmalloc; + for (uint i = 0; i < maxarenas; ++i) { + // free each obmalloc memory arena + struct arena_object *ao = &allarenas[i]; + _PyObject_Arena.free(_PyObject_Arena.ctx, + (void *)ao->address, ARENA_SIZE); + } + // free the array containing pointers to all arenas + PyMem_RawFree(allarenas); +#if WITH_PYMALLOC_RADIX_TREE +#ifdef USE_INTERIOR_NODES + // Free the middle and bottom nodes of the radix tree. These are allocated + // by arena_map_mark_used() but not freed when arenas are freed. + for (int i1 = 0; i1 < MAP_TOP_LENGTH; i1++) { + arena_map_mid_t *mid = arena_map_root.ptrs[i1]; + if (mid == NULL) { + continue; + } + for (int i2 = 0; i2 < MAP_MID_LENGTH; i2++) { + arena_map_bot_t *bot = arena_map_root.ptrs[i1]->ptrs[i2]; + if (bot == NULL) { + continue; + } + PyMem_RawFree(bot); + } + PyMem_RawFree(mid); + } +#endif +#endif +} + #ifdef Py_DEBUG /* Is target in the list? The list is traversed via the nextpool pointers. * The list may be NULL-terminated, or circular. Return 1 if target is in @@ -2535,19 +3323,55 @@ pool_is_in_list(const poolp target, poolp list) } #endif -/* Print summary info to "out" about the state of pymalloc's structures. - * In Py_DEBUG mode, also perform some expensive internal consistency - * checks. - * - * Return 0 if the memory debug hooks are not installed or no statistics was - * written into out, return 1 otherwise. 
- */ -int -_PyObject_DebugMallocStats(FILE *out) +#ifdef WITH_MIMALLOC +struct _alloc_stats { + size_t allocated_blocks; + size_t allocated_bytes; + size_t allocated_with_overhead; + size_t bytes_reserved; + size_t bytes_committed; +}; + +static bool _collect_alloc_stats( + const mi_heap_t* heap, const mi_heap_area_t* area, + void* block, size_t block_size, void* arg) +{ + struct _alloc_stats *stats = (struct _alloc_stats *)arg; + stats->allocated_blocks += area->used; + stats->allocated_bytes += area->used * area->block_size; + stats->allocated_with_overhead += area->used * area->full_block_size; + stats->bytes_reserved += area->reserved; + stats->bytes_committed += area->committed; + return 1; +} + +static void +py_mimalloc_print_stats(FILE *out) +{ + fprintf(out, "Small block threshold = %zd, in %u size classes.\n", + MI_SMALL_OBJ_SIZE_MAX, MI_BIN_HUGE); + fprintf(out, "Medium block threshold = %zd\n", + MI_MEDIUM_OBJ_SIZE_MAX); + fprintf(out, "Large object max size = %zd\n", + MI_LARGE_OBJ_SIZE_MAX); + + mi_heap_t *heap = mi_heap_get_default(); + struct _alloc_stats stats; + memset(&stats, 0, sizeof(stats)); + mi_heap_visit_blocks(heap, false, &_collect_alloc_stats, &stats); + + fprintf(out, " Allocated Blocks: %zd\n", stats.allocated_blocks); + fprintf(out, " Allocated Bytes: %zd\n", stats.allocated_bytes); + fprintf(out, " Allocated Bytes w/ Overhead: %zd\n", stats.allocated_with_overhead); + fprintf(out, " Bytes Reserved: %zd\n", stats.bytes_reserved); + fprintf(out, " Bytes Committed: %zd\n", stats.bytes_committed); +} +#endif + + +static void +pymalloc_print_stats(FILE *out) { - if (!_PyMem_PymallocEnabled()) { - return 0; - } OMState *state = get_state(); uint i; @@ -2700,7 +3524,32 @@ _PyObject_DebugMallocStats(FILE *out) #endif #endif - return 1; +} + +/* Print summary info to "out" about the state of pymalloc's structures. + * In Py_DEBUG mode, also perform some expensive internal consistency + * checks. 
+ * + * Return 1 if statistics were written into out, i.e. when + * _PyMem_MimallocEnabled() or _PyMem_PymallocEnabled() is true; return 0 otherwise. + */ +int +_PyObject_DebugMallocStats(FILE *out) +{ +#ifdef WITH_MIMALLOC + if (_PyMem_MimallocEnabled()) { + py_mimalloc_print_stats(out); + return 1; + } + else +#endif + if (_PyMem_PymallocEnabled()) { + pymalloc_print_stats(out); + return 1; + } + else { + return 0; + } } #endif /* #ifdef WITH_PYMALLOC */ |
