diff options
author | shadchin <shadchin@yandex-team.com> | 2024-02-12 07:53:52 +0300 |
---|---|---|
committer | shadchin <shadchin@yandex-team.com> | 2024-02-12 08:07:36 +0300 |
commit | ce1b7ca3171f9158180640c6a02a74b4afffedea (patch) | |
tree | e47c1e8391b1b0128262c1e9b1e6ed4c8fff2348 /contrib/tools/python3/src/Python/perf_trampoline.c | |
parent | 57350d96f030db90f220ce50ee591d5c5d403df7 (diff) | |
download | ydb-ce1b7ca3171f9158180640c6a02a74b4afffedea.tar.gz |
Update Python from 3.11.8 to 3.12.2
Diffstat (limited to 'contrib/tools/python3/src/Python/perf_trampoline.c')
-rw-r--r-- | contrib/tools/python3/src/Python/perf_trampoline.c | 493 |
1 files changed, 493 insertions, 0 deletions
diff --git a/contrib/tools/python3/src/Python/perf_trampoline.c b/contrib/tools/python3/src/Python/perf_trampoline.c new file mode 100644 index 0000000000..ea9dc83dd0 --- /dev/null +++ b/contrib/tools/python3/src/Python/perf_trampoline.c @@ -0,0 +1,493 @@ +/* + +Perf trampoline instrumentation +=============================== + +This file contains instrumentation to allow to associate +calls to the CPython eval loop back to the names of the Python +functions and filename being executed. + +Many native performance profilers like the Linux perf tools are +only available to 'see' the C stack when sampling from the profiled +process. This means that if we have the following python code: + + import time + def foo(n): + # Some CPU intensive code + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + baz(10000000) + +A performance profiler that is only able to see native frames will +produce the following backtrace when sampling from foo(): + + _PyEval_EvalFrameDefault -----> Evaluation frame of foo() + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault ------> Evaluation frame of bar() + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault -------> Evaluation frame of baz() + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + ... + + Py_RunMain + +Because the profiler is only able to see the native frames and the native +function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault) +then the profiler and any reporter generated by it will not be able to +associate the names of the Python functions and the filenames associated with +those calls, rendering the results useless in the Python world. + +To fix this problem, we introduce the concept of a trampoline frame. A +trampoline frame is a piece of code that is unique per Python code object that +is executed before entering the CPython eval loop. This piece of code just +calls the original Python evaluation function (_PyEval_EvalFrameDefault) and +forwards all the arguments received. In this way, when a profiler samples +frames from the previous example it will see; + + _PyEval_EvalFrameDefault -----> Evaluation frame of foo() + [Jit compiled code 3] + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault ------> Evaluation frame of bar() + [Jit compiled code 2] + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault -------> Evaluation frame of baz() + [Jit compiled code 1] + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + ... + + Py_RunMain + +When we generate every unique copy of the trampoline (what here we called "[Jit +compiled code N]") we write the relationship between the compiled code and the +Python function that is associated with it. Every profiler requires this +information in a different format. For example, the Linux "perf" profiler +requires a file in "/tmp/perf-PID.map" (name and location not configurable) +with the following format: + + <compiled code address> <compiled code size> <name of the compiled code> + +If this file is available when "perf" generates reports, it will automatically +associate every trampoline with the Python function that it is associated with +allowing it to generate reports that include Python information. These reports +then can also be filtered in a way that *only* Python information appears. + +Notice that for this to work, there must be a unique copied of the trampoline +per Python code object even if the code in the trampoline is the same. To +achieve this we have a assembly template in Objects/asm_trampiline.S that is +compiled into the Python executable/shared library. This template generates a +symbol that maps the start of the assembly code and another that marks the end +of the assembly code for the trampoline. Then, every time we need a unique +trampoline for a Python code object, we copy the assembly code into a mmaped +area that has executable permissions and we return the start of that area as +our trampoline function. + +Asking for a mmap-ed memory area for trampoline is very wasteful so we +allocate big arenas of memory in a single mmap call, we populate the entire +arena with copies of the trampoline (this allows us to now have to invalidate +the icache for the instructions in the page) and then we return the next +available chunk every time someone asks for a new trampoline. We keep a linked +list of arenas in case the current memory arena is exhausted and another one is +needed. + +For the best results, Python should be compiled with +CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows +profilers to unwind using only the frame pointer and not on DWARF debug +information (note that as trampilines are dynamically generated there won't be +any DWARF information available for them). +*/ + +#include "Python.h" +#include "pycore_ceval.h" +#include "pycore_frame.h" +#include "pycore_interp.h" + + +#ifdef PY_HAVE_PERF_TRAMPOLINE + +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <unistd.h> + +#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__) +#define PY_HAVE_INVALIDATE_ICACHE + +#if defined(__clang__) || defined(__GNUC__) +extern void __clear_cache(void *, void*); +#endif + +static void invalidate_icache(char* begin, char*end) { +#if defined(__clang__) || defined(__GNUC__) + return __clear_cache(begin, end); +#else + return; +#endif +} +#endif + +/* The function pointer is passed as last argument. The other three arguments + * are passed in the same order as the function requires. This results in + * shorter, more efficient ASM code for trampoline. + */ +typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *, + int throwflag); +typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int, + py_evaluator); + +extern void *_Py_trampoline_func_start; // Start of the template of the + // assembly trampoline +extern void * + _Py_trampoline_func_end; // End of the template of the assembly trampoline + +struct code_arena_st { + char *start_addr; // Start of the memory arena + char *current_addr; // Address of the current trampoline within the arena + size_t size; // Size of the memory arena + size_t size_left; // Remaining size of the memory arena + size_t code_size; // Size of the code of every trampoline in the arena + struct code_arena_st + *prev; // Pointer to the arena or NULL if this is the first arena. +}; + +typedef struct code_arena_st code_arena_t; +typedef struct trampoline_api_st trampoline_api_t; + +#define perf_status _PyRuntime.ceval.perf.status +#define extra_code_index _PyRuntime.ceval.perf.extra_code_index +#define perf_code_arena _PyRuntime.ceval.perf.code_arena +#define trampoline_api _PyRuntime.ceval.perf.trampoline_api +#define perf_map_file _PyRuntime.ceval.perf.map_file + + +static void +perf_map_write_entry(void *state, const void *code_addr, + unsigned int code_size, PyCodeObject *co) +{ + const char *entry = ""; + if (co->co_qualname != NULL) { + entry = PyUnicode_AsUTF8(co->co_qualname); + } + const char *filename = ""; + if (co->co_filename != NULL) { + filename = PyUnicode_AsUTF8(co->co_filename); + } + size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; + char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); + if (perf_map_entry == NULL) { + return; + } + snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); + PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry); + PyMem_RawFree(perf_map_entry); +} + +static void* +perf_map_init_state(void) +{ + PyUnstable_PerfMapState_Init(); + return NULL; +} + +static int +perf_map_free_state(void *state) +{ + PyUnstable_PerfMapState_Fini(); + return 0; +} + +_PyPerf_Callbacks _Py_perfmap_callbacks = { + &perf_map_init_state, + &perf_map_write_entry, + &perf_map_free_state, +}; + +static int +new_code_arena(void) +{ + // non-trivial programs typically need 64 to 256 kiB. + size_t mem_size = 4096 * 16; + assert(mem_size % sysconf(_SC_PAGESIZE) == 0); + char *memory = + mmap(NULL, // address + mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, // fd (not used here) + 0); // offset (not used here) + if (memory == MAP_FAILED) { + PyErr_SetFromErrno(PyExc_OSError); + _PyErr_WriteUnraisableMsg( + "Failed to create new mmap for perf trampoline", NULL); + perf_status = PERF_STATUS_FAILED; + return -1; + } + void *start = &_Py_trampoline_func_start; + void *end = &_Py_trampoline_func_end; + size_t code_size = end - start; + // TODO: Check the effect of alignment of the code chunks. Initial investigation + // showed that this has no effect on performance in x86-64 or aarch64 and the current + // version has the advantage that the unwinder in GDB can unwind across JIT-ed code. + // + // We should check the values in the future and see if there is a + // measurable performance improvement by rounding trampolines up to 32-bit + // or 64-bit alignment. + + size_t n_copies = mem_size / code_size; + for (size_t i = 0; i < n_copies; i++) { + memcpy(memory + i * code_size, start, code_size * sizeof(char)); + } + // Some systems may prevent us from creating executable code on the fly. + int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC); + if (res == -1) { + PyErr_SetFromErrno(PyExc_OSError); + munmap(memory, mem_size); + _PyErr_WriteUnraisableMsg( + "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC", + NULL); + return -1; + } + +#ifdef PY_HAVE_INVALIDATE_ICACHE + // Before the JIT can run a block of code that has been emitted it must invalidate + // the instruction cache on some platforms like arm and aarch64. + invalidate_icache(memory, memory + mem_size); +#endif + + code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t)); + if (new_arena == NULL) { + PyErr_NoMemory(); + munmap(memory, mem_size); + _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct", + NULL); + return -1; + } + + new_arena->start_addr = memory; + new_arena->current_addr = memory; + new_arena->size = mem_size; + new_arena->size_left = mem_size; + new_arena->code_size = code_size; + new_arena->prev = perf_code_arena; + perf_code_arena = new_arena; + return 0; +} + +static void +free_code_arenas(void) +{ + code_arena_t *cur = perf_code_arena; + code_arena_t *prev; + perf_code_arena = NULL; // invalid static pointer + while (cur) { + munmap(cur->start_addr, cur->size); + prev = cur->prev; + PyMem_RawFree(cur); + cur = prev; + } +} + +static inline py_trampoline +code_arena_new_code(code_arena_t *code_arena) +{ + py_trampoline trampoline = (py_trampoline)code_arena->current_addr; + code_arena->size_left -= code_arena->code_size; + code_arena->current_addr += code_arena->code_size; + return trampoline; +} + +static inline py_trampoline +compile_trampoline(void) +{ + if ((perf_code_arena == NULL) || + (perf_code_arena->size_left <= perf_code_arena->code_size)) { + if (new_code_arena() < 0) { + return NULL; + } + } + assert(perf_code_arena->size_left <= perf_code_arena->size); + return code_arena_new_code(perf_code_arena); +} + +static PyObject * +py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame, + int throw) +{ + if (perf_status == PERF_STATUS_FAILED || + perf_status == PERF_STATUS_NO_INIT) { + goto default_eval; + } + PyCodeObject *co = frame->f_code; + py_trampoline f = NULL; + assert(extra_code_index != -1); + int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f); + if (ret != 0 || f == NULL) { + // This is the first time we see this code object so we need + // to compile a trampoline for it. + py_trampoline new_trampoline = compile_trampoline(); + if (new_trampoline == NULL) { + goto default_eval; + } + trampoline_api.write_state(trampoline_api.state, new_trampoline, + perf_code_arena->code_size, co); + _PyCode_SetExtra((PyObject *)co, extra_code_index, + (void *)new_trampoline); + f = new_trampoline; + } + assert(f != NULL); + return f(ts, frame, throw, _PyEval_EvalFrameDefault); +default_eval: + // Something failed, fall back to the default evaluator. + return _PyEval_EvalFrameDefault(ts, frame, throw); +} +#endif // PY_HAVE_PERF_TRAMPOLINE + +int +_PyIsPerfTrampolineActive(void) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + PyThreadState *tstate = _PyThreadState_GET(); + return tstate->interp->eval_frame == py_trampoline_evaluator; +#endif + return 0; +} + +void +_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks) +{ + if (callbacks == NULL) { + return; + } +#ifdef PY_HAVE_PERF_TRAMPOLINE + callbacks->init_state = trampoline_api.init_state; + callbacks->write_state = trampoline_api.write_state; + callbacks->free_state = trampoline_api.free_state; +#endif + return; +} + +int +_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks) +{ + if (callbacks == NULL) { + return -1; + } +#ifdef PY_HAVE_PERF_TRAMPOLINE + if (trampoline_api.state) { + _PyPerfTrampoline_Fini(); + } + trampoline_api.init_state = callbacks->init_state; + trampoline_api.write_state = callbacks->write_state; + trampoline_api.free_state = callbacks->free_state; + trampoline_api.state = NULL; +#endif + return 0; +} + +void _PyPerfTrampoline_FreeArenas(void) { +#ifdef PY_HAVE_PERF_TRAMPOLINE + free_code_arenas(); +#endif + return; +} + +int +_PyPerfTrampoline_Init(int activate) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate->interp->eval_frame && + tstate->interp->eval_frame != py_trampoline_evaluator) { + PyErr_SetString(PyExc_RuntimeError, + "Trampoline cannot be initialized as a custom eval " + "frame is already present"); + return -1; + } + if (!activate) { + tstate->interp->eval_frame = NULL; + perf_status = PERF_STATUS_NO_INIT; + } + else { + tstate->interp->eval_frame = py_trampoline_evaluator; + if (new_code_arena() < 0) { + return -1; + } + extra_code_index = _PyEval_RequestCodeExtraIndex(NULL); + if (extra_code_index == -1) { + return -1; + } + if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) { + trampoline_api.state = trampoline_api.init_state(); + } + perf_status = PERF_STATUS_OK; + } +#endif + return 0; +} + +int +_PyPerfTrampoline_Fini(void) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + if (perf_status != PERF_STATUS_OK) { + return 0; + } + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate->interp->eval_frame == py_trampoline_evaluator) { + tstate->interp->eval_frame = NULL; + } + if (perf_status == PERF_STATUS_OK) { + trampoline_api.free_state(trampoline_api.state); + } + extra_code_index = -1; + perf_status = PERF_STATUS_NO_INIT; +#endif + return 0; +} + +PyStatus +_PyPerfTrampoline_AfterFork_Child(void) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + // Restart trampoline in file in child. + int was_active = _PyIsPerfTrampolineActive(); + _PyPerfTrampoline_Fini(); + PyUnstable_PerfMapState_Fini(); + if (was_active) { + _PyPerfTrampoline_Init(1); + } +#endif + return PyStatus_Ok(); +} |