| field | value |
|---|---|
| author | thegeorg <thegeorg@yandex-team.ru>, 2022-06-03 10:53:07 +0300 |
| committer | thegeorg <thegeorg@yandex-team.ru>, 2022-06-03 10:53:07 +0300 |
| commit | a1d4361e379e2c72a469ad1bd64569cbc2db131f (patch) |
| tree | 0caddb240a10132376e4653a31578e117d33f9fd /contrib/libs/cxxsupp |
| parent | 41f55a521834080d9d703c099c0418cfff3a0546 (diff) |
| download | ydb-a1d4361e379e2c72a469ad1bd64569cbc2db131f.tar.gz |
Update contrib/libs/cxxsupp/openmp to 14.0.4
ref:77c6cdda99b217d50c4deadca11f5611fa0dc168
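
Much of the diff below adds hybrid-CPU (performance/efficiency core) awareness to the runtime: a `kmp_hw_core_type_t` enum, core attributes in `KMP_HW_SUBSET`, and detection via CPUID leaf 0x1A in the new `__kmp_get_hybrid_info()`. The standalone sketch below is illustrative only (it is not part of this commit); it mirrors that detection logic and assumes an x86-64 GCC/Clang toolchain providing `<cpuid.h>`. The 0x20/0x40 constants are the `KMP_HW_CORE_TYPE_ATOM`/`KMP_HW_CORE_TYPE_CORE` values added in `kmp.h`.

```cpp
// Illustrative sketch only (not from this commit): reads CPUID leaf 0x1A the
// same way the new __kmp_get_hybrid_info() in kmp_affinity.cpp does.
#include <cpuid.h>
#include <cstdio>

int main() {
  unsigned eax, ebx, ecx, edx;
  // CPUID.07H:EDX[15] is the hybrid-CPU flag (assumed standard encoding; the
  // runtime caches the equivalent bit in __kmp_cpuinfo.flags.hybrid).
  if (!__get_cpuid_count(0x07, 0, &eax, &ebx, &ecx, &edx) ||
      !(edx & (1u << 15))) {
    std::puts("not a hybrid CPU");
    return 0;
  }
  __get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx);
  unsigned core_type = (eax >> 24) & 0xff;   // __kmp_extract_bits<24, 31>(eax)
  unsigned native_model_id = eax & 0xffffff; // __kmp_extract_bits<0, 23>(eax)
  const char *name = core_type == 0x20   ? "Intel Atom (efficiency core)"
                     : core_type == 0x40 ? "Intel Core (performance core)"
                                         : "unknown";
  std::printf("core type 0x%02x (%s), native model id 0x%x\n", core_type, name,
              native_model_id);
  return 0;
}
```

As in the runtime itself, leaf 0x1A describes the core the calling thread is currently running on, so a real caller would pin the thread to a specific CPU before querying it.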
Diffstat (limited to 'contrib/libs/cxxsupp')
40 files changed, 3980 insertions, 998 deletions
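
Beyond the topology work, the update also reworks the allocator entry points: the `__kmpc_*` wrappers are split from internal `__kmp_*` implementations and `__kmpc_aligned_alloc` is added. At the user level the corresponding OpenMP 5.1 routine is `omp_aligned_alloc`; the sketch below is a hedged illustration (not taken from this commit) and assumes a compiler and libomp recent enough to ship the 5.1 allocator API. The link to `__kmpc_aligned_alloc` is an inference from the diff's own comment that the `__kmpc_*` routines are "external interfaces, may be used by compiler".

```cpp
// Hedged user-level sketch (not part of the commit): exercises the OpenMP 5.1
// aligned-allocation routine. Build with e.g. `clang++ -fopenmp`.
#include <omp.h>
#include <cstdint>
#include <cstdio>

int main() {
  // Request a 64-byte-aligned block of 1024 doubles from the default memory
  // space; omp_default_mem_alloc is the predefined default allocator.
  double *buf = static_cast<double *>(
      omp_aligned_alloc(64, 1024 * sizeof(double), omp_default_mem_alloc));
  if (!buf)
    return 1;
  std::printf("64-byte aligned: %s\n",
              reinterpret_cast<std::uintptr_t>(buf) % 64 == 0 ? "yes" : "no");
  omp_free(buf, omp_default_mem_alloc);
  return 0;
}
```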
diff --git a/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report b/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report index 7e4845f2b7..7fc086467b 100644 --- a/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report +++ b/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report @@ -126,6 +126,7 @@ BELONGS ya.make kmp_atomic.cpp [7:8] kmp_atomic.h [7:8] kmp_barrier.cpp [7:8] + kmp_barrier.h [7:8] kmp_cancel.cpp [4:5] kmp_config.h [6:7] kmp_csupport.cpp [7:8] @@ -193,6 +194,7 @@ BELONGS ya.make kmp_atomic.cpp [7:8] kmp_atomic.h [7:8] kmp_barrier.cpp [7:8] + kmp_barrier.h [7:8] kmp_cancel.cpp [4:5] kmp_config.h [6:7] kmp_csupport.cpp [7:8] @@ -326,6 +328,7 @@ BELONGS ya.make kmp_atomic.cpp [9:9] kmp_atomic.h [9:9] kmp_barrier.cpp [9:9] + kmp_barrier.h [9:9] kmp_cancel.cpp [6:6] kmp_config.h [8:8] kmp_csupport.cpp [9:9] @@ -393,6 +396,7 @@ BELONGS ya.make kmp_atomic.cpp [9:9] kmp_atomic.h [9:9] kmp_barrier.cpp [9:9] + kmp_barrier.h [9:9] kmp_cancel.cpp [6:6] kmp_config.h [8:8] kmp_csupport.cpp [9:9] diff --git a/contrib/libs/cxxsupp/openmp/exports_so.txt b/contrib/libs/cxxsupp/openmp/exports_so.txt index cb79ae72e6..ac188af310 100644 --- a/contrib/libs/cxxsupp/openmp/exports_so.txt +++ b/contrib/libs/cxxsupp/openmp/exports_so.txt @@ -120,5 +120,7 @@ GOMP_4.5 { } GOMP_4.0; GOMP_5.0 { } GOMP_4.5; +GOMP_5.0.1 { +} GOMP_5.0; # end of file # diff --git a/contrib/libs/cxxsupp/openmp/kmp.h b/contrib/libs/cxxsupp/openmp/kmp.h index 0652080277..9502167474 100644 --- a/contrib/libs/cxxsupp/openmp/kmp.h +++ b/contrib/libs/cxxsupp/openmp/kmp.h @@ -115,6 +115,7 @@ typedef unsigned int kmp_hwloc_depth_t; #include "kmp_debug.h" #include "kmp_lock.h" #include "kmp_version.h" +#include "kmp_barrier.h" #if USE_DEBUGGER #error #include "kmp_debugger.h" #endif @@ -263,6 +264,7 @@ typedef union kmp_root kmp_root_p; template <bool C = false, bool S = true> class kmp_flag_32; template <bool C = false, bool S = true> class kmp_flag_64; +template <bool C = false, bool S = true> class kmp_atomic_flag_64; class kmp_flag_oncore; #ifdef __cplusplus @@ -616,6 +618,19 @@ enum kmp_hw_t : int { KMP_HW_LAST }; +typedef enum kmp_hw_core_type_t { + KMP_HW_CORE_TYPE_UNKNOWN = 0x0, +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + KMP_HW_CORE_TYPE_ATOM = 0x20, + KMP_HW_CORE_TYPE_CORE = 0x40, + KMP_HW_MAX_NUM_CORE_TYPES = 3, +#else + KMP_HW_MAX_NUM_CORE_TYPES = 1, +#endif +} kmp_hw_core_type_t; + +#define KMP_HW_MAX_NUM_CORE_EFFS 8 + #define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type) \ KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST) #define KMP_ASSERT_VALID_HW_TYPE(type) \ @@ -627,6 +642,7 @@ enum kmp_hw_t : int { const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false); const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false); +const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type); /* Only Linux* OS and Windows* OS support thread affinity. 
*/ #if KMP_AFFINITY_SUPPORTED @@ -847,6 +863,7 @@ typedef struct kmp_nested_proc_bind_t { } kmp_nested_proc_bind_t; extern kmp_nested_proc_bind_t __kmp_nested_proc_bind; +extern kmp_proc_bind_t __kmp_teams_proc_bind; extern int __kmp_display_affinity; extern char *__kmp_affinity_format; @@ -987,7 +1004,7 @@ typedef omp_memspace_handle_t kmp_memspace_t; // placeholder typedef struct kmp_allocator_t { omp_memspace_handle_t memspace; void **memkind; // pointer to memkind - int alignment; + size_t alignment; omp_alloctrait_value_t fb; kmp_allocator_t *fb_data; kmp_uint64 pool_size; @@ -1001,13 +1018,25 @@ extern omp_allocator_handle_t __kmpc_init_allocator(int gtid, extern void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t al); extern void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t al); extern omp_allocator_handle_t __kmpc_get_default_allocator(int gtid); +// external interfaces, may be used by compiler extern void *__kmpc_alloc(int gtid, size_t sz, omp_allocator_handle_t al); +extern void *__kmpc_aligned_alloc(int gtid, size_t align, size_t sz, + omp_allocator_handle_t al); extern void *__kmpc_calloc(int gtid, size_t nmemb, size_t sz, omp_allocator_handle_t al); extern void *__kmpc_realloc(int gtid, void *ptr, size_t sz, omp_allocator_handle_t al, omp_allocator_handle_t free_al); extern void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al); +// internal interfaces, contain real implementation +extern void *__kmp_alloc(int gtid, size_t align, size_t sz, + omp_allocator_handle_t al); +extern void *__kmp_calloc(int gtid, size_t align, size_t nmemb, size_t sz, + omp_allocator_handle_t al); +extern void *__kmp_realloc(int gtid, void *ptr, size_t sz, + omp_allocator_handle_t al, + omp_allocator_handle_t free_al); +extern void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al); extern void __kmp_init_memkind(); extern void __kmp_fini_memkind(); @@ -1066,7 +1095,9 @@ extern void __kmp_init_target_mem(); #define KMP_MIN_BLOCKTIME (0) #define KMP_MAX_BLOCKTIME \ (INT_MAX) /* Must be this for "infinite" setting the work */ -#define KMP_DEFAULT_BLOCKTIME (200) /* __kmp_blocktime is in milliseconds */ + +/* __kmp_blocktime is in milliseconds */ +#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200)) #if KMP_USE_MONITOR #define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(64 * 1024)) @@ -1204,6 +1235,13 @@ typedef struct kmp_cpuid { kmp_uint32 edx; } kmp_cpuid_t; +typedef struct kmp_cpuinfo_flags_t { + unsigned sse2 : 1; // 0 if SSE2 instructions are not supported, 1 otherwise. + unsigned rtm : 1; // 0 if RTM instructions are not supported, 1 otherwise. + unsigned hybrid : 1; + unsigned reserved : 29; // Ensure size of 32 bits +} kmp_cpuinfo_flags_t; + typedef struct kmp_cpuinfo { int initialized; // If 0, other fields are not initialized. int signature; // CPUID(1).EAX @@ -1211,8 +1249,7 @@ typedef struct kmp_cpuinfo { int model; // ( CPUID(1).EAX[19:16] << 4 ) + CPUID(1).EAX[7:4] ( ( Extended // Model << 4 ) + Model) int stepping; // CPUID(1).EAX[3:0] ( Stepping ) - int sse2; // 0 if SSE2 instructions are not supported, 1 otherwise. - int rtm; // 0 if RTM instructions are not supported, 1 otherwise. 
+ kmp_cpuinfo_flags_t flags; int apic_id; int physical_id; int logical_id; @@ -1278,6 +1315,82 @@ static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); } #define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */ +// User-level Monitor/Mwait +#if KMP_HAVE_UMWAIT +// We always try for UMWAIT first +#if KMP_HAVE_WAITPKG_INTRINSICS +#if KMP_HAVE_IMMINTRIN_H +#include <immintrin.h> +#elif KMP_HAVE_INTRIN_H +#include <intrin.h> +#endif +#endif // KMP_HAVE_WAITPKG_INTRINSICS + +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline int __kmp_tpause(uint32_t hint, uint64_t counter) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _tpause(hint, counter); +#endif +} +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline void __kmp_umonitor(void *cacheline) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 " + : + : "a"(cacheline) + :); +#else + _umonitor(cacheline); +#endif +} +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline int __kmp_umwait(uint32_t hint, uint64_t counter) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _umwait(hint, counter); +#endif +} +#elif KMP_HAVE_MWAIT +#if KMP_OS_UNIX +#include <pmmintrin.h> +#else +#include <intrin.h> +#endif +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void +__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) { + _mm_monitor(cacheline, extensions, hints); +} +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void +__kmp_mm_mwait(unsigned extensions, unsigned hints) { + _mm_mwait(extensions, hints); +} +#endif // KMP_HAVE_UMWAIT + #if KMP_ARCH_X86 extern void __kmp_x86_pause(void); #elif KMP_MIC @@ -1307,6 +1420,9 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); } #define KMP_INIT_YIELD(count) \ { (count) = __kmp_yield_init; } +#define KMP_INIT_BACKOFF(time) \ + { (time) = __kmp_pause_init; } + #define KMP_OVERSUBSCRIBED \ (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) @@ -1344,7 +1460,36 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); } } \ } -#define KMP_YIELD_OVERSUB_ELSE_SPIN(count) \ +// If TPAUSE is available & enabled, use it. If oversubscribed, use the slower +// (C0.2) state, which improves performance of other SMT threads on the same +// core, otherwise, use the fast (C0.1) default state, or whatever the user has +// requested. Uses a timed TPAUSE, and exponential backoff. If TPAUSE isn't +// available, fall back to the regular CPU pause and yield combination. 
+#if KMP_HAVE_UMWAIT +#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \ + { \ + if (__kmp_tpause_enabled) { \ + if (KMP_OVERSUBSCRIBED) { \ + __kmp_tpause(0, (time)); \ + } else { \ + __kmp_tpause(__kmp_tpause_hint, (time)); \ + } \ + (time) *= 2; \ + } else { \ + KMP_CPU_PAUSE(); \ + if ((KMP_TRY_YIELD_OVERSUB)) { \ + __kmp_yield(); \ + } else if (__kmp_use_yield == 1) { \ + (count) -= 2; \ + if (!(count)) { \ + __kmp_yield(); \ + (count) = __kmp_yield_next; \ + } \ + } \ + } \ + } +#else +#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \ { \ KMP_CPU_PAUSE(); \ if ((KMP_TRY_YIELD_OVERSUB)) \ @@ -1357,86 +1502,14 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); } } \ } \ } - -// User-level Monitor/Mwait -#if KMP_HAVE_UMWAIT -// We always try for UMWAIT first -#if KMP_HAVE_WAITPKG_INTRINSICS -#if KMP_HAVE_IMMINTRIN_H -#include <immintrin.h> -#elif KMP_HAVE_INTRIN_H -#include <intrin.h> -#endif -#endif // KMP_HAVE_WAITPKG_INTRINSICS -KMP_ATTRIBUTE_TARGET_WAITPKG -static inline int __kmp_tpause(uint32_t hint, uint64_t counter) { -#if !KMP_HAVE_WAITPKG_INTRINSICS - uint32_t timeHi = uint32_t(counter >> 32); - uint32_t timeLo = uint32_t(counter & 0xffffffff); - char flag; - __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n" - "setb %0" - : "=r"(flag) - : "a"(timeLo), "d"(timeHi), "c"(hint) - :); - return flag; -#else - return _tpause(hint, counter); -#endif -} -KMP_ATTRIBUTE_TARGET_WAITPKG -static inline void __kmp_umonitor(void *cacheline) { -#if !KMP_HAVE_WAITPKG_INTRINSICS - __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 " - : - : "a"(cacheline) - :); -#else - _umonitor(cacheline); -#endif -} -KMP_ATTRIBUTE_TARGET_WAITPKG -static inline int __kmp_umwait(uint32_t hint, uint64_t counter) { -#if !KMP_HAVE_WAITPKG_INTRINSICS - uint32_t timeHi = uint32_t(counter >> 32); - uint32_t timeLo = uint32_t(counter & 0xffffffff); - char flag; - __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n" - "setb %0" - : "=r"(flag) - : "a"(timeLo), "d"(timeHi), "c"(hint) - :); - return flag; -#else - return _umwait(hint, counter); -#endif -} -#elif KMP_HAVE_MWAIT -#if KMP_OS_UNIX -#include <pmmintrin.h> -#else -#include <intrin.h> -#endif -#if KMP_OS_UNIX -__attribute__((target("sse3"))) -#endif -static inline void -__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) { - _mm_monitor(cacheline, extensions, hints); -} -#if KMP_OS_UNIX -__attribute__((target("sse3"))) -#endif -static inline void -__kmp_mm_mwait(unsigned extensions, unsigned hints) { - _mm_mwait(extensions, hints); -} #endif // KMP_HAVE_UMWAIT /* ------------------------------------------------------------------------ */ /* Support datatypes for the orphaned construct nesting checks. */ /* ------------------------------------------------------------------------ */ +/* When adding to this enum, add its corresponding string in cons_text_c[] + * array in kmp_error.cpp */ enum cons_type { ct_none, ct_parallel, @@ -1879,6 +1952,15 @@ typedef struct kmp_disp { 0 // Thread th_reap_state: not safe to reap (tasking) #define KMP_SAFE_TO_REAP 1 // Thread th_reap_state: safe to reap (not tasking) +// The flag_type describes the storage used for the flag. 
+enum flag_type { + flag32, /**< atomic 32 bit flags */ + flag64, /**< 64 bit flags */ + atomic_flag64, /**< atomic 64 bit flags */ + flag_oncore, /**< special 64-bit flag for on-core barrier (hierarchical) */ + flag_unset +}; + enum barrier_type { bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction barriers if enabled) */ @@ -1902,6 +1984,7 @@ typedef enum kmp_bar_pat { /* Barrier communication patterns */ bp_hyper_bar = 2, /* Hypercube-embedded tree with min branching factor 2^n */ bp_hierarchical_bar = 3, /* Machine hierarchy tree */ + bp_dist_bar = 4, /* Distributed barrier */ bp_last_bar /* Placeholder to mark the end */ } kmp_bar_pat_e; @@ -2241,22 +2324,26 @@ typedef union kmp_depnode kmp_depnode_t; typedef struct kmp_depnode_list kmp_depnode_list_t; typedef struct kmp_dephash_entry kmp_dephash_entry_t; +// macros for checking dep flag as an integer #define KMP_DEP_IN 0x1 #define KMP_DEP_OUT 0x2 #define KMP_DEP_INOUT 0x3 #define KMP_DEP_MTX 0x4 #define KMP_DEP_SET 0x8 +#define KMP_DEP_ALL 0x80 // Compiler sends us this info: typedef struct kmp_depend_info { kmp_intptr_t base_addr; size_t len; union { - kmp_uint8 flag; - struct { + kmp_uint8 flag; // flag as an unsigned char + struct { // flag as a set of 8 bits unsigned in : 1; unsigned out : 1; unsigned mtx : 1; unsigned set : 1; + unsigned unused : 3; + unsigned all : 1; } flags; }; } kmp_depend_info_t; @@ -2302,6 +2389,7 @@ struct kmp_dephash_entry { typedef struct kmp_dephash { kmp_dephash_entry_t **buckets; size_t size; + kmp_depnode_t *last_all; size_t generation; kmp_uint32 nelements; kmp_uint32 nconflicts; @@ -2409,13 +2497,6 @@ struct kmp_taskdata { /* aligned during dynamic allocation */ kmp_depnode_t *td_depnode; // Pointer to graph node if this task has dependencies kmp_task_team_t *td_task_team; - // The global thread id of the encountering thread. We need it because when a - // regular task depends on a hidden helper task, and the hidden helper task - // is finished on a hidden helper thread, it will call __kmp_release_deps to - // release all dependences. If now the task is a regular task, we need to pass - // the encountering gtid such that the task will be picked up and executed by - // its encountering team instead of hidden helper team. - kmp_int32 encountering_gtid; size_t td_size_alloc; // Size of task structure, including shareds etc. #if defined(KMP_GOMP_COMPAT) // 4 or 8 byte integers for the loop bounds in GOMP_taskloop @@ -2626,6 +2707,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { /* while awaiting queuing lock acquire */ volatile void *th_sleep_loc; // this points at a kmp_flag<T> + flag_type th_sleep_loc_type; // enum type of flag stored in th_sleep_loc ident_t *th_ident; unsigned th_x; // Random number generator data @@ -2646,6 +2728,9 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { written by the worker thread) */ kmp_uint8 th_active_in_pool; // included in count of #active threads in pool int th_active; // ! 
sleeping; 32 bits for TCR/TCW + std::atomic<kmp_uint32> th_used_in_team; // Flag indicating use in team + // 0 = not used in team; 1 = used in team; + // 2 = transitioning to not used in team; 3 = transitioning to used in team struct cons_header *th_cons; // used for consistency check #if KMP_USE_HIER_SCHED // used for hierarchical scheduling @@ -2825,6 +2910,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team { #if USE_ITT_BUILD void *t_stack_id; // team specific stack stitching id (for ittnotify) #endif /* USE_ITT_BUILD */ + distributedBarrier *b; // Distributed barrier data associated with team } kmp_base_team_t; union KMP_ALIGN_CACHE kmp_team { @@ -2949,6 +3035,9 @@ extern int __kmp_storage_map_verbose_specified; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 extern kmp_cpuinfo_t __kmp_cpuinfo; +static inline bool __kmp_is_hybrid_cpu() { return __kmp_cpuinfo.flags.hybrid; } +#else +static inline bool __kmp_is_hybrid_cpu() { return false; } #endif extern volatile int __kmp_init_serial; @@ -3033,6 +3122,7 @@ extern kmp_int32 __kmp_use_yield; extern kmp_int32 __kmp_use_yield_exp_set; extern kmp_uint32 __kmp_yield_init; extern kmp_uint32 __kmp_yield_next; +extern kmp_uint64 __kmp_pause_init; /* ------------------------------------------------------------------------- */ extern int __kmp_allThreadsSpecified; @@ -3235,6 +3325,13 @@ extern int __kmp_mwait_enabled; // Runtime check if ring3 mwait is enabled extern int __kmp_mwait_hints; // Hints to pass in to mwait #endif +#if KMP_HAVE_UMWAIT +extern int __kmp_waitpkg_enabled; // Runtime check if waitpkg exists +extern int __kmp_tpause_state; // 0 (default), 1=C0.1, 2=C0.2; from KMP_TPAUSE +extern int __kmp_tpause_hint; // 1=C0.1 (default), 0=C0.2; from KMP_TPAUSE +extern int __kmp_tpause_enabled; // 0 (default), 1 (KMP_TPAUSE is non-zero) +#endif + /* ------------------------------------------------------------------------- */ extern kmp_global_t __kmp_global; /* global status */ @@ -4118,6 +4215,10 @@ typedef enum kmp_severity_t { } kmp_severity_t; extern void __kmpc_error(ident_t *loc, int severity, const char *message); +// Support for scope directive +KMP_EXPORT void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved); +KMP_EXPORT void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved); + #ifdef __cplusplus } #endif @@ -4126,18 +4227,26 @@ template <bool C, bool S> extern void __kmp_suspend_32(int th_gtid, kmp_flag_32<C, S> *flag); template <bool C, bool S> extern void __kmp_suspend_64(int th_gtid, kmp_flag_64<C, S> *flag); +template <bool C, bool S> +extern void __kmp_atomic_suspend_64(int th_gtid, + kmp_atomic_flag_64<C, S> *flag); extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag); #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT template <bool C, bool S> extern void __kmp_mwait_32(int th_gtid, kmp_flag_32<C, S> *flag); template <bool C, bool S> extern void __kmp_mwait_64(int th_gtid, kmp_flag_64<C, S> *flag); +template <bool C, bool S> +extern void __kmp_atomic_mwait_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag); extern void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag); #endif template <bool C, bool S> extern void __kmp_resume_32(int target_gtid, kmp_flag_32<C, S> *flag); template <bool C, bool S> extern void __kmp_resume_64(int target_gtid, kmp_flag_64<C, S> *flag); +template <bool C, bool S> +extern void __kmp_atomic_resume_64(int target_gtid, + kmp_atomic_flag_64<C, S> *flag); extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag); template <bool C, bool S> @@ -4156,6 +4265,14 @@ int 
__kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, void *itt_sync_obj, #endif /* USE_ITT_BUILD */ kmp_int32 is_constrained); +template <bool C, bool S> +int __kmp_atomic_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, + kmp_atomic_flag_64<C, S> *flag, + int final_spin, int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, int *thread_finished, @@ -4213,6 +4330,15 @@ public: } } } + /// Instead of erroring out, return non-zero when + /// unsuccessful fopen() for any reason + int try_open(const char *filename, const char *mode) { + KMP_ASSERT(!f); + f = fopen(filename, mode); + if (!f) + return errno; + return 0; + } /// Set the FILE* object to stdout and output there /// No open call should happen before this call. void set_stdout() { diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp index 8b40bd7ecd..414a27fb05 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp @@ -26,6 +26,7 @@ #define HWLOC_GROUP_KIND_INTEL_DIE 104 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 #endif +#include <ctype.h> // The machine topology kmp_topology_t *__kmp_topology = nullptr; @@ -123,6 +124,20 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { return ((plural) ? "unknowns" : "unknown"); } +const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "Intel Atom(R) processor"; + case KMP_HW_CORE_TYPE_CORE: + return "Intel(R) Core(TM) processor"; +#endif + } + return "unknown"; +} + //////////////////////////////////////////////////////////////////////////////// // kmp_hw_thread_t methods int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { @@ -174,20 +189,94 @@ void kmp_hw_thread_t::print() const { for (int i = 0; i < depth; ++i) { printf("%4d ", ids[i]); } + if (attrs) { + if (attrs.is_core_type_valid()) + printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type())); + if (attrs.is_core_eff_valid()) + printf(" (eff=%d)", attrs.get_core_eff()); + } printf("\n"); } //////////////////////////////////////////////////////////////////////////////// // kmp_topology_t methods +// Add a layer to the topology based on the ids. 
Assume the topology +// is perfectly nested (i.e., so no object has more than one parent) +void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) { + // Figure out where the layer should go by comparing the ids of the current + // layers with the new ids + int target_layer; + int previous_id = kmp_hw_thread_t::UNKNOWN_ID; + int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID; + + // Start from the highest layer and work down to find target layer + // If new layer is equal to another layer then put the new layer above + for (target_layer = 0; target_layer < depth; ++target_layer) { + bool layers_equal = true; + bool strictly_above_target_layer = false; + for (int i = 0; i < num_hw_threads; ++i) { + int id = hw_threads[i].ids[target_layer]; + int new_id = ids[i]; + if (id != previous_id && new_id == previous_new_id) { + // Found the layer we are strictly above + strictly_above_target_layer = true; + layers_equal = false; + break; + } else if (id == previous_id && new_id != previous_new_id) { + // Found a layer we are below. Move to next layer and check. + layers_equal = false; + break; + } + previous_id = id; + previous_new_id = new_id; + } + if (strictly_above_target_layer || layers_equal) + break; + } + + // Found the layer we are above. Now move everything to accommodate the new + // layer. And put the new ids and type into the topology. + for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + types[j] = types[i]; + types[target_layer] = type; + for (int k = 0; k < num_hw_threads; ++k) { + for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + hw_threads[k].ids[j] = hw_threads[k].ids[i]; + hw_threads[k].ids[target_layer] = ids[k]; + } + equivalent[type] = type; + depth++; +} + +#if KMP_GROUP_AFFINITY +// Insert the Windows Processor Group structure into the topology +void kmp_topology_t::_insert_windows_proc_groups() { + // Do not insert the processor group structure for a single group + if (__kmp_num_proc_groups == 1) + return; + kmp_affin_mask_t *mask; + int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads); + KMP_CPU_ALLOC(mask); + for (int i = 0; i < num_hw_threads; ++i) { + KMP_CPU_ZERO(mask); + KMP_CPU_SET(hw_threads[i].os_id, mask); + ids[i] = __kmp_get_proc_group(mask); + } + KMP_CPU_FREE(mask); + _insert_layer(KMP_HW_PROC_GROUP, ids); + __kmp_free(ids); +} +#endif + // Remove layers that don't add information to the topology. 
// This is done by having the layer take on the id = UNKNOWN_ID (-1) void kmp_topology_t::_remove_radix1_layers() { int preference[KMP_HW_LAST]; int top_index1, top_index2; // Set up preference associative array - preference[KMP_HW_PROC_GROUP] = 110; - preference[KMP_HW_SOCKET] = 100; + preference[KMP_HW_SOCKET] = 110; + preference[KMP_HW_PROC_GROUP] = 100; preference[KMP_HW_CORE] = 95; preference[KMP_HW_THREAD] = 90; preference[KMP_HW_NUMA] = 85; @@ -305,6 +394,7 @@ void kmp_topology_t::_gather_enumeration_information() { count[i] = 0; ratio[i] = 0; } + int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; for (int layer = 0; layer < depth; ++layer) { @@ -320,6 +410,29 @@ void kmp_topology_t::_gather_enumeration_information() { ratio[l] = max[l]; max[l] = 1; } + // Figure out the number of different core types + // and efficiencies for hybrid CPUs + if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) { + if (hw_thread.attrs.is_core_eff_valid() && + hw_thread.attrs.core_eff >= num_core_efficiencies) { + // Because efficiencies can range from 0 to max efficiency - 1, + // the number of efficiencies is max efficiency + 1 + num_core_efficiencies = hw_thread.attrs.core_eff + 1; + } + if (hw_thread.attrs.is_core_type_valid()) { + bool found = false; + for (int j = 0; j < num_core_types; ++j) { + if (hw_thread.attrs.get_core_type() == core_types[j]) { + found = true; + break; + } + } + if (!found) { + KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES); + core_types[num_core_types++] = hw_thread.attrs.get_core_type(); + } + } + } break; } } @@ -333,6 +446,42 @@ void kmp_topology_t::_gather_enumeration_information() { } } +int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr, + int above_level, + bool find_all) const { + int current, current_max; + int previous_id[KMP_HW_LAST]; + for (int i = 0; i < depth; ++i) + previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; + int core_level = get_level(KMP_HW_CORE); + if (find_all) + above_level = -1; + KMP_ASSERT(above_level < core_level); + current_max = 0; + current = 0; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) { + if (current > current_max) + current_max = current; + current = hw_thread.attrs.contains(attr); + } else { + for (int level = above_level + 1; level <= core_level; ++level) { + if (hw_thread.ids[level] != previous_id[level]) { + if (hw_thread.attrs.contains(attr)) + current++; + break; + } + } + } + for (int level = 0; level < depth; ++level) + previous_id[level] = hw_thread.ids[level]; + } + if (current > current_max) + current_max = current; + return current_max; +} + // Find out if the topology is uniform void kmp_topology_t::_discover_uniformity() { int num = 1; @@ -406,7 +555,7 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, kmp_topology_t *retval; // Allocate all data in one large allocation size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc + - sizeof(int) * ndepth * 3; + sizeof(int) * (size_t)KMP_HW_LAST * 3; char *bytes = (char *)__kmp_allocate(size); retval = (kmp_topology_t *)bytes; if (nproc > 0) { @@ -419,8 +568,12 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, int *arr = (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc); retval->types = (kmp_hw_t *)arr; - retval->ratio = arr + ndepth; - retval->count = arr + 2 * ndepth; + 
retval->ratio = arr + (size_t)KMP_HW_LAST; + retval->count = arr + 2 * (size_t)KMP_HW_LAST; + retval->num_core_efficiencies = 0; + retval->num_core_types = 0; + for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) + retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN; KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; } for (int i = 0; i < ndepth; ++i) { retval->types[i] = types[i]; @@ -478,6 +631,13 @@ void kmp_topology_t::dump() const { } printf("\n"); + printf("* num_core_eff: %d\n", num_core_efficiencies); + printf("* num_core_types: %d\n", num_core_types); + printf("* core_types: "); + for (int i = 0; i < num_core_types; ++i) + printf("%3d ", core_types[i]); + printf("\n"); + printf("* equivalent map:\n"); KMP_FOREACH_HW_TYPE(i) { const char *key = __kmp_hw_get_keyword(i); @@ -571,6 +731,29 @@ void kmp_topology_t::print(const char *env_var) const { } KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); + // Hybrid topology information + if (__kmp_is_hybrid_cpu()) { + for (int i = 0; i < num_core_types; ++i) { + kmp_hw_core_type_t core_type = core_types[i]; + kmp_hw_attr_t attr; + attr.clear(); + attr.set_core_type(core_type); + int ncores = get_ncores_with_attr(attr); + if (ncores > 0) { + KMP_INFORM(TopologyHybrid, env_var, ncores, + __kmp_hw_get_core_type_string(core_type)); + KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS) + for (int eff = 0; eff < num_core_efficiencies; ++eff) { + attr.set_core_eff(eff); + int ncores_with_eff = get_ncores_with_attr(attr); + if (ncores_with_eff > 0) { + KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff); + } + } + } + } + } + if (num_hw_threads <= 0) { __kmp_str_buf_free(&buf); return; @@ -585,6 +768,10 @@ void kmp_topology_t::print(const char *env_var) const { __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); } + if (__kmp_is_hybrid_cpu()) + __kmp_str_buf_print( + &buf, "(%s)", + __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type())); KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); } @@ -592,6 +779,9 @@ void kmp_topology_t::print(const char *env_var) const { } void kmp_topology_t::canonicalize() { +#if KMP_GROUP_AFFINITY + _insert_windows_proc_groups(); +#endif _remove_radix1_layers(); _gather_enumeration_information(); _discover_uniformity(); @@ -640,6 +830,25 @@ void kmp_topology_t::canonicalize() { __kmp_hw_get_catalog_string(gran_type)); __kmp_affinity_gran = gran_type; } +#if KMP_GROUP_AFFINITY + // If more than one processor group exists, and the level of + // granularity specified by the user is too coarse, then the + // granularity must be adjusted "down" to processor group affinity + // because threads can only exist within one processor group. + // For example, if a user sets granularity=socket and there are two + // processor groups that cover a socket, then the runtime must + // restrict the granularity down to the processor group level. 
+ if (__kmp_num_proc_groups > 1) { + int gran_depth = __kmp_topology->get_level(gran_type); + int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP); + if (gran_depth >= 0 && proc_group_depth >= 0 && + gran_depth < proc_group_depth) { + KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY", + __kmp_hw_get_catalog_string(__kmp_affinity_gran)); + __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP; + } + } +#endif __kmp_affinity_gran_levels = 0; for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) __kmp_affinity_gran_levels++; @@ -673,6 +882,56 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, _discover_uniformity(); } +// Represents running sub IDs for a single core attribute where +// attribute values have SIZE possibilities. +template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t { + int last_level; // last level in topology to consider for sub_ids + int sub_id[SIZE]; // The sub ID for a given attribute value + int prev_sub_id[KMP_HW_LAST]; + IndexFunc indexer; + +public: + kmp_sub_ids_t(int last_level) : last_level(last_level) { + KMP_ASSERT(last_level < KMP_HW_LAST); + for (size_t i = 0; i < SIZE; ++i) + sub_id[i] = -1; + for (size_t i = 0; i < KMP_HW_LAST; ++i) + prev_sub_id[i] = -1; + } + void update(const kmp_hw_thread_t &hw_thread) { + int idx = indexer(hw_thread); + KMP_ASSERT(idx < (int)SIZE); + for (int level = 0; level <= last_level; ++level) { + if (hw_thread.sub_ids[level] != prev_sub_id[level]) { + if (level < last_level) + sub_id[idx] = -1; + sub_id[idx]++; + break; + } + } + for (int level = 0; level <= last_level; ++level) + prev_sub_id[level] = hw_thread.sub_ids[level]; + } + int get_sub_id(const kmp_hw_thread_t &hw_thread) const { + return sub_id[indexer(hw_thread)]; + } +}; + +static kmp_str_buf_t * +__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, + bool plural) { + __kmp_str_buf_init(buf); + if (attr.is_core_type_valid()) + __kmp_str_buf_print(buf, "%s %s", + __kmp_hw_get_core_type_string(attr.get_core_type()), + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural)); + else + __kmp_str_buf_print(buf, "%s eff=%d", + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural), + attr.get_core_eff()); + return buf; +} + // Apply the KMP_HW_SUBSET envirable to the topology // Returns true if KMP_HW_SUBSET filtered any processors // otherwise, returns false @@ -681,18 +940,27 @@ bool kmp_topology_t::filter_hw_subset() { if (!__kmp_hw_subset) return false; + // First, sort the KMP_HW_SUBSET items by the machine topology + __kmp_hw_subset->sort(); + // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology + bool using_core_types = false; + bool using_core_effs = false; int hw_subset_depth = __kmp_hw_subset->get_depth(); kmp_hw_t specified[KMP_HW_LAST]; + int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth); KMP_ASSERT(hw_subset_depth > 0); KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; } + int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < hw_subset_depth; ++i) { int max_count; - int num = __kmp_hw_subset->at(i).num; - int offset = __kmp_hw_subset->at(i).offset; - kmp_hw_t type = __kmp_hw_subset->at(i).type; + const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i); + int num = item.num[0]; + int offset = item.offset[0]; + kmp_hw_t type = item.type; kmp_hw_t equivalent_type = equivalent[type]; int level = get_level(type); + topology_levels[i] = level; // Check to see if current layer is in detected machine topology if (equivalent_type != 
KMP_HW_UNKNOWN) { @@ -703,8 +971,8 @@ bool kmp_topology_t::filter_hw_subset() { return false; } - // Check to see if current layer has already been specified - // either directly or through an equivalent type + // Check to see if current layer has already been + // specified either directly or through an equivalent type if (specified[equivalent_type] != KMP_HW_UNKNOWN) { KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), __kmp_hw_get_catalog_string(specified[equivalent_type])); @@ -712,66 +980,247 @@ bool kmp_topology_t::filter_hw_subset() { } specified[equivalent_type] = type; - // Check to see if layers are in order - if (i + 1 < hw_subset_depth) { - kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type); - if (next_type == KMP_HW_UNKNOWN) { - KMP_WARNING( - AffHWSubsetNotExistGeneric, - __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type)); - return false; - } - int next_topology_level = get_level(next_type); - if (level > next_topology_level) { - KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type), - __kmp_hw_get_catalog_string(next_type)); - return false; - } - } - // Check to see if each layer's num & offset parameters are valid max_count = get_ratio(level); - if (max_count < 0 || num + offset > max_count) { + if (max_count < 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { bool plural = (num > 1); KMP_WARNING(AffHWSubsetManyGeneric, __kmp_hw_get_catalog_string(type, plural)); return false; } + + // Check to see if core attributes are consistent + if (core_level == level) { + // Determine which core attributes are specified + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_type_valid()) + using_core_types = true; + if (item.attr[j].is_core_eff_valid()) + using_core_effs = true; + } + + // Check if using a single core attribute on non-hybrid arch. + // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute. + // + // Check if using multiple core attributes on non-hyrbid arch. + // Ignore all of KMP_HW_SUBSET if this is the case. 
+ if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) { + if (item.num_attrs == 1) { + if (using_core_effs) { + KMP_WARNING(AffHWSubsetIgnoringAttr, "efficiency"); + } else { + KMP_WARNING(AffHWSubsetIgnoringAttr, "core_type"); + } + using_core_effs = false; + using_core_types = false; + } else { + KMP_WARNING(AffHWSubsetAttrsNonHybrid); + return false; + } + } + + // Check if using both core types and core efficiencies together + if (using_core_types && using_core_effs) { + KMP_WARNING(AffHWSubsetIncompat, "core_type", "efficiency"); + return false; + } + + // Check that core efficiency values are valid + if (using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_eff_valid()) { + int core_eff = item.attr[j].get_core_eff(); + if (core_eff < 0 || core_eff >= num_core_efficiencies) { + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff()); + __kmp_msg(kmp_ms_warning, + KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str), + KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1), + __kmp_msg_null); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + // Check that the number of requested cores with attributes is valid + if (using_core_types || using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + int num = item.num[j]; + int offset = item.offset[j]; + int level_above = core_level - 1; + if (level_above >= 0) { + max_count = get_ncores_with_attr_per(item.attr[j], level_above); + if (max_count <= 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0); + KMP_WARNING(AffHWSubsetManyGeneric, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + if ((using_core_types || using_core_effs) && item.num_attrs > 1) { + for (int j = 0; j < item.num_attrs; ++j) { + // Ambiguous use of specific core attribute + generic core + // e.g., 4c & 3c:intel_core or 4c & 3c:eff1 + if (!item.attr[j]) { + kmp_hw_attr_t other_attr; + for (int k = 0; k < item.num_attrs; ++k) { + if (item.attr[k] != item.attr[j]) { + other_attr = item.attr[k]; + break; + } + } + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0); + KMP_WARNING(AffHWSubsetIncompat, + __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str); + __kmp_str_buf_free(&buf); + return false; + } + // Allow specifying a specific core type or core eff exactly once + for (int k = 0; k < j; ++k) { + if (!item.attr[j] || !item.attr[k]) + continue; + if (item.attr[k] == item.attr[j]) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, + item.num[j] > 0); + KMP_WARNING(AffHWSubsetAttrRepeat, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + } } - // Apply the filtered hardware subset - int new_index = 0; + struct core_type_indexer { + int operator()(const kmp_hw_thread_t &t) const { + switch (t.attrs.get_core_type()) { +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return 1; + case KMP_HW_CORE_TYPE_CORE: + return 2; +#endif + case KMP_HW_CORE_TYPE_UNKNOWN: + return 0; + } + KMP_ASSERT(0); + return 0; + } + }; + struct core_eff_indexer { + int operator()(const kmp_hw_thread_t &t) const { + return t.attrs.get_core_eff(); + } + }; + + kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids( + core_level); + kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids( + core_level); + + 
// Determine which hardware threads should be filtered. + int num_filtered = 0; + bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Update type_sub_id + if (using_core_types) + core_type_sub_ids.update(hw_thread); + if (using_core_effs) + core_eff_sub_ids.update(hw_thread); + // Check to see if this hardware thread should be filtered bool should_be_filtered = false; - for (int level = 0, hw_subset_index = 0; - level < depth && hw_subset_index < hw_subset_depth; ++level) { - kmp_hw_t topology_type = types[level]; - auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index); - kmp_hw_t hw_subset_type = hw_subset_item.type; - if (topology_type != hw_subset_type) + for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth; + ++hw_subset_index) { + const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index); + int level = topology_levels[hw_subset_index]; + if (level == -1) continue; - int num = hw_subset_item.num; - int offset = hw_subset_item.offset; - hw_subset_index++; - if (hw_thread.sub_ids[level] < offset || - hw_thread.sub_ids[level] >= offset + num) { - should_be_filtered = true; - break; + if ((using_core_effs || using_core_types) && level == core_level) { + // Look for the core attribute in KMP_HW_SUBSET which corresponds + // to this hardware thread's core attribute. Use this num,offset plus + // the running sub_id for the particular core attribute of this hardware + // thread to determine if the hardware thread should be filtered or not. + int attr_idx; + kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type(); + int core_eff = hw_thread.attrs.get_core_eff(); + for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) { + if (using_core_types && + hw_subset_item.attr[attr_idx].get_core_type() == core_type) + break; + if (using_core_effs && + hw_subset_item.attr[attr_idx].get_core_eff() == core_eff) + break; + } + // This core attribute isn't in the KMP_HW_SUBSET so always filter it. 
+ if (attr_idx == hw_subset_item.num_attrs) { + should_be_filtered = true; + break; + } + int sub_id; + int num = hw_subset_item.num[attr_idx]; + int offset = hw_subset_item.offset[attr_idx]; + if (using_core_types) + sub_id = core_type_sub_ids.get_sub_id(hw_thread); + else + sub_id = core_eff_sub_ids.get_sub_id(hw_thread); + if (sub_id < offset || + (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) { + should_be_filtered = true; + break; + } + } else { + int num = hw_subset_item.num[0]; + int offset = hw_subset_item.offset[0]; + if (hw_thread.sub_ids[level] < offset || + (num != kmp_hw_subset_t::USE_ALL && + hw_thread.sub_ids[level] >= offset + num)) { + should_be_filtered = true; + break; + } } } - if (!should_be_filtered) { + // Collect filtering information + filtered[i] = should_be_filtered; + if (should_be_filtered) + num_filtered++; + } + + // One last check that we shouldn't allow filtering entire machine + if (num_filtered == num_hw_threads) { + KMP_WARNING(AffHWSubsetAllFiltered); + __kmp_free(filtered); + return false; + } + + // Apply the filter + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + if (!filtered[i]) { if (i != new_index) - hw_threads[new_index] = hw_thread; + hw_threads[new_index] = hw_threads[i]; new_index++; } else { #if KMP_AFFINITY_SUPPORTED - KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask); + KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask); #endif __kmp_avail_proc--; } } + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); num_hw_threads = new_index; @@ -780,6 +1229,7 @@ bool kmp_topology_t::filter_hw_subset() { _discover_uniformity(); _set_globals(); _set_last_level_cache(); + __kmp_free(filtered); return true; } @@ -986,7 +1436,67 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, return buf; } -void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { +// Return (possibly empty) affinity mask representing the offline CPUs +// Caller must free the mask +kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { + kmp_affin_mask_t *offline; + KMP_CPU_ALLOC(offline); + KMP_CPU_ZERO(offline); +#if KMP_OS_LINUX + int n, begin_cpu, end_cpu; + kmp_safe_raii_file_t offline_file; + auto skip_ws = [](FILE *f) { + int c; + do { + c = fgetc(f); + } while (isspace(c)); + if (c != EOF) + ungetc(c, f); + }; + // File contains CSV of integer ranges representing the offline CPUs + // e.g., 1,2,4-7,9,11-15 + int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r"); + if (status != 0) + return offline; + while (!feof(offline_file)) { + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &begin_cpu); + if (n != 1) + break; + skip_ws(offline_file); + int c = fgetc(offline_file); + if (c == EOF || c == ',') { + // Just single CPU + end_cpu = begin_cpu; + } else if (c == '-') { + // Range of CPUs + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &end_cpu); + if (n != 1) + break; + skip_ws(offline_file); + c = fgetc(offline_file); // skip ',' + } else { + // Syntax problem + break; + } + // Ensure a valid range of CPUs + if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 || + end_cpu >= __kmp_xproc || begin_cpu > end_cpu) { + continue; + } + // Insert [begin_cpu, end_cpu] into offline mask + for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) { + KMP_CPU_SET(cpu, offline); + } + } +#endif + return offline; +} + +// Return the number of available procs +int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { + int avail_proc = 0; KMP_CPU_ZERO(mask); #if KMP_GROUP_AFFINITY @@ -999,6 
+1509,7 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { int num = __kmp_GetActiveProcessorCount(group); for (i = 0; i < num; i++) { KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); + avail_proc++; } } } else @@ -1007,10 +1518,18 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { { int proc; + kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus(); for (proc = 0; proc < __kmp_xproc; proc++) { + // Skip offline CPUs + if (KMP_CPU_ISSET(proc, offline_cpus)) + continue; KMP_CPU_SET(proc, mask); + avail_proc++; } + KMP_CPU_FREE(offline_cpus); } + + return avail_proc; } // All of the __kmp_affinity_create_*_map() routines should allocate the @@ -1156,6 +1675,45 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { return true; } + // Handle multiple types of cores if they exist on the system + int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0); + + typedef struct kmp_hwloc_cpukinds_info_t { + int efficiency; + kmp_hw_core_type_t core_type; + hwloc_bitmap_t mask; + } kmp_hwloc_cpukinds_info_t; + kmp_hwloc_cpukinds_info_t *cpukinds = nullptr; + + if (nr_cpu_kinds > 0) { + unsigned nr_infos; + struct hwloc_info_s *infos; + cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate( + sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds); + for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) { + cpukinds[idx].efficiency = -1; + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN; + cpukinds[idx].mask = hwloc_bitmap_alloc(); + if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask, + &cpukinds[idx].efficiency, &nr_infos, &infos, + 0) == 0) { + for (unsigned i = 0; i < nr_infos; ++i) { + if (__kmp_str_match("CoreType", 8, infos[i].name)) { +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_str_match("IntelAtom", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM; + break; + } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE; + break; + } +#endif + } + } + } + } + } + root = hwloc_get_root_obj(tp); // Figure out the depth and types in the topology @@ -1215,6 +1773,20 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hw_thread.clear(); hw_thread.ids[index] = pu->logical_index; hw_thread.os_id = pu->os_index; + // If multiple core types, then set that attribute for the hardware thread + if (cpukinds) { + int cpukind_index = -1; + for (int i = 0; i < nr_cpu_kinds; ++i) { + if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) { + cpukind_index = i; + break; + } + } + if (cpukind_index >= 0) { + hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type); + hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency); + } + } index--; } obj = pu; @@ -1258,6 +1830,13 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { if (included) hw_thread_index++; } + + // Free the core types information + if (cpukinds) { + for (int idx = 0; idx < nr_cpu_kinds; ++idx) + hwloc_bitmap_free(cpukinds[idx].mask); + __kmp_free(cpukinds); + } __kmp_topology->sort_ids(); return true; } @@ -1782,6 +2361,26 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { return true; } +// Hybrid cpu detection using CPUID.1A +// Thread should be pinned to processor already +static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency, + unsigned *native_model_id) { + kmp_cpuid buf; + __kmp_x86_cpuid(0x1a, 0, &buf); + *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 
31>(buf.eax); + switch (*type) { + case KMP_HW_CORE_TYPE_ATOM: + *efficiency = 0; + break; + case KMP_HW_CORE_TYPE_CORE: + *efficiency = 1; + break; + default: + *efficiency = 0; + } + *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); +} + // Intel(R) microarchitecture code name Nehalem, Dunnington and later // architectures support a newer interface for specifying the x2APIC Ids, // based on CPUID.B or CPUID.1F @@ -2051,6 +2650,15 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; } } + // Hybrid information + if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { + kmp_hw_core_type_t type; + unsigned native_model_id; + int efficiency; + __kmp_get_hybrid_info(&type, &efficiency, &native_model_id); + hw_thread.attrs.set_core_type(type); + hw_thread.attrs.set_core_eff(efficiency); + } hw_thread_index++; } KMP_ASSERT(hw_thread_index > 0); @@ -2386,7 +2994,10 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; - KMP_ASSERT(nodeIdIndex + level <= maxIndex); + // validate the input before using level: + if (level > (unsigned)__kmp_xproc) { // level is too big + level = __kmp_xproc; + } if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; threadInfo[num_avail][nodeIdIndex + level] = val; @@ -3497,8 +4108,8 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_affin_fullMask); KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); } - __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); - __kmp_avail_proc = __kmp_xproc; + __kmp_avail_proc = + __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); #if KMP_OS_WINDOWS // Set the process affinity mask since threads' affinity // masks must be subset of process mask in Windows* OS @@ -4145,14 +4756,19 @@ int __kmp_aux_set_affinity(void **mask) { int __kmp_aux_get_affinity(void **mask) { int gtid; int retval; +#if KMP_OS_WINDOWS || KMP_DEBUG kmp_info_t *th; - +#endif if (!KMP_AFFINITY_CAPABLE()) { return -1; } gtid = __kmp_entry_gtid(); +#if KMP_OS_WINDOWS || KMP_DEBUG th = __kmp_threads[gtid]; +#else + (void)gtid; // unused variable +#endif KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); KA_TRACE( diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.h b/contrib/libs/cxxsupp/openmp/kmp_affinity.h index 8e72922d2c..ce00362f04 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_affinity.h +++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.h @@ -15,6 +15,7 @@ #include "kmp.h" #include "kmp_os.h" +#include <limits> #if KMP_AFFINITY_SUPPORTED #if KMP_USE_HWLOC @@ -598,6 +599,63 @@ class KMPNativeAffinity : public KMPAffinity { #endif /* KMP_OS_WINDOWS */ #endif /* KMP_AFFINITY_SUPPORTED */ +// Describe an attribute for a level in the machine topology +struct kmp_hw_attr_t { + int core_type : 8; + int core_eff : 8; + unsigned valid : 1; + unsigned reserved : 15; + + static const int UNKNOWN_CORE_EFF = -1; + + kmp_hw_attr_t() + : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF), + valid(0), reserved(0) {} + void set_core_type(kmp_hw_core_type_t type) { + valid = 1; + core_type = type; + } + void set_core_eff(int eff) { + valid = 1; + core_eff = eff; + } + kmp_hw_core_type_t get_core_type() const { + return (kmp_hw_core_type_t)core_type; + } + int get_core_eff() const { return core_eff; } + bool is_core_type_valid() const { + return core_type != KMP_HW_CORE_TYPE_UNKNOWN; + } + bool is_core_eff_valid() const { return core_eff != 
UNKNOWN_CORE_EFF; } + operator bool() const { return valid; } + void clear() { + core_type = KMP_HW_CORE_TYPE_UNKNOWN; + core_eff = UNKNOWN_CORE_EFF; + valid = 0; + } + bool contains(const kmp_hw_attr_t &other) const { + if (!valid && !other.valid) + return true; + if (valid && other.valid) { + if (other.is_core_type_valid()) { + if (!is_core_type_valid() || (get_core_type() != other.get_core_type())) + return false; + } + if (other.is_core_eff_valid()) { + if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff())) + return false; + } + return true; + } + return false; + } + bool operator==(const kmp_hw_attr_t &rhs) const { + return (rhs.valid == valid && rhs.core_eff == core_eff && + rhs.core_type == core_type); + } + bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); } +}; + class kmp_hw_thread_t { public: static const int UNKNOWN_ID = -1; @@ -607,11 +665,14 @@ public: int sub_ids[KMP_HW_LAST]; bool leader; int os_id; + kmp_hw_attr_t attrs; + void print() const; void clear() { for (int i = 0; i < (int)KMP_HW_LAST; ++i) ids[i] = UNKNOWN_ID; leader = false; + attrs.clear(); } }; @@ -624,7 +685,9 @@ class kmp_topology_t { int depth; - // The following arrays are all 'depth' long + // The following arrays are all 'depth' long and have been + // allocated to hold up to KMP_HW_LAST number of objects if + // needed so layers can be added without reallocation of any array // Orderd array of the types in the topology kmp_hw_t *types; @@ -637,6 +700,12 @@ class kmp_topology_t { // Storage containing the absolute number of each topology layer int *count; + // The number of core efficiencies. This is only useful for hybrid + // topologies. Core efficiencies will range from 0 to num efficiencies - 1 + int num_core_efficiencies; + int num_core_types; + kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES]; + // The hardware threads array // hw_threads is num_hw_threads long // Each hw_thread's ids and sub_ids are depth deep @@ -652,6 +721,14 @@ class kmp_topology_t { // Flags describing the topology flags_t flags; + // Insert a new topology layer after allocation + void _insert_layer(kmp_hw_t type, const int *ids); + +#if KMP_GROUP_AFFINITY + // Insert topology information about Windows Processor groups + void _insert_windows_proc_groups(); +#endif + // Count each item & get the num x's per y // e.g., get the number of cores and the number of threads per core // for each (x, y) in (KMP_HW_* , KMP_HW_*) @@ -675,6 +752,12 @@ class kmp_topology_t { // Set the last level cache equivalent type void _set_last_level_cache(); + // Return the number of cores with a particular attribute, 'attr'. 
+ // If 'find_all' is true, then find all cores on the machine, otherwise find + // all cores per the layer 'above' + int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above, + bool find_all = false) const; + public: // Force use of allocate()/deallocate() kmp_topology_t() = delete; @@ -764,6 +847,16 @@ public: KMP_DEBUG_ASSERT(level >= 0 && level < depth); return count[level]; } + // Return the total number of cores with attribute 'attr' + int get_ncores_with_attr(const kmp_hw_attr_t &attr) const { + return _get_ncores_with_attr(attr, -1, true); + } + // Return the number of cores with attribute + // 'attr' per topology level 'above' + int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const { + return _get_ncores_with_attr(attr, above, false); + } + #if KMP_AFFINITY_SUPPORTED void sort_compact() { qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), @@ -773,14 +866,22 @@ public: void print(const char *env_var = "KMP_AFFINITY") const; void dump() const; }; +extern kmp_topology_t *__kmp_topology; class kmp_hw_subset_t { + const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS; + public: + // Describe a machine topology item in KMP_HW_SUBSET struct item_t { - int num; kmp_hw_t type; - int offset; + int num_attrs; + int num[MAX_ATTRS]; + int offset[MAX_ATTRS]; + kmp_hw_attr_t attr[MAX_ATTRS]; }; + // Put parenthesis around max to avoid accidental use of Windows max macro. + const static int USE_ALL = (std::numeric_limits<int>::max)(); private: int depth; @@ -790,6 +891,15 @@ private: bool absolute; // The set must be able to handle up to KMP_HW_LAST number of layers KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST); + // Sorting the KMP_HW_SUBSET items to follow topology order + // All unknown topology types will be at the beginning of the subset + static int hw_subset_compare(const void *i1, const void *i2) { + kmp_hw_t type1 = ((const item_t *)i1)->type; + kmp_hw_t type2 = ((const item_t *)i2)->type; + int level1 = __kmp_topology->get_level(type1); + int level2 = __kmp_topology->get_level(type2); + return level1 - level2; + } public: // Force use of allocate()/deallocate() @@ -816,7 +926,20 @@ public: } void set_absolute() { absolute = true; } bool is_absolute() const { return absolute; } - void push_back(int num, kmp_hw_t type, int offset) { + void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) { + for (int i = 0; i < depth; ++i) { + // Found an existing item for this layer type + // Add the num, offset, and attr to this item + if (items[i].type == type) { + int idx = items[i].num_attrs++; + if ((size_t)idx >= MAX_ATTRS) + return; + items[i].num[idx] = num; + items[i].offset[idx] = offset; + items[i].attr[idx] = attr; + return; + } + } if (depth == capacity - 1) { capacity *= 2; item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity); @@ -825,9 +948,11 @@ public: __kmp_free(items); items = new_items; } - items[depth].num = num; + items[depth].num_attrs = 1; items[depth].type = type; - items[depth].offset = offset; + items[depth].num[0] = num; + items[depth].offset[0] = offset; + items[depth].attr[0] = attr; depth++; set |= (1ull << type); } @@ -848,6 +973,10 @@ public: } depth--; } + void sort() { + KMP_DEBUG_ASSERT(__kmp_topology); + qsort(items, depth, sizeof(item_t), hw_subset_compare); + } bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); } void dump() const { printf("**********************\n"); @@ -855,16 +984,25 @@ public: printf("* depth: %d\n", depth); printf("* items:\n"); for (int i = 0; i 
< depth; ++i) { - printf("num: %d, type: %s, offset: %d\n", items[i].num, - __kmp_hw_get_keyword(items[i].type), items[i].offset); + printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type)); + for (int j = 0; j < items[i].num_attrs; ++j) { + printf(" num: %d, offset: %d, attr: ", items[i].num[j], + items[i].offset[j]); + if (!items[i].attr[j]) { + printf(" (none)\n"); + } else { + printf( + " core_type = %s, core_eff = %d\n", + __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()), + items[i].attr[j].get_core_eff()); + } + } } printf("* set: 0x%llx\n", set); printf("* absolute: %d\n", absolute); printf("**********************\n"); } }; - -extern kmp_topology_t *__kmp_topology; extern kmp_hw_subset_t *__kmp_hw_subset; /* A structure for holding machine-specific hierarchy info to be computed once diff --git a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp index b373353ddd..120cad17c2 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp @@ -895,7 +895,7 @@ static void bpool(kmp_info_t *th, void *buf, bufsize len) { __kmp_bget_dequeue(th); /* Release any queued buffers */ #ifdef SizeQuant - len &= ~(SizeQuant - 1); + len &= ~((bufsize)(SizeQuant - 1)); #endif if (thr->pool_len == 0) { thr->pool_len = len; @@ -1496,31 +1496,74 @@ typedef struct kmp_mem_desc { // Memory block descriptor void *ptr_align; // Pointer to aligned memory, returned kmp_allocator_t *allocator; // allocator } kmp_mem_desc_t; -static int alignment = sizeof(void *); // let's align to pointer size +static int alignment = sizeof(void *); // align to pointer size by default +// external interfaces are wrappers over internal implementation void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator)); + void *ptr = __kmp_alloc(gtid, 0, size, allocator); + KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_aligned_alloc(int gtid, size_t algn, size_t size, + omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_aligned_alloc: T#%d (%d, %d, %p)\n", gtid, (int)algn, + (int)size, allocator)); + void *ptr = __kmp_alloc(gtid, algn, size, allocator); + KE_TRACE(25, ("__kmpc_aligned_alloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_calloc(int gtid, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb, + (int)size, allocator)); + void *ptr = __kmp_calloc(gtid, 0, nmemb, size, allocator); + KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_realloc(int gtid, void *ptr, size_t size, + omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator) { + KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size, + allocator, free_allocator)); + void *nptr = __kmp_realloc(gtid, ptr, size, allocator, free_allocator); + KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid)); + return nptr; +} + +void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator)); + ___kmpc_free(gtid, ptr, allocator); + KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, ptr, allocator)); + return; +} + +// internal implementation, called from inside the library +void *__kmp_alloc(int gtid, size_t algn, size_t size, + omp_allocator_handle_t allocator) 
{ void *ptr = NULL; kmp_allocator_t *al; KMP_DEBUG_ASSERT(__kmp_init_serial); - if (size == 0) return NULL; - if (allocator == omp_null_allocator) allocator = __kmp_threads[gtid]->th.th_def_allocator; - KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator)); - al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator)); + al = RCAST(kmp_allocator_t *, allocator); int sz_desc = sizeof(kmp_mem_desc_t); kmp_mem_desc_t desc; kmp_uintptr_t addr; // address returned by allocator kmp_uintptr_t addr_align; // address to return to caller kmp_uintptr_t addr_descr; // address of memory block descriptor - int align = alignment; // default alignment - if (allocator > kmp_max_mem_alloc && al->alignment > 0) { - align = al->alignment; // alignment requested by user - } + size_t align = alignment; // default alignment + if (allocator > kmp_max_mem_alloc && al->alignment > align) + align = al->alignment; // alignment required by allocator trait + if (align < algn) + align = algn; // max of allocator trait, parameter and sizeof(void*) desc.size_orig = size; desc.size_a = size + sz_desc + align; @@ -1549,7 +1592,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; - return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); } // else ptr == NULL; } else { // pool has enough space @@ -1563,7 +1606,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; - return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); } } } @@ -1579,7 +1622,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; - return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); } } } @@ -1635,7 +1678,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; - return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); } // else ptr == NULL; } else { // pool has enough space @@ -1651,7 +1694,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { KMP_ASSERT(0); // abort fallback requested } // no sense to look for another fallback because of same internal alloc } - KE_TRACE(10, ("__kmpc_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a)); + KE_TRACE(10, ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a)); if (ptr == NULL) return NULL; @@ -1665,12 +1708,11 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { *((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents KMP_MB(); - KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", desc.ptr_align, gtid)); return desc.ptr_align; } -void *__kmpc_calloc(int gtid, size_t nmemb, size_t size, - omp_allocator_handle_t allocator) { +void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { void *ptr = NULL; kmp_allocator_t *al; KMP_DEBUG_ASSERT(__kmp_init_serial); @@ -1678,10 +1720,7 @@ void 
*__kmpc_calloc(int gtid, size_t nmemb, size_t size, if (allocator == omp_null_allocator) allocator = __kmp_threads[gtid]->th.th_def_allocator; - KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb, - (int)size, allocator)); - - al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator)); + al = RCAST(kmp_allocator_t *, allocator); if (nmemb == 0 || size == 0) return ptr; @@ -1693,31 +1732,27 @@ void *__kmpc_calloc(int gtid, size_t nmemb, size_t size, return ptr; } - ptr = __kmpc_alloc(gtid, nmemb * size, allocator); + ptr = __kmp_alloc(gtid, algn, nmemb * size, allocator); if (ptr) { memset(ptr, 0x00, nmemb * size); } - KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid)); return ptr; } -void *__kmpc_realloc(int gtid, void *ptr, size_t size, - omp_allocator_handle_t allocator, - omp_allocator_handle_t free_allocator) { +void *__kmp_realloc(int gtid, void *ptr, size_t size, + omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator) { void *nptr = NULL; KMP_DEBUG_ASSERT(__kmp_init_serial); if (size == 0) { if (ptr != NULL) - __kmpc_free(gtid, ptr, free_allocator); + ___kmpc_free(gtid, ptr, free_allocator); return nptr; } - KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size, - allocator, free_allocator)); - - nptr = __kmpc_alloc(gtid, size, allocator); + nptr = __kmp_alloc(gtid, 0, size, allocator); if (nptr != NULL && ptr != NULL) { kmp_mem_desc_t desc; @@ -1736,15 +1771,13 @@ void *__kmpc_realloc(int gtid, void *ptr, size_t size, } if (nptr != NULL) { - __kmpc_free(gtid, ptr, free_allocator); + ___kmpc_free(gtid, ptr, free_allocator); } - KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid)); return nptr; } -void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) { - KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator)); +void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) { if (ptr == NULL) return; @@ -1804,8 +1837,6 @@ void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) { } __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc); } - KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, desc.ptr_alloc, - allocator)); } /* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes @@ -1939,9 +1970,10 @@ void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) { In debug mode, fill the memory block with 0xEF before call to free(). */ void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) { kmp_mem_descr_t descr; +#if KMP_DEBUG kmp_uintptr_t addr_allocated; // Address returned by malloc(). kmp_uintptr_t addr_aligned; // Aligned address passed by caller. 
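The internal __kmp_alloc/___kmpc_free pair shown above relies on a common trick: over-allocate by the descriptor size plus the requested alignment, round the user pointer up, and stash a descriptor just below it so the free path can recover the original allocation. The following is a minimal, self-contained sketch of that general pattern, not the library's exact layout; the alloc_desc struct and function names here are illustrative only, and the alignment is assumed to be a power of two.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Illustrative header; the real runtime stores a richer kmp_mem_desc_t.
struct alloc_desc {
  void *ptr_alloc;    // pointer returned by the underlying allocator
  size_t size_alloc;  // total bytes requested from the underlying allocator
};

static void *aligned_alloc_with_desc(size_t align, size_t size) {
  const size_t sz_desc = sizeof(alloc_desc);
  size_t total = size + sz_desc + align; // mirrors desc.size_a above
  void *raw = std::malloc(total);
  if (!raw)
    return nullptr;
  uintptr_t addr = (uintptr_t)raw;
  // Round up past the descriptor to the requested (power-of-two) alignment.
  uintptr_t addr_align = (addr + sz_desc + align - 1) & ~(uintptr_t)(align - 1);
  alloc_desc *desc = (alloc_desc *)(addr_align - sz_desc);
  desc->ptr_alloc = raw;
  desc->size_alloc = total;
  return (void *)addr_align;
}

static void aligned_free_with_desc(void *ptr) {
  if (!ptr)
    return;
  alloc_desc *desc = (alloc_desc *)((uintptr_t)ptr - sizeof(alloc_desc));
  std::free(desc->ptr_alloc); // release the original allocation
}

int main() {
  void *p = aligned_alloc_with_desc(64, 1000);
  assert(((uintptr_t)p % 64) == 0);
  std::memset(p, 0, 1000);
  std::printf("aligned pointer: %p\n", p);
  aligned_free_with_desc(p);
  return 0;
}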
- +#endif KE_TRACE(25, ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM)); KMP_ASSERT(ptr != NULL); @@ -1953,18 +1985,15 @@ void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) { "ptr_aligned=%p, size_aligned=%d\n", descr.ptr_allocated, (int)descr.size_allocated, descr.ptr_aligned, (int)descr.size_aligned)); - +#if KMP_DEBUG addr_allocated = (kmp_uintptr_t)descr.ptr_allocated; addr_aligned = (kmp_uintptr_t)descr.ptr_aligned; - KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0); KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr); KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned); KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated); KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <= addr_allocated + descr.size_allocated); - -#ifdef KMP_DEBUG memset(descr.ptr_allocated, 0xEF, descr.size_allocated); // Fill memory block with 0xEF, it helps catch using freed memory. #endif diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp index fcc06216a4..0bd7b1a41a 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp @@ -732,7 +732,7 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs, #define OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) \ __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ - (*lhs) = (TYPE)((*lhs)OP((TYPE)rhs)); \ + (*lhs) = (TYPE)((*lhs)OP rhs); \ __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); // ------------------------------------------------------------------------ @@ -791,14 +791,14 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs, { \ TYPE old_value, new_value; \ old_value = *(TYPE volatile *)lhs; \ - new_value = (TYPE)(old_value OP((TYPE)rhs)); \ + new_value = (TYPE)(old_value OP rhs); \ while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ KMP_DO_PAUSE; \ \ old_value = *(TYPE volatile *)lhs; \ - new_value = (TYPE)(old_value OP((TYPE)rhs)); \ + new_value = (TYPE)(old_value OP rhs); \ } \ } @@ -1235,6 +1235,12 @@ MIN_MAX_COMPXCHG(float8, max, kmp_real64, 64, <, 8r, 7, KMP_ARCH_X86) // __kmpc_atomic_float8_max MIN_MAX_COMPXCHG(float8, min, kmp_real64, 64, >, 8r, 7, KMP_ARCH_X86) // __kmpc_atomic_float8_min +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +MIN_MAX_CRITICAL(float10, max, long double, <, 10r, + 1) // __kmpc_atomic_float10_max +MIN_MAX_CRITICAL(float10, min, long double, >, 10r, + 1) // __kmpc_atomic_float10_min +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 #if KMP_HAVE_QUAD MIN_MAX_CRITICAL(float16, max, QUAD_LEGACY, <, 16r, 1) // __kmpc_atomic_float16_max @@ -1313,6 +1319,7 @@ ATOMIC_CMPX_EQV(fixed8, eqv, kmp_int64, 64, ^~, 8i, 7, } /* ------------------------------------------------------------------------- */ +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 // routines for long double type ATOMIC_CRITICAL(float10, add, long double, +, 10r, 1) // __kmpc_atomic_float10_add @@ -1322,6 +1329,7 @@ ATOMIC_CRITICAL(float10, mul, long double, *, 10r, 1) // __kmpc_atomic_float10_mul ATOMIC_CRITICAL(float10, div, long double, /, 10r, 1) // __kmpc_atomic_float10_div +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 #if KMP_HAVE_QUAD // routines for _Quad type ATOMIC_CRITICAL(float16, add, QUAD_LEGACY, +, 16r, @@ -1367,6 +1375,7 @@ ATOMIC_CRITICAL(cmplx8, add, kmp_cmplx64, +, 16c, 1) // __kmpc_atomic_cmplx8_add ATOMIC_CRITICAL(cmplx8, sub, kmp_cmplx64, -, 16c, 1) // __kmpc_atomic_cmplx8_sub ATOMIC_CRITICAL(cmplx8, mul, kmp_cmplx64, 
*, 16c, 1) // __kmpc_atomic_cmplx8_mul ATOMIC_CRITICAL(cmplx8, div, kmp_cmplx64, /, 16c, 1) // __kmpc_atomic_cmplx8_div +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 ATOMIC_CRITICAL(cmplx10, add, kmp_cmplx80, +, 20c, 1) // __kmpc_atomic_cmplx10_add ATOMIC_CRITICAL(cmplx10, sub, kmp_cmplx80, -, 20c, @@ -1375,6 +1384,7 @@ ATOMIC_CRITICAL(cmplx10, mul, kmp_cmplx80, *, 20c, 1) // __kmpc_atomic_cmplx10_mul ATOMIC_CRITICAL(cmplx10, div, kmp_cmplx80, /, 20c, 1) // __kmpc_atomic_cmplx10_div +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 #if KMP_HAVE_QUAD ATOMIC_CRITICAL(cmplx16, add, CPLX128_LEG, +, 32c, 1) // __kmpc_atomic_cmplx16_add @@ -1793,6 +1803,7 @@ ATOMIC_CMPXCHG_MIX(float8, kmp_real64, mul, 64, *, fp, _Quad, 8r, 7, ATOMIC_CMPXCHG_MIX(float8, kmp_real64, div, 64, /, fp, _Quad, 8r, 7, KMP_ARCH_X86) // __kmpc_atomic_float8_div_fp +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 ATOMIC_CRITICAL_FP(float10, long double, add, +, fp, _Quad, 10r, 1) // __kmpc_atomic_float10_add_fp ATOMIC_CRITICAL_FP(float10, long double, sub, -, fp, _Quad, 10r, @@ -1802,7 +1813,6 @@ ATOMIC_CRITICAL_FP(float10, long double, mul, *, fp, _Quad, 10r, ATOMIC_CRITICAL_FP(float10, long double, div, /, fp, _Quad, 10r, 1) // __kmpc_atomic_float10_div_fp -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 // Reverse operations ATOMIC_CMPXCHG_REV_MIX(fixed1, char, sub_rev, 8, -, fp, _Quad, 1i, 0, KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_rev_fp @@ -2717,6 +2727,10 @@ MIN_MAX_COMPXCHG_CPT(float8, max_cpt, kmp_real64, 64, <, KMP_ARCH_X86) // __kmpc_atomic_float8_max_cpt MIN_MAX_COMPXCHG_CPT(float8, min_cpt, kmp_real64, 64, >, KMP_ARCH_X86) // __kmpc_atomic_float8_min_cpt +MIN_MAX_CRITICAL_CPT(float10, max_cpt, long double, <, 10r, + 1) // __kmpc_atomic_float10_max_cpt +MIN_MAX_CRITICAL_CPT(float10, min_cpt, long double, >, 10r, + 1) // __kmpc_atomic_float10_min_cpt #if KMP_HAVE_QUAD MIN_MAX_CRITICAL_CPT(float16, max_cpt, QUAD_LEGACY, <, 16r, 1) // __kmpc_atomic_float16_max_cpt @@ -3586,7 +3600,7 @@ void __kmpc_atomic_8(ident_t *id_ref, int gtid, void *lhs, void *rhs, __kmp_release_atomic_lock(&__kmp_atomic_lock_8i, gtid); } } - +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs, void (*f)(void *, void *, void *)) { KMP_DEBUG_ASSERT(__kmp_init_serial); @@ -3607,6 +3621,7 @@ void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs, #endif /* KMP_GOMP_COMPAT */ __kmp_release_atomic_lock(&__kmp_atomic_lock_10r, gtid); } +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs, void (*f)(void *, void *, void *)) { @@ -3628,7 +3643,7 @@ void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs, #endif /* KMP_GOMP_COMPAT */ __kmp_release_atomic_lock(&__kmp_atomic_lock_16c, gtid); } - +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs, void (*f)(void *, void *, void *)) { KMP_DEBUG_ASSERT(__kmp_init_serial); @@ -3649,7 +3664,7 @@ void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs, #endif /* KMP_GOMP_COMPAT */ __kmp_release_atomic_lock(&__kmp_atomic_lock_20c, gtid); } - +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 void __kmpc_atomic_32(ident_t *id_ref, int gtid, void *lhs, void *rhs, void (*f)(void *, void *, void *)) { KMP_DEBUG_ASSERT(__kmp_init_serial); @@ -3686,6 +3701,171 @@ void __kmpc_atomic_end(void) { __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); } +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +// OpenMP 5.1 compare and swap + +/*! 
+@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@return Result of comparison + +Implements Compare And Swap atomic operation. + +Sample code: +#pragma omp atomic compare update capture + { r = x == e; if(r) { x = d; } } +*/ +bool __kmpc_atomic_bool_1_cas(ident_t *loc, int gtid, char *x, char e, char d) { + return KMP_COMPARE_AND_STORE_ACQ8(x, e, d); +} +bool __kmpc_atomic_bool_2_cas(ident_t *loc, int gtid, short *x, short e, + short d) { + return KMP_COMPARE_AND_STORE_ACQ16(x, e, d); +} +bool __kmpc_atomic_bool_4_cas(ident_t *loc, int gtid, kmp_int32 *x, kmp_int32 e, + kmp_int32 d) { + return KMP_COMPARE_AND_STORE_ACQ32(x, e, d); +} +bool __kmpc_atomic_bool_8_cas(ident_t *loc, int gtid, kmp_int64 *x, kmp_int64 e, + kmp_int64 d) { + return KMP_COMPARE_AND_STORE_ACQ64(x, e, d); +} + +/*! +@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@return Old value of x + +Implements Compare And Swap atomic operation. + +Sample code: +#pragma omp atomic compare update capture + { v = x; if (x == e) { x = d; } } +*/ +char __kmpc_atomic_val_1_cas(ident_t *loc, int gtid, char *x, char e, char d) { + return KMP_COMPARE_AND_STORE_RET8(x, e, d); +} +short __kmpc_atomic_val_2_cas(ident_t *loc, int gtid, short *x, short e, + short d) { + return KMP_COMPARE_AND_STORE_RET16(x, e, d); +} +kmp_int32 __kmpc_atomic_val_4_cas(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d) { + return KMP_COMPARE_AND_STORE_RET32(x, e, d); +} +kmp_int64 __kmpc_atomic_val_8_cas(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d) { + return KMP_COMPARE_AND_STORE_RET64(x, e, d); +} + +/*! +@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@param pv Captured value location +@return Result of comparison + +Implements Compare And Swap + Capture atomic operation. + +v gets old valie of x if comparison failed, untouched otherwise. +Sample code: +#pragma omp atomic compare update capture + { r = x == e; if(r) { x = d; } else { v = x; } } +*/ +bool __kmpc_atomic_bool_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv) { + char old = KMP_COMPARE_AND_STORE_RET8(x, e, d); + if (old == e) + return true; + KMP_ASSERT(pv != NULL); + *pv = old; + return false; +} +bool __kmpc_atomic_bool_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv) { + short old = KMP_COMPARE_AND_STORE_RET16(x, e, d); + if (old == e) + return true; + KMP_ASSERT(pv != NULL); + *pv = old; + return false; +} +bool __kmpc_atomic_bool_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv) { + kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(x, e, d); + if (old == e) + return true; + KMP_ASSERT(pv != NULL); + *pv = old; + return false; +} +bool __kmpc_atomic_bool_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv) { + kmp_int64 old = KMP_COMPARE_AND_STORE_RET64(x, e, d); + if (old == e) + return true; + KMP_ASSERT(pv != NULL); + *pv = old; + return false; +} + +/*! +@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@param pv Captured value location +@return Old value of x + +Implements Compare And Swap + Capture atomic operation. 
+ +v gets new valie of x. +Sample code: +#pragma omp atomic compare update capture + { if (x == e) { x = d; }; v = x; } +*/ +char __kmpc_atomic_val_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv) { + char old = KMP_COMPARE_AND_STORE_RET8(x, e, d); + KMP_ASSERT(pv != NULL); + *pv = old == e ? d : old; + return old; +} +short __kmpc_atomic_val_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv) { + short old = KMP_COMPARE_AND_STORE_RET16(x, e, d); + KMP_ASSERT(pv != NULL); + *pv = old == e ? d : old; + return old; +} +kmp_int32 __kmpc_atomic_val_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv) { + kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(x, e, d); + KMP_ASSERT(pv != NULL); + *pv = old == e ? d : old; + return old; +} +kmp_int64 __kmpc_atomic_val_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv) { + kmp_int64 old = KMP_COMPARE_AND_STORE_RET64(x, e, d); + KMP_ASSERT(pv != NULL); + *pv = old == e ? d : old; + return old; +} + +// End OpenMP 5.1 compare + capture +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 + /*! @} */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.h b/contrib/libs/cxxsupp/openmp/kmp_atomic.h index 6a0827aaf1..079b917285 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_atomic.h +++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.h @@ -578,6 +578,10 @@ void __kmpc_atomic_float8_max(ident_t *id_ref, int gtid, kmp_real64 *lhs, kmp_real64 rhs); void __kmpc_atomic_float8_min(ident_t *id_ref, int gtid, kmp_real64 *lhs, kmp_real64 rhs); +void __kmpc_atomic_float10_max(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +void __kmpc_atomic_float10_min(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); #if KMP_HAVE_QUAD void __kmpc_atomic_float16_max(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, QUAD_LEGACY rhs); @@ -1254,6 +1258,12 @@ kmp_real64 __kmpc_atomic_float8_max_cpt(ident_t *id_ref, int gtid, kmp_real64 __kmpc_atomic_float8_min_cpt(ident_t *id_ref, int gtid, kmp_real64 *lhs, kmp_real64 rhs, int flag); +long double __kmpc_atomic_float10_max_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +long double __kmpc_atomic_float10_min_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); #if KMP_HAVE_QUAD QUAD_LEGACY __kmpc_atomic_float16_max_cpt(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, QUAD_LEGACY rhs, @@ -1756,6 +1766,78 @@ long double __kmpc_atomic_float10_div_cpt_rev_fp(ident_t *id_ref, int gtid, // End of OpenMP 4.0 capture +// OpenMP 5.1 compare and swap +/* + __kmpc_atomic_bool_1_cas + __kmpc_atomic_bool_2_cas + __kmpc_atomic_bool_4_cas + __kmpc_atomic_bool_8_cas + __kmpc_atomic_val_1_cas + __kmpc_atomic_val_2_cas + __kmpc_atomic_val_4_cas + __kmpc_atomic_val_8_cas + __kmpc_atomic_bool_1_cas_cpt + __kmpc_atomic_bool_2_cas_cpt + __kmpc_atomic_bool_4_cas_cpt + __kmpc_atomic_bool_8_cas_cpt + __kmpc_atomic_val_1_cas_cpt + __kmpc_atomic_val_2_cas_cpt + __kmpc_atomic_val_4_cas_cpt + __kmpc_atomic_val_8_cas_cpt +*/ +// In all interfaces of CAS (Compare And Swap): +// r is the boolean result of comparison +// x is memory location to operate on +// e is expected (old) value +// d is desired (new) value +// pv is pointer to captured value v whose location may coincide with e + +// { r = x == e; if(r) { x = d; } } +// functions return result of comparison +bool __kmpc_atomic_bool_1_cas(ident_t *loc, int gtid, char *x, char e, char d); +bool __kmpc_atomic_bool_2_cas(ident_t *loc, int 
gtid, short *x, short e, + short d); +bool __kmpc_atomic_bool_4_cas(ident_t *loc, int gtid, kmp_int32 *x, kmp_int32 e, + kmp_int32 d); +bool __kmpc_atomic_bool_8_cas(ident_t *loc, int gtid, kmp_int64 *x, kmp_int64 e, + kmp_int64 d); + +// { v = x; if (x == e) { x = d; } } +// functions return old value +char __kmpc_atomic_val_1_cas(ident_t *loc, int gtid, char *x, char e, char d); +short __kmpc_atomic_val_2_cas(ident_t *loc, int gtid, short *x, short e, + short d); +kmp_int32 __kmpc_atomic_val_4_cas(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d); +kmp_int64 __kmpc_atomic_val_8_cas(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d); + +// { r = x == e; if(r) { x = d; } else { v = x; } } +// v gets old value if comparison failed, untouched otherwise +// functions return result of comparison +bool __kmpc_atomic_bool_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv); +bool __kmpc_atomic_bool_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv); +bool __kmpc_atomic_bool_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv); +bool __kmpc_atomic_bool_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv); + +// { if (x == e) { x = d; }; v = x; } +// v gets old value if comparison failed, new value otherwise +// functions return old value +char __kmpc_atomic_val_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv); +short __kmpc_atomic_val_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv); +kmp_int32 __kmpc_atomic_val_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv); +kmp_int64 __kmpc_atomic_val_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv); + +// End OpenMP 5.1 compare + capture + #endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 /* ------------------------------------------------------------------------ */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp index 93112156a1..ee05bb3587 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp @@ -10,12 +10,14 @@ // //===----------------------------------------------------------------------===// -#include "kmp.h" #include "kmp_wait_release.h" +#include "kmp_barrier.h" #include "kmp_itt.h" #include "kmp_os.h" #include "kmp_stats.h" #include "ompt-specific.h" +// for distributed barrier +#include "kmp_affinity.h" #if KMP_MIC #include <immintrin.h> @@ -38,6 +40,516 @@ void __kmp_print_structure(void); // Forward declaration // ---------------------------- Barrier Algorithms ---------------------------- +// Distributed barrier + +// Compute how many threads to have polling each cache-line. +// We want to limit the number of writes to IDEAL_GO_RESOLUTION. 
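To make the sizing routine that follows easier to follow, here is a stripped-down, standalone model of how the group parameters fall out for a hypothetical machine with 2 sockets, 16 cores per socket and 64 threads. It assumes KMP_OPTIMIZE_FOR_REDUCTIONS stays 0 (as defined in kmp_barrier.h below) and skips the no-topology fallback and resize handling; the numbers in the comments are just the result of running this sketch.

#include <cstddef>
#include <cstdio>

// Simplified model of distributedBarrier::computeVarsForN() for the case
// where topology information is available. All values are illustrative.
struct barrier_sizing {
  size_t threads_per_go, num_gos, num_groups, gos_per_group, threads_per_group;
};

static barrier_sizing size_barrier(size_t n, int nsockets, int ncores_per_socket) {
  barrier_sizing s{};
  s.threads_per_go = ncores_per_socket >> 1; // half the cores of one socket
  if (s.threads_per_go > 4 && nsockets == 1)
    s.threads_per_go >>= 1;                  // shrink further on one socket
  if (s.threads_per_go == 0)
    s.threads_per_go = 1;
  s.num_gos = n / s.threads_per_go + (n % s.threads_per_go ? 1 : 0);
  if (nsockets == 1 || s.num_gos == 1)
    s.num_groups = 1;
  else
    s.num_groups = s.num_gos / nsockets + (s.num_gos % nsockets ? 1 : 0);
  s.gos_per_group = s.num_gos / s.num_groups + (s.num_gos % s.num_groups ? 1 : 0);
  s.threads_per_group = s.threads_per_go * s.gos_per_group;
  return s;
}

int main() {
  // 64 threads on 2 sockets x 16 cores:
  barrier_sizing s = size_barrier(64, 2, 16);
  // Prints: threads_per_go=8, num_gos=8, num_groups=4,
  //         gos_per_group=2, threads_per_group=16
  std::printf("go=%zu gos=%zu groups=%zu gos/group=%zu thr/group=%zu\n",
              s.threads_per_go, s.num_gos, s.num_groups, s.gos_per_group,
              s.threads_per_group);
  return 0;
}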
+void distributedBarrier::computeVarsForN(size_t n) { + int nsockets = 1; + if (__kmp_topology) { + int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET); + int core_level = __kmp_topology->get_level(KMP_HW_CORE); + int ncores_per_socket = + __kmp_topology->calculate_ratio(core_level, socket_level); + nsockets = __kmp_topology->get_count(socket_level); + + if (nsockets <= 0) + nsockets = 1; + if (ncores_per_socket <= 0) + ncores_per_socket = 1; + + threads_per_go = ncores_per_socket >> 1; + if (!fix_threads_per_go) { + // Minimize num_gos + if (threads_per_go > 4) { + if (KMP_OPTIMIZE_FOR_REDUCTIONS) { + threads_per_go = threads_per_go >> 1; + } + if (threads_per_go > 4 && nsockets == 1) + threads_per_go = threads_per_go >> 1; + } + } + if (threads_per_go == 0) + threads_per_go = 1; + fix_threads_per_go = true; + num_gos = n / threads_per_go; + if (n % threads_per_go) + num_gos++; + if (nsockets == 1 || num_gos == 1) + num_groups = 1; + else { + num_groups = num_gos / nsockets; + if (num_gos % nsockets) + num_groups++; + } + if (num_groups <= 0) + num_groups = 1; + gos_per_group = num_gos / num_groups; + if (num_gos % num_groups) + gos_per_group++; + threads_per_group = threads_per_go * gos_per_group; + } else { + num_gos = n / threads_per_go; + if (n % threads_per_go) + num_gos++; + if (num_gos == 1) + num_groups = 1; + else { + num_groups = num_gos / 2; + if (num_gos % 2) + num_groups++; + } + gos_per_group = num_gos / num_groups; + if (num_gos % num_groups) + gos_per_group++; + threads_per_group = threads_per_go * gos_per_group; + } +} + +void distributedBarrier::computeGo(size_t n) { + // Minimize num_gos + for (num_gos = 1;; num_gos++) + if (IDEAL_CONTENTION * num_gos >= n) + break; + threads_per_go = n / num_gos; + if (n % num_gos) + threads_per_go++; + while (num_gos > MAX_GOS) { + threads_per_go++; + num_gos = n / threads_per_go; + if (n % threads_per_go) + num_gos++; + } + computeVarsForN(n); +} + +// This function is to resize the barrier arrays when the new number of threads +// exceeds max_threads, which is the current size of all the arrays +void distributedBarrier::resize(size_t nthr) { + KMP_DEBUG_ASSERT(nthr > max_threads); + + // expand to requested size * 2 + max_threads = nthr * 2; + + // allocate arrays to new max threads + for (int i = 0; i < MAX_ITERS; ++i) { + if (flags[i]) + flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i], + max_threads * sizeof(flags_s)); + else + flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s)); + } + + if (go) + go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s)); + else + go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s)); + + if (iter) + iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s)); + else + iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s)); + + if (sleep) + sleep = + (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s)); + else + sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s)); +} + +// This function is to set all the go flags that threads might be waiting +// on, and when blocktime is not infinite, it should be followed by a wake-up +// call to each thread +kmp_uint64 distributedBarrier::go_release() { + kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS; + for (size_t j = 0; j < num_gos; j++) { + go[j].go.store(next_go); + } + return next_go; +} + +void distributedBarrier::go_reset() { + for (size_t j = 0; j < max_threads; ++j) { + for (size_t i = 0; i < distributedBarrier::MAX_ITERS; 
++i) { + flags[i][j].stillNeed = 1; + } + go[j].go.store(0); + iter[j].iter = 0; + } +} + +// This function inits/re-inits the distributed barrier for a particular number +// of threads. If a resize of arrays is needed, it calls the resize function. +void distributedBarrier::init(size_t nthr) { + size_t old_max = max_threads; + if (nthr > max_threads) { // need more space in arrays + resize(nthr); + } + + for (size_t i = 0; i < max_threads; i++) { + for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) { + flags[j][i].stillNeed = 1; + } + go[i].go.store(0); + iter[i].iter = 0; + if (i >= old_max) + sleep[i].sleep = false; + } + + // Recalculate num_gos, etc. based on new nthr + computeVarsForN(nthr); + + num_threads = nthr; + + if (team_icvs == NULL) + team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t)); +} + +// This function is used only when KMP_BLOCKTIME is not infinite. +// static +void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team, + size_t start, size_t stop, size_t inc, + size_t tid) { + KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME); + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + kmp_info_t **other_threads = team->t.t_threads; + for (size_t thr = start; thr < stop; thr += inc) { + KMP_DEBUG_ASSERT(other_threads[thr]); + int gtid = other_threads[thr]->th.th_info.ds.ds_gtid; + // Wake up worker regardless of if it appears to be sleeping or not + __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL); + } +} + +static void __kmp_dist_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather); + kmp_team_t *team; + distributedBarrier *b; + kmp_info_t **other_threads; + kmp_uint64 my_current_iter, my_next_iter; + kmp_uint32 nproc; + bool group_leader; + + team = this_thr->th.th_team; + nproc = this_thr->th.th_team_nproc; + other_threads = team->t.t_threads; + b = team->t.b; + my_current_iter = b->iter[tid].iter; + my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS; + group_leader = ((tid % b->threads_per_group) == 0); + + KA_TRACE(20, + ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = + __itt_get_timestamp(); + } +#endif + + if (group_leader) { + // Start from the thread after the group leader + size_t group_start = tid + 1; + size_t group_end = tid + b->threads_per_group; + size_t threads_pending = 0; + + if (group_end > nproc) + group_end = nproc; + do { // wait for threads in my group + threads_pending = 0; + // Check all the flags every time to avoid branch misspredict + for (size_t thr = group_start; thr < group_end; thr++) { + // Each thread uses a different cache line + threads_pending += b->flags[my_current_iter][thr].stillNeed; + } + // Execute tasks here + if (__kmp_tasking_mode != tskm_immediate_exec) { + kmp_task_team_t *task_team = this_thr->th.th_task_team; + if (task_team != NULL) { + if (TCR_SYNC_4(task_team->tt.tt_active)) { + if (KMP_TASKING_ENABLED(task_team)) { + int tasks_completed = FALSE; + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } 
else + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } + } else { + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } // if + } + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } else if (__kmp_tasking_mode != tskm_immediate_exec && + this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { + this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; + } + } while (threads_pending > 0); + + if (reduce) { // Perform reduction if needed + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + // Group leader reduces all threads in group + for (size_t thr = group_start; thr < group_end; thr++) { + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[thr]->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + + // Set flag for next iteration + b->flags[my_next_iter][tid].stillNeed = 1; + // Each thread uses a different cache line; resets stillNeed to 0 to + // indicate it has reached the barrier + b->flags[my_current_iter][tid].stillNeed = 0; + + do { // wait for all group leaders + threads_pending = 0; + for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) { + threads_pending += b->flags[my_current_iter][thr].stillNeed; + } + // Execute tasks here + if (__kmp_tasking_mode != tskm_immediate_exec) { + kmp_task_team_t *task_team = this_thr->th.th_task_team; + if (task_team != NULL) { + if (TCR_SYNC_4(task_team->tt.tt_active)) { + if (KMP_TASKING_ENABLED(task_team)) { + int tasks_completed = FALSE; + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } else + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } + } else { + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } // if + } + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } else if (__kmp_tasking_mode != tskm_immediate_exec && + this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { + this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; + } + } while (threads_pending > 0); + + if (reduce) { // Perform reduction if needed + if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + for (size_t thr = b->threads_per_group; thr < nproc; + thr += b->threads_per_group) { + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[thr]->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + } + } else { + // Set flag for next iteration + b->flags[my_next_iter][tid].stillNeed = 1; + // Each thread uses a different cache line; resets stillNeed to 0 to + // indicate it has reached the barrier + b->flags[my_current_iter][tid].stillNeed = 0; + } + + KMP_MFENCE(); + + KA_TRACE(20, + ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void __kmp_dist_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release); + kmp_team_t *team; + distributedBarrier *b; + kmp_bstate_t *thr_bar; + kmp_uint64 my_current_iter, next_go; + size_t my_go_index; + bool group_leader; + + KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n", + gtid, tid, bt)); + + thr_bar = &this_thr->th.th_bar[bt].bb; + + if (!KMP_MASTER_TID(tid)) { + // workers and non-master group leaders need to check their presence in team + do { + if (this_thr->th.th_used_in_team.load() != 1 && + 
this_thr->th.th_used_in_team.load() != 3) { + // Thread is not in use in a team. Wait on location in tid's thread + // struct. The 0 value tells anyone looking that this thread is spinning + // or sleeping until this location becomes 3 again; 3 is the transition + // state to get to 1 which is waiting on go and being in the team + kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3); + if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2, + 0) || + this_thr->th.th_used_in_team.load() == 0) { + my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); + } +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably + itt_sync_obj = + __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + } + if (this_thr->th.th_used_in_team.load() != 1 && + this_thr->th.th_used_in_team.load() != 3) // spurious wake-up? + continue; + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // At this point, the thread thinks it is in use in a team, or in + // transition to be used in a team, but it might have reached this barrier + // before it was marked unused by the team. Unused threads are awoken and + // shifted to wait on local thread struct elsewhere. It also might reach + // this point by being picked up for use by a different team. Either way, + // we need to update the tid. + tid = __kmp_tid_from_gtid(gtid); + team = this_thr->th.th_team; + KMP_DEBUG_ASSERT(tid >= 0); + KMP_DEBUG_ASSERT(team); + b = team->t.b; + my_current_iter = b->iter[tid].iter; + next_go = my_current_iter + distributedBarrier::MAX_ITERS; + my_go_index = tid / b->threads_per_go; + if (this_thr->th.th_used_in_team.load() == 3) { + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1); + } + // Check if go flag is set + if (b->go[my_go_index].go.load() != next_go) { + // Wait on go flag on team + kmp_atomic_flag_64<false, true> my_flag( + &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep)); + my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); + KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter || + b->iter[tid].iter == 0); + KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false); + } + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + // At this point, the thread's go location was set. This means the primary + // thread is safely in the barrier, and so this thread's data is + // up-to-date, but we should check again that this thread is really in + // use in the team, as it could have been woken up for the purpose of + // changing team size, or reaping threads at shutdown. + if (this_thr->th.th_used_in_team.load() == 1) + break; + } while (1); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + group_leader = ((tid % b->threads_per_group) == 0); + if (group_leader) { + // Tell all the threads in my group they can go! 
+ for (size_t go_idx = my_go_index + 1; + go_idx < my_go_index + b->gos_per_group; go_idx++) { + b->go[go_idx].go.store(next_go); + } + // Fence added so that workers can see changes to go. sfence inadequate. + KMP_MFENCE(); + } + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { // copy ICVs to final dest + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, + tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + (kmp_internal_control_t *)team->t.b->team_icvs); + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) { + // This thread is now awake and participating in the barrier; + // wake up the other threads in the group + size_t nproc = this_thr->th.th_team_nproc; + size_t group_end = tid + b->threads_per_group; + if (nproc < group_end) + group_end = nproc; + __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid); + } + } else { // Primary thread + team = this_thr->th.th_team; + b = team->t.b; + my_current_iter = b->iter[tid].iter; + next_go = my_current_iter + distributedBarrier::MAX_ITERS; +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + // primary thread has ICVs in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + // Tell all the group leaders they can go! + for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) { + b->go[go_idx].go.store(next_go); + } + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + // Wake-up the group leaders + size_t nproc = this_thr->th.th_team_nproc; + __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc, + b->threads_per_group, tid); + } + + // Tell all the threads in my group they can go! + for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) { + b->go[go_idx].go.store(next_go); + } + + // Fence added so that workers can see changes to go. sfence inadequate. 
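The release protocol above boils down to a generation counter per go flag: each waiter computes the generation it expects next (its iteration counter plus MAX_ITERS) and spins or sleeps until the flag reaches it, while the primary thread and the group leaders simply store that value. Below is a minimal std::atomic model of that handshake; it is a sketch only, leaving out the sleep flags, task execution while waiting, and the four-cache-line padding declared in kmp_barrier.h.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

constexpr unsigned long long MAX_ITERS = 3; // matches distributedBarrier::MAX_ITERS

std::atomic<unsigned long long> go{0};      // one go flag shared by a group

void worker(int id, unsigned long long my_iter) {
  unsigned long long next_go = my_iter + MAX_ITERS; // generation to wait for
  while (go.load(std::memory_order_acquire) != next_go)
    std::this_thread::yield(); // the real code may sleep or run tasks here
  std::printf("worker %d released at generation %llu\n", id, next_go);
}

int main() {
  unsigned long long iter = 0; // current barrier iteration
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i)
    threads.emplace_back(worker, i, iter);
  // Leader: publish the new generation; every waiter sees it and proceeds.
  go.store(iter + MAX_ITERS, std::memory_order_release);
  for (auto &t : threads)
    t.join();
  return 0;
}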
+ KMP_MFENCE(); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + // Wake-up the other threads in my group + size_t nproc = this_thr->th.th_team_nproc; + size_t group_end = tid + b->threads_per_group; + if (nproc < group_end) + group_end = nproc; + __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid); + } + } + // Update to next iteration + KMP_ASSERT(my_current_iter == b->iter[tid].iter); + b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS; + + KA_TRACE( + 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} // Linear Barrier template <bool cancellable = false> @@ -1354,6 +1866,11 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); } else { switch (__kmp_barrier_gather_pattern[bt]) { + case bp_dist_bar: { + __kmp_dist_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { // don't set branch bits to 0; use linear KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); @@ -1467,6 +1984,12 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); } else { switch (__kmp_barrier_release_pattern[bt]) { + case bp_dist_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_dist_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, @@ -1514,8 +2037,10 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, } #endif - KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == - TRUE); + KMP_DEBUG_ASSERT( + this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE || + this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered == + TRUE); __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); __kmp_task_team_setup(this_thr, team, 0); @@ -1596,6 +2121,11 @@ void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { if (!team->t.t_serialized) { if (KMP_MASTER_GTID(gtid)) { switch (__kmp_barrier_release_pattern[bt]) { + case bp_dist_bar: { + __kmp_dist_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(NULL)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, @@ -1634,7 +2164,6 @@ void __kmp_join_barrier(int gtid) { kmp_info_t *this_thr = __kmp_threads[gtid]; kmp_team_t *team; kmp_uint nproc; - kmp_info_t *master_thread; int tid; #ifdef KMP_DEBUG int team_id; @@ -1656,9 +2185,7 @@ void __kmp_join_barrier(int gtid) { tid = __kmp_tid_from_gtid(gtid); #ifdef KMP_DEBUG team_id = team->t.t_id; -#endif /* KMP_DEBUG */ - master_thread = this_thr->th.th_team_master; -#ifdef KMP_DEBUG + kmp_info_t *master_thread = this_thr->th.th_team_master; if (master_thread != team->t.t_threads[0]) { __kmp_print_structure(); } @@ -1705,8 +2232,8 @@ void __kmp_join_barrier(int gtid) { if (__kmp_tasking_mode == tskm_extra_barrier) { __kmp_tasking_barrier(team, this_thr, gtid); - KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, - team_id, tid)); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", + gtid, team_id, tid)); } #ifdef KMP_DEBUG if (__kmp_tasking_mode != tskm_immediate_exec) { @@ -1715,8 
+2242,9 @@ void __kmp_join_barrier(int gtid) { __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state], this_thr->th.th_task_team)); - KMP_DEBUG_ASSERT(this_thr->th.th_task_team == - team->t.t_task_team[this_thr->th.th_task_state]); + if (this_thr->th.th_task_team) + KMP_DEBUG_ASSERT(this_thr->th.th_task_team == + team->t.t_task_team[this_thr->th.th_task_state]); } #endif /* KMP_DEBUG */ @@ -1742,6 +2270,11 @@ void __kmp_join_barrier(int gtid) { #endif /* USE_ITT_BUILD */ switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { + case bp_dist_bar: { + __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, @@ -1787,8 +2320,7 @@ void __kmp_join_barrier(int gtid) { team_thread->th.th_stats->setIdleFlag(); if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL) - __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), - team_thread->th.th_sleep_loc); + __kmp_null_resume_wrapper(team_thread); } #endif #if USE_ITT_BUILD @@ -1806,8 +2338,6 @@ void __kmp_join_barrier(int gtid) { kmp_uint64 cur_time = __itt_get_timestamp(); ident_t *loc = team->t.t_ident; kmp_info_t **other_threads = team->t.t_threads; - int nproc = this_thr->th.th_team_nproc; - int i; switch (__kmp_forkjoin_frames_mode) { case 1: __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, @@ -1824,7 +2354,7 @@ void __kmp_join_barrier(int gtid) { // Set arrive time to zero to be able to check it in // __kmp_invoke_task(); the same is done inside the loop below this_thr->th.th_bar_arrive_time = 0; - for (i = 1; i < nproc; ++i) { + for (kmp_uint i = 1; i < nproc; ++i) { delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); other_threads[i]->th.th_bar_arrive_time = 0; } @@ -1933,6 +2463,11 @@ void __kmp_fork_barrier(int gtid, int tid) { } // primary thread switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { + case bp_dist_bar: { + __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(NULL)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.h b/contrib/libs/cxxsupp/openmp/kmp_barrier.h new file mode 100644 index 0000000000..ac28a13217 --- /dev/null +++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.h @@ -0,0 +1,141 @@ +/* + * kmp_barrier.h + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_BARRIER_H +#define KMP_BARRIER_H + +#include "kmp.h" +#include "kmp_i18n.h" + +#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC +#include <xmmintrin.h> +#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment) +#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr) +#elif KMP_HAVE_ALIGNED_ALLOC +#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size) +#define KMP_ALIGNED_FREE(ptr) free(ptr) +#elif KMP_HAVE_POSIX_MEMALIGN +static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) { + void *ptr; + int n = posix_memalign(&ptr, alignment, size); + if (n != 0) { + if (ptr) + free(ptr); + return nullptr; + } + return ptr; +} +#define KMP_ALIGNED_FREE(ptr) free(ptr) +#elif KMP_HAVE__ALIGNED_MALLOC +#include <malloc.h> +#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment) +#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr) +#else +#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size) +#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr) +#endif + +// Use four cache lines: MLC tends to prefetch the next or previous cache line +// creating a possible fake conflict between cores, so this is the only way to +// guarantee that no such prefetch can happen. +#ifndef KMP_FOURLINE_ALIGN_CACHE +#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE) +#endif + +#define KMP_OPTIMIZE_FOR_REDUCTIONS 0 + +class distributedBarrier { + struct flags_s { + kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed; + }; + + struct go_s { + std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go; + }; + + struct iter_s { + kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter; + }; + + struct sleep_s { + std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep; + }; + + void init(size_t nthr); + void resize(size_t nthr); + void computeGo(size_t n); + void computeVarsForN(size_t n); + +public: + enum { + MAX_ITERS = 3, + MAX_GOS = 8, + IDEAL_GOS = 4, + IDEAL_CONTENTION = 16, + }; + + flags_s *flags[MAX_ITERS]; + go_s *go; + iter_s *iter; + sleep_s *sleep; + + size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier + size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure + // number of go signals each requiring one write per iteration + size_t KMP_ALIGN_CACHE num_gos; + // number of groups of gos + size_t KMP_ALIGN_CACHE num_groups; + // threads per go signal + size_t KMP_ALIGN_CACHE threads_per_go; + bool KMP_ALIGN_CACHE fix_threads_per_go; + // threads per group + size_t KMP_ALIGN_CACHE threads_per_group; + // number of go signals in a group + size_t KMP_ALIGN_CACHE gos_per_group; + void *team_icvs; + + distributedBarrier() = delete; + ~distributedBarrier() = delete; + + // Used instead of constructor to create aligned data + static distributedBarrier *allocate(int nThreads) { + distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE( + sizeof(distributedBarrier), 4 * CACHE_LINE); + if (!d) { + KMP_FATAL(MemoryAllocFailed); + } + d->num_threads = 0; + d->max_threads = 0; + for (int i = 0; i < MAX_ITERS; ++i) + d->flags[i] = NULL; + d->go = NULL; + d->iter = NULL; + d->sleep = NULL; + d->team_icvs = NULL; + d->fix_threads_per_go = false; + // calculate gos and groups ONCE on base size + d->computeGo(nThreads); + d->init(nThreads); + return d; + } + + static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); } + + void update_num_threads(size_t 
nthr) { init(nthr); } + + bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); } + size_t get_num_threads() { return num_threads; } + kmp_uint64 go_release(); + void go_reset(); +}; + +#endif // KMP_BARRIER_H diff --git a/contrib/libs/cxxsupp/openmp/kmp_config.h b/contrib/libs/cxxsupp/openmp/kmp_config.h index 81314ed20a..2f7a7f9320 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_config.h +++ b/contrib/libs/cxxsupp/openmp/kmp_config.h @@ -80,6 +80,16 @@ #define KMP_HAVE_ATTRIBUTE_RTM LIBOMP_HAVE_ATTRIBUTE_RTM #define LIBOMP_ARCH_AARCH64_A64FX 0 #define KMP_ARCH_AARCH64_A64FX LIBOMP_ARCH_AARCH64_A64FX +#define LIBOMP_HAVE_XMMINTRIN_H 1 +#define KMP_HAVE_XMMINTRIN_H LIBOMP_HAVE_XMMINTRIN_H +#define LIBOMP_HAVE__MM_MALLOC 1 +#define KMP_HAVE__MM_MALLOC LIBOMP_HAVE__MM_MALLOC +#define LIBOMP_HAVE_ALIGNED_ALLOC 1 +#define KMP_HAVE_ALIGNED_ALLOC LIBOMP_HAVE_ALIGNED_ALLOC +#define LIBOMP_HAVE_POSIX_MEMALIGN 1 +#define KMP_HAVE_POSIX_MEMALIGN LIBOMP_HAVE_POSIX_MEMALIGN +#define LIBOMP_HAVE__ALIGNED_MALLOC 0 +#define KMP_HAVE__ALIGNED_MALLOC LIBOMP_HAVE__ALIGNED_MALLOC // Configured cache line based on architecture #if KMP_ARCH_PPC64 @@ -119,4 +129,9 @@ # define KMP_GOMP_COMPAT #endif +// use shared memory with dynamic library (except Android, where shm_* +// functions don't exist). +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !__ANDROID__ +#define KMP_USE_SHM +#endif #endif // KMP_CONFIG_H diff --git a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp index 2a7c9a8cb2..e263558517 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp @@ -288,15 +288,7 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) { ompt_frame_t *ompt_frame; if (ompt_enabled.enabled) { kmp_info_t *master_th = __kmp_threads[gtid]; - kmp_team_t *parent_team = master_th->th.th_team; - ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info; - if (lwt) - ompt_frame = &(lwt->ompt_task_info.frame); - else { - int tid = __kmp_tid_from_gtid(gtid); - ompt_frame = &( - parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame); - } + ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame; ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } OMPT_STORE_RETURN_ADDRESS(gtid); @@ -320,6 +312,12 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 
{ ); va_end(ap); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif } #if KMP_STATS_ENABLED @@ -533,7 +531,8 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { kmp_task_team_t *task_team = this_thr->th.th_task_team; // we need to wait for the proxy tasks before finishing the thread - if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) + if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered)) __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL)); KMP_MB(); @@ -578,9 +577,6 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { __kmp_free(top); } - // if( serial_team -> t.t_serialized > 1 ) - serial_team->t.t_level--; - /* pop dispatch buffers stack */ KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); { @@ -605,6 +601,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + __kmp_pop_current_task_from_thread(this_thr); #if OMPD_SUPPORT if (ompd_state & OMPD_ENABLE_BP) ompd_bp_parallel_end(); @@ -623,8 +620,6 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { this_thr->th.th_dispatch = &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid]; - __kmp_pop_current_task_from_thread(this_thr); - KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0); this_thr->th.th_current_task->td_flags.executing = 1; @@ -645,6 +640,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { } } + serial_team->t.t_level--; if (__kmp_env_consistency_check) __kmp_pop_parallel(global_tid, NULL); #if OMPT_SUPPORT @@ -686,7 +682,7 @@ void __kmpc_flush(ident_t *loc) { if (!__kmp_cpuinfo.initialized) { __kmp_query_cpuid(&__kmp_cpuinfo); } - if (!__kmp_cpuinfo.sse2) { + if (!__kmp_cpuinfo.flags.sse2) { // CPU cannot execute SSE2 instructions. 
} else { #if KMP_COMPILER_ICC @@ -1359,7 +1355,7 @@ static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 -#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) +#define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm) #else #define KMP_CPUINFO_RTM 0 #endif @@ -4330,24 +4326,35 @@ void __kmpc_doacross_fini(ident_t *loc, int gtid) { KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid)); } -/* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */ +/* OpenMP 5.1 Memory Management routines */ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { - return __kmpc_alloc(__kmp_entry_gtid(), size, allocator); + return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator); +} + +void *omp_aligned_alloc(size_t align, size_t size, + omp_allocator_handle_t allocator) { + return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator); } void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) { - return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator); + return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator); +} + +void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { + return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator); } void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, omp_allocator_handle_t free_allocator) { - return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator, + return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator, free_allocator); } void omp_free(void *ptr, omp_allocator_handle_t allocator) { - __kmpc_free(__kmp_entry_gtid(), ptr, allocator); + ___kmpc_free(__kmp_entry_gtid(), ptr, allocator); } +/* end of OpenMP 5.1 Memory Management routines */ int __kmpc_get_target_offload(void) { if (!__kmp_init_serial) { @@ -4395,6 +4402,38 @@ void __kmpc_error(ident_t *loc, int severity, const char *message) { __kmp_str_free(&src_loc); } +// Mark begin of scope directive. +void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) { +// reserved is for extension of scope directive and not used. +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) { + kmp_team_t *team = __kmp_threads[gtid]->th.th_team; + int tid = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_scope, ompt_scope_begin, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL +} + +// Mark end of scope directive +void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) { +// reserved is for extension of scope directive and not used. +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) { + kmp_team_t *team = __kmp_threads[gtid]->th.th_team; + int tid = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_scope, ompt_scope_end, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL +} + #ifdef KMP_USE_VERSION_SYMBOLS // For GOMP compatibility there are two versions of each omp_* API. 
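/* Usage sketch, not from this commit: the hunk above wires the OpenMP 5.1
 * memory-management entry points (omp_aligned_alloc, omp_aligned_calloc) in
 * next to the existing omp_alloc/omp_calloc/omp_realloc/omp_free. Typical
 * application-side usage, assuming a compiler whose <omp.h> exposes the 5.1
 * allocator API: */
#include <omp.h>
#include <stdio.h>

int main(void) {
  /* 64-byte aligned block from the default memory space */
  double *a = (double *)omp_aligned_alloc(64, 1024 * sizeof(double),
                                          omp_default_mem_alloc);
  /* zero-initialized, 64-byte aligned array of 1024 doubles */
  double *b = (double *)omp_aligned_calloc(64, 1024, sizeof(double),
                                           omp_default_mem_alloc);
  if (a && b)
    printf("a=%p b=%p\n", (void *)a, (void *)b);
  omp_free(a, omp_default_mem_alloc);
  omp_free(b, omp_default_mem_alloc);
  return 0;
}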
// One is the plain C symbol and one is the Fortran symbol with an appended diff --git a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp index 108384e1cc..f3407bf889 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp @@ -72,8 +72,8 @@ void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, bool use_hier = false) { // Pick up the nonmonotonic/monotonic bits from the scheduling type - // TODO: make nonmonotonic when static_steal is fixed - int monotonicity = SCHEDULE_MONOTONIC; + // Nonmonotonic as default for dynamic schedule when no modifier is specified + int monotonicity = SCHEDULE_NONMONOTONIC; // Let default be monotonic for executables // compiled with OpenMP* 4.5 or less compilers @@ -561,6 +561,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, _control87(_PC_64, _MCW_PC); // 0,0x30000 #endif /* value used for comparison in solver for cross-over point */ + KMP_ASSERT(tc > 0); long double target = ((long double)chunk * 2 + 1) * nproc / tc; /* crossover point--chunk indexes equal to or greater than @@ -668,6 +669,8 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, case kmp_sch_static_chunked: case kmp_sch_dynamic_chunked: dynamic_init: + if (tc == 0) + break; if (pr->u.p.parm1 <= 0) pr->u.p.parm1 = KMP_DEFAULT_CHUNK; else if (pr->u.p.parm1 > tc) @@ -1713,7 +1716,7 @@ int __kmp_dispatch_next_algorithm(int gtid, status = 0; // nothing to do, don't try atomic op break; } - KMP_DEBUG_ASSERT(init % chunk == 0); + KMP_DEBUG_ASSERT(chunk && init % chunk == 0); // compare with K*nproc*(chunk+1), K=2 by default if ((T)remaining < pr->u.p.parm2) { // use dynamic-style schedule @@ -2652,9 +2655,11 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, kmp_uint32 spins; kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; kmp_uint32 r; + kmp_uint64 time; KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); // main wait spin loop while (!f(r = TCR_4(*spin), check)) { KMP_FSYNC_SPIN_PREPARE(obj); @@ -2662,7 +2667,7 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, split. 
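/* Illustrative sketch, not the runtime's macros: the wait loops above pair
 * KMP_INIT_YIELD with the new KMP_INIT_BACKOFF(time) so that
 * KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time) can yield after spinning for a
 * while and, with this update, also feed a time-based backoff. A generic
 * equivalent of that spin-then-yield pattern, with hypothetical names, looks
 * roughly like this: */
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

#define EXAMPLE_SPINS_BEFORE_YIELD 4096

static void example_spin_wait(atomic_int *flag, int expected) {
  uint32_t spins = EXAMPLE_SPINS_BEFORE_YIELD; /* cf. KMP_INIT_YIELD */
  while (atomic_load_explicit(flag, memory_order_acquire) != expected) {
    if (--spins == 0) {
      sched_yield(); /* waited a while (or oversubscribed): give up the CPU */
      spins = EXAMPLE_SPINS_BEFORE_YIELD;
    }
    /* else keep spinning; a real implementation would also issue a CPU
     * pause/backoff hint here (cf. KMP_CPU_PAUSE / __kmp_tpause). */
  }
}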
It causes problems with infinite recursion because of exit lock */ /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) __kmp_abort_thread(); */ - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } KMP_FSYNC_SPIN_ACQUIRED(obj); return r; @@ -2677,15 +2682,17 @@ void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, kmp_uint32 check = checker; kmp_uint32 spins; kmp_uint32 (*f)(void *, kmp_uint32) = pred; + kmp_uint64 time; KMP_FSYNC_SPIN_INIT(obj, spin); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); // main wait spin loop while (!f(spin, check)) { KMP_FSYNC_SPIN_PREPARE(obj); /* if we have waited a bit, or are noversubscribed, yield */ /* pause is in the following code */ - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } KMP_FSYNC_SPIN_ACQUIRED(obj); } diff --git a/contrib/libs/cxxsupp/openmp/kmp_dispatch.h b/contrib/libs/cxxsupp/openmp/kmp_dispatch.h index ae11361ca5..154db17461 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_dispatch.h +++ b/contrib/libs/cxxsupp/openmp/kmp_dispatch.h @@ -292,10 +292,12 @@ static UT __kmp_wait(volatile UT *spinner, UT checker, UT check = checker; kmp_uint32 spins; kmp_uint32 (*f)(UT, UT) = pred; + kmp_uint64 time; UT r; KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin)); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); // main wait spin loop while (!f(r = *spin, check)) { KMP_FSYNC_SPIN_PREPARE(obj); @@ -305,7 +307,7 @@ static UT __kmp_wait(volatile UT *spinner, UT checker, /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) __kmp_abort_thread(); */ // If oversubscribed, or have waited a bit then yield. - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } KMP_FSYNC_SPIN_ACQUIRED(obj); return r; diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h index 30c967af3d..bf9ebf9b2e 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h +++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h @@ -1446,6 +1446,120 @@ int FTN_STDCALL FTN_GET_TEAMS_THREAD_LIMIT(void) { #endif } +/// TODO: Include the `omp.h` of the current build +/* OpenMP 5.1 interop */ +typedef intptr_t omp_intptr_t; + +/* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined + * properties */ +typedef enum omp_interop_property { + omp_ipr_fr_id = -1, + omp_ipr_fr_name = -2, + omp_ipr_vendor = -3, + omp_ipr_vendor_name = -4, + omp_ipr_device_num = -5, + omp_ipr_platform = -6, + omp_ipr_device = -7, + omp_ipr_device_context = -8, + omp_ipr_targetsync = -9, + omp_ipr_first = -9 +} omp_interop_property_t; + +#define omp_interop_none 0 + +typedef enum omp_interop_rc { + omp_irc_no_value = 1, + omp_irc_success = 0, + omp_irc_empty = -1, + omp_irc_out_of_range = -2, + omp_irc_type_int = -3, + omp_irc_type_ptr = -4, + omp_irc_type_str = -5, + omp_irc_other = -6 +} omp_interop_rc_t; + +typedef enum omp_interop_fr { + omp_ifr_cuda = 1, + omp_ifr_cuda_driver = 2, + omp_ifr_opencl = 3, + omp_ifr_sycl = 4, + omp_ifr_hip = 5, + omp_ifr_level_zero = 6, + omp_ifr_last = 7 +} omp_interop_fr_t; + +typedef void *omp_interop_t; + +// libomptarget, if loaded, provides this function +int FTN_STDCALL FTN_GET_NUM_INTEROP_PROPERTIES(const omp_interop_t interop) { +#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB) + return 0; +#else + int (*fptr)(const omp_interop_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_num_interop_properties"))) + return (*fptr)(interop); + return 0; +#endif // KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || 
defined(KMP_STUB) +} + +/// TODO Convert FTN_GET_INTEROP_XXX functions into a macro like interop.cpp +// libomptarget, if loaded, provides this function +intptr_t FTN_STDCALL FTN_GET_INTEROP_INT(const omp_interop_t interop, + omp_interop_property_t property_id, + int *err) { + intptr_t (*fptr)(const omp_interop_t, omp_interop_property_t, int *); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_int"))) + return (*fptr)(interop, property_id, err); + return 0; +} + +// libomptarget, if loaded, provides this function +void *FTN_STDCALL FTN_GET_INTEROP_PTR(const omp_interop_t interop, + omp_interop_property_t property_id, + int *err) { + void *(*fptr)(const omp_interop_t, omp_interop_property_t, int *); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_ptr"))) + return (*fptr)(interop, property_id, err); + return nullptr; +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_STR(const omp_interop_t interop, + omp_interop_property_t property_id, + int *err) { + const char *(*fptr)(const omp_interop_t, omp_interop_property_t, int *); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_str"))) + return (*fptr)(interop, property_id, err); + return nullptr; +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_NAME( + const omp_interop_t interop, omp_interop_property_t property_id) { + const char *(*fptr)(const omp_interop_t, omp_interop_property_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_name"))) + return (*fptr)(interop, property_id); + return nullptr; +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_TYPE_DESC( + const omp_interop_t interop, omp_interop_property_t property_id) { + const char *(*fptr)(const omp_interop_t, omp_interop_property_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_type_desc"))) + return (*fptr)(interop, property_id); + return nullptr; +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_RC_DESC( + const omp_interop_t interop, omp_interop_property_t property_id) { + const char *(*fptr)(const omp_interop_t, omp_interop_property_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_rec_desc"))) + return (*fptr)(interop, property_id); + return nullptr; +} + // display environment variables when requested void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) { #ifndef KMP_STUB diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h index 5b9e396e3d..66e1e1ecd2 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h +++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h @@ -140,6 +140,14 @@ #define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit #define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit +#define FTN_GET_NUM_INTEROP_PROPERTIES omp_get_num_interop_properties +#define FTN_GET_INTEROP_INT omp_get_interop_int +#define FTN_GET_INTEROP_PTR omp_get_interop_ptr +#define FTN_GET_INTEROP_STR omp_get_interop_str +#define FTN_GET_INTEROP_NAME omp_get_interop_name +#define FTN_GET_INTEROP_TYPE_DESC omp_get_interop_type_desc +#define FTN_GET_INTEROP_RC_DESC omp_get_interop_rc_desc + #endif /* KMP_FTN_PLAIN */ /* ------------------------------------------------------------------------ */ @@ -268,6 +276,14 @@ #define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit_ #define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit_ +#define FTN_GET_NUM_INTEROP_PROPERTIES 
omp_get_num_interop_properties_ +#define FTN_GET_INTEROP_INT omp_get_interop_int_ +#define FTN_GET_INTEROP_PTR omp_get_interop_ptr_ +#define FTN_GET_INTEROP_STR omp_get_interop_str_ +#define FTN_GET_INTEROP_NAME omp_get_interop_name_ +#define FTN_GET_INTEROP_TYPE_DESC omp_get_interop_type_desc_ +#define FTN_GET_INTEROP_RC_DESC omp_get_interop_rc_desc_ + #endif /* KMP_FTN_APPEND */ /* ------------------------------------------------------------------------ */ @@ -394,6 +410,14 @@ #define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT #define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT +#define FTN_GET_NUM_INTEROP_PROPERTIES OMP_GET_NUM_INTEROP_PROPERTIES +#define FTN_GET_INTEROP_INT OMP_GET_INTEROP_INT +#define FTN_GET_INTEROP_PTR OMP_GET_INTEROP_PTR +#define FTN_GET_INTEROP_STR OMP_GET_INTEROP_STR +#define FTN_GET_INTEROP_NAME OMP_GET_INTEROP_NAME +#define FTN_GET_INTEROP_TYPE_DESC OMP_GET_INTEROP_TYPE_DESC +#define FTN_GET_INTEROP_RC_DESC OMP_GET_INTEROP_RC_DESC + #endif /* KMP_FTN_UPPER */ /* ------------------------------------------------------------------------ */ @@ -522,6 +546,14 @@ #define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT_ #define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT_ +#define FTN_GET_NUM_INTEROP_PROPERTIES OMP_GET_NUM_INTEROP_PROPERTIES_ +#define FTN_GET_INTEROP_INT OMP_GET_INTEROP_INT_ +#define FTN_GET_INTEROP_PTR OMP_GET_INTEROP_PTR_ +#define FTN_GET_INTEROP_STR OMP_GET_INTEROP_STR_ +#define FTN_GET_INTEROP_NAME OMP_GET_INTEROP_NAME_ +#define FTN_GET_INTEROP_TYPE_DESC OMP_GET_INTEROP_TYPE_DESC_ +#define FTN_GET_INTEROP_RC_DESC OMP_GET_INTEROP_RC_DESC_ + #endif /* KMP_FTN_UAPPEND */ /* -------------------------- GOMP API NAMES ------------------------ */ @@ -712,5 +744,6 @@ #define KMP_API_NAME_GOMP_SECTIONS2_START GOMP_sections2_start #define KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER \ GOMP_workshare_task_reduction_unregister - +#define KMP_API_NAME_GOMP_ALLOC GOMP_alloc +#define KMP_API_NAME_GOMP_FREE GOMP_free #endif /* KMP_FTN_OS_H */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_global.cpp b/contrib/libs/cxxsupp/openmp/kmp_global.cpp index b519fcf678..62bdac3c4b 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_global.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_global.cpp @@ -110,8 +110,8 @@ char const *__kmp_barrier_type_name[bs_last_barrier] = {"plain", "forkjoin" "reduction" #endif // KMP_FAST_REDUCTION_BARRIER }; -char const *__kmp_barrier_pattern_name[bp_last_bar] = {"linear", "tree", - "hyper", "hierarchical"}; +char const *__kmp_barrier_pattern_name[bp_last_bar] = { + "linear", "tree", "hyper", "hierarchical", "dist"}; int __kmp_allThreadsSpecified = 0; size_t __kmp_align_alloc = CACHE_LINE; @@ -219,6 +219,13 @@ int __kmp_mwait_enabled = FALSE; int __kmp_mwait_hints = 0; #endif +#if KMP_HAVE_UMWAIT +int __kmp_waitpkg_enabled = 0; +int __kmp_tpause_state = 0; +int __kmp_tpause_hint = 1; +int __kmp_tpause_enabled = 0; +#endif + /* map OMP 3.0 schedule types with our internal schedule types */ enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2] = { @@ -280,6 +287,7 @@ char *__kmp_cpuinfo_file = NULL; #endif /* KMP_AFFINITY_SUPPORTED */ kmp_nested_proc_bind_t __kmp_nested_proc_bind = {NULL, 0, 0}; +kmp_proc_bind_t __kmp_teams_proc_bind = proc_bind_spread; int __kmp_affinity_num_places = 0; int __kmp_display_affinity = FALSE; char *__kmp_affinity_format = NULL; @@ -424,6 +432,7 @@ kmp_int32 __kmp_use_yield_exp_set = 0; kmp_uint32 __kmp_yield_init = 
KMP_INIT_WAIT; kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT; +kmp_uint64 __kmp_pause_init = 1; // for tpause /* ------------------------------------------------------ */ /* STATE mostly syncronized with global lock */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp b/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp index 61a3199f1a..d77d4809a7 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp @@ -23,18 +23,24 @@ enum { KMP_GOMP_TASK_DEPENDS_FLAG = 8 }; +enum { + KMP_GOMP_DEPOBJ_IN = 1, + KMP_GOMP_DEPOBJ_OUT = 2, + KMP_GOMP_DEPOBJ_INOUT = 3, + KMP_GOMP_DEPOBJ_MTXINOUTSET = 4 +}; + // This class helps convert gomp dependency info into // kmp_depend_info_t structures class kmp_gomp_depends_info_t { void **depend; kmp_int32 num_deps; - size_t num_out, num_mutexinout, num_in; + size_t num_out, num_mutexinout, num_in, num_depobj; size_t offset; public: kmp_gomp_depends_info_t(void **depend) : depend(depend) { size_t ndeps = (kmp_intptr_t)depend[0]; - size_t num_doable; // GOMP taskdep structure: // if depend[0] != 0: // depend = [ ndeps | nout | &out | ... | &out | &in | ... | &in ] @@ -45,21 +51,17 @@ public: if (ndeps) { num_out = (kmp_intptr_t)depend[1]; num_in = ndeps - num_out; - num_mutexinout = 0; - num_doable = ndeps; + num_mutexinout = num_depobj = 0; offset = 2; } else { ndeps = (kmp_intptr_t)depend[1]; num_out = (kmp_intptr_t)depend[2]; num_mutexinout = (kmp_intptr_t)depend[3]; num_in = (kmp_intptr_t)depend[4]; - num_doable = num_out + num_mutexinout + num_in; + num_depobj = ndeps - num_out - num_mutexinout - num_in; + KMP_ASSERT(num_depobj <= ndeps); offset = 5; } - // TODO: Support gomp depobj - if (ndeps != num_doable) { - KMP_FATAL(GompFeatureNotSupported, "depobj"); - } num_deps = static_cast<kmp_int32>(ndeps); } kmp_int32 get_num_deps() const { return num_deps; } @@ -67,7 +69,6 @@ public: kmp_depend_info_t retval; memset(&retval, '\0', sizeof(retval)); KMP_ASSERT(index < (size_t)num_deps); - retval.base_addr = (kmp_intptr_t)depend[offset + index]; retval.len = 0; // Because inout and out are logically equivalent, // use inout and in dependency flags. GOMP does not provide a @@ -75,10 +76,37 @@ public: if (index < num_out) { retval.flags.in = 1; retval.flags.out = 1; + retval.base_addr = (kmp_intptr_t)depend[offset + index]; } else if (index >= num_out && index < (num_out + num_mutexinout)) { retval.flags.mtx = 1; - } else { + retval.base_addr = (kmp_intptr_t)depend[offset + index]; + } else if (index >= (num_out + num_mutexinout) && + index < (num_out + num_mutexinout + num_in)) { retval.flags.in = 1; + retval.base_addr = (kmp_intptr_t)depend[offset + index]; + } else { + // depobj is a two element array (size of elements are size of pointer) + // depobj[0] = base_addr + // depobj[1] = type (in, out, inout, mutexinoutset, etc.) 
+ kmp_intptr_t *depobj = (kmp_intptr_t *)depend[offset + index]; + retval.base_addr = depobj[0]; + switch (depobj[1]) { + case KMP_GOMP_DEPOBJ_IN: + retval.flags.in = 1; + break; + case KMP_GOMP_DEPOBJ_OUT: + retval.flags.out = 1; + break; + case KMP_GOMP_DEPOBJ_INOUT: + retval.flags.in = 1; + retval.flags.out = 1; + break; + case KMP_GOMP_DEPOBJ_MTXINOUTSET: + retval.flags.mtx = 1; + break; + default: + KMP_FATAL(GompFeatureNotSupported, "Unknown depobj type"); + } } return retval; } @@ -1206,7 +1234,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data, // The low-order bit is the "untied" flag if (!(gomp_flags & KMP_GOMP_TASK_UNTIED_FLAG)) { - input_flags->tiedness = 1; + input_flags->tiedness = TASK_TIED; } // The second low-order bit is the "final" flag if (gomp_flags & KMP_GOMP_TASK_FINAL_FLAG) { @@ -1494,6 +1522,13 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *), KA_TRACE(20, ("GOMP_parallel_sections: T#%d\n", gtid)); #if OMPT_SUPPORT + ompt_frame_t *task_frame; + kmp_info_t *thr; + if (ompt_enabled.enabled) { + thr = __kmp_threads[gtid]; + task_frame = &(thr->th.th_current_task->ompt_task_info.frame); + task_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } OMPT_STORE_RETURN_ADDRESS(gtid); #endif @@ -1509,9 +1544,31 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *), KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); } + +#if OMPT_SUPPORT + ompt_frame_t *child_frame; + if (ompt_enabled.enabled) { + child_frame = &(thr->th.th_current_task->ompt_task_info.frame); + child_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } +#endif + task(data); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + child_frame->exit_frame = ompt_data_none; + } +#endif + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); KA_TRACE(20, ("GOMP_parallel_sections exit: T#%d\n", gtid)); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + task_frame->enter_frame = ompt_data_none; + } +#endif } #define PARALLEL_LOOP(func, schedule, ompt_pre, ompt_post) \ @@ -1738,7 +1795,7 @@ void __GOMP_taskloop(void (*func)(void *), void *data, KMP_ASSERT(arg_align > 0); // The low-order bit is the "untied" flag if (!(gomp_flags & 1)) { - input_flags->tiedness = 1; + input_flags->tiedness = TASK_TIED; } // The second low-order bit is the "final" flag if (gomp_flags & 2) { @@ -2428,6 +2485,26 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER)( } } +// allocator construct +void *KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ALLOC)(size_t alignment, size_t size, + uintptr_t allocator) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_alloc: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return __kmp_alloc(gtid, alignment, size, (omp_allocator_handle_t)allocator); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_FREE)(void *ptr, uintptr_t allocator) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_free: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return ___kmpc_free(gtid, ptr, (omp_allocator_handle_t)allocator); +} + /* The following sections of code create aliases for the GOMP_* functions, then create versioned symbols using the assembler directive .symver. This is only pertinent for ELF .so library. 
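/* Illustrative sketch, not from this commit: the depobj branch added above
 * treats each GOMP depend object as a record of two pointer-sized elements,
 * where element 0 is the dependence address and element 1 encodes the type
 * (KMP_GOMP_DEPOBJ_IN/OUT/INOUT/MTXINOUTSET). A standalone decoder for that
 * record layout, with hypothetical names, would be: */
#include <stdint.h>
#include <stdio.h>

enum { DEPOBJ_IN = 1, DEPOBJ_OUT = 2, DEPOBJ_INOUT = 3, DEPOBJ_MTXINOUTSET = 4 };

static const char *example_decode_depobj(const intptr_t depobj[2],
                                          void **addr_out) {
  *addr_out = (void *)depobj[0]; /* depobj[0] = base address */
  switch (depobj[1]) {           /* depobj[1] = dependence type */
  case DEPOBJ_IN:          return "in";
  case DEPOBJ_OUT:         return "out";
  case DEPOBJ_INOUT:       return "inout";
  case DEPOBJ_MTXINOUTSET: return "mutexinoutset";
  default:                 return "unknown";
  }
}

int main(void) {
  int x = 0;
  intptr_t rec[2] = {(intptr_t)&x, DEPOBJ_INOUT};
  void *addr = NULL;
  const char *kind = example_decode_depobj(rec, &addr);
  printf("%p depends as %s\n", addr, kind);
  return 0;
}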
The KMP_VERSION_SYMBOL macro is defined in @@ -2616,6 +2693,10 @@ KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_START, 50, "GOMP_5.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS2_START, 50, "GOMP_5.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER, 50, "GOMP_5.0"); + +// GOMP_5.0.1 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ALLOC, 501, "GOMP_5.0.1"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_FREE, 501, "GOMP_5.0.1"); #endif // KMP_USE_VERSION_SYMBOLS #ifdef __cplusplus diff --git a/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc b/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc index 8e3e90caae..776cca2b66 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc +++ b/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc @@ -223,6 +223,7 @@ __kmp_i18n_default_messages[] = "%1$s value \"%2$u\" will be used.", "%1$s value \"%2$s\" will be used.", "%1$s value \"%2$s\" will be used.", + "Mixing other barrier patterns with dist is prohibited. Using dist for all barrier patterns.", "%1$s maximum value \"%2$d\" will be used.", "%1$s minimum value \"%2$d\" will be used.", "Memory allocation failed.", @@ -307,6 +308,8 @@ __kmp_i18n_default_messages[] = "Allocator %1$s is not available, will use default allocator.", "%1$s: %2$s (%3$d total cores)", "%1$s: granularity setting: %2$s does not exist in topology. Using granularity=%3$s instead.", + "%1$s: hybrid core type detected: %2$d %3$s cores.", + "%1$s: %2$d with core efficiency %3$d.", "%1$s must be bound to a work-sharing or work-queuing construct with an \"ordered\" clause", "Detected end of %1$s without first executing a corresponding beginning.", "Iteration range too large in %1$s.", @@ -402,6 +405,15 @@ __kmp_i18n_default_messages[] = "KMP_HW_SUBSET ignored: %1$s, %2$s: layers are equivalent, please only specify one.", "KMP_HW_SUBSET ignored: %1$s layer should come after %2$s.", "%1$s: topology layer \"%2$s\" is equivalent to \"%3$s\".", + "%1$s: granularity=%2$s is too coarse, setting granularity=group.", + "%1$s: \"%2$s\" value is deprecated. Please use \"%3$s\" instead.", + "num_teams value must be positive, it is %1$d, using %2$d instead.", + "KMP_HW_SUBSET ignored: %1$s, %2$s: attributes are ambiguous, please only specify one.", + "KMP_HW_SUBSET ignored: %1$s: attribute specified more than once.", + "KMP_HW_SUBSET ignored: %1$s: attribute value %2$s is invalid.", + "KMP_HW_SUBSET ignored: all hardware resources would be filtered, please reduce the filter.", + "KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecutre.", + "KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecutre.", NULL }; @@ -437,6 +449,7 @@ __kmp_i18n_default_hints[] = "System error #193 is \"Bad format of EXE or DLL file\". Usually it means the file is found, but it is corrupted or a file for another architecture. 
Check whether \"%1$s\" is a file for %2$s architecture.", "System-related limit on the number of threads.", "Try setting new bounds (preferably less than or equal to %1$d) for num_teams clause.", + "Valid values are from %1$d to %2$d.", NULL }; @@ -453,8 +466,8 @@ __kmp_i18n_sections[] = { 5, __kmp_i18n_default_meta }, { 79, __kmp_i18n_default_strings }, { 6, __kmp_i18n_default_formats }, - { 286, __kmp_i18n_default_messages }, - { 28, __kmp_i18n_default_hints }, + { 298, __kmp_i18n_default_messages }, + { 29, __kmp_i18n_default_hints }, { 0, NULL } }; diff --git a/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc b/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc index 7fec5e6223..a66f8117c2 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc +++ b/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc @@ -217,6 +217,7 @@ enum kmp_i18n_id { kmp_i18n_msg_Using_uint_Value, kmp_i18n_msg_Using_uint64_Value, kmp_i18n_msg_Using_str_Value, + kmp_i18n_msg_BarrierPatternOverride, kmp_i18n_msg_MaxValueUsing, kmp_i18n_msg_MinValueUsing, kmp_i18n_msg_MemoryAllocFailed, @@ -301,6 +302,8 @@ enum kmp_i18n_id { kmp_i18n_msg_OmpNoAllocator, kmp_i18n_msg_TopologyGeneric, kmp_i18n_msg_AffGranularityBad, + kmp_i18n_msg_TopologyHybrid, + kmp_i18n_msg_TopologyHybridCoreEff, kmp_i18n_msg_CnsBoundToWorksharing, kmp_i18n_msg_CnsDetectedEnd, kmp_i18n_msg_CnsIterationRangeTooLarge, @@ -396,6 +399,15 @@ enum kmp_i18n_id { kmp_i18n_msg_AffHWSubsetEqvLayers, kmp_i18n_msg_AffHWSubsetOutOfOrder, kmp_i18n_msg_AffEqualTopologyTypes, + kmp_i18n_msg_AffGranTooCoarseProcGroup, + kmp_i18n_msg_StgDeprecatedValue, + kmp_i18n_msg_NumTeamsNotPositive, + kmp_i18n_msg_AffHWSubsetIncompat, + kmp_i18n_msg_AffHWSubsetAttrRepeat, + kmp_i18n_msg_AffHWSubsetAttrInvalid, + kmp_i18n_msg_AffHWSubsetAllFiltered, + kmp_i18n_msg_AffHWSubsetAttrsNonHybrid, + kmp_i18n_msg_AffHWSubsetIgnoringAttr, kmp_i18n_msg_last, // Set #5, hints. 
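/* Usage sketch, not from this commit: the new "dist" entry in
 * __kmp_barrier_pattern_name and the BarrierPatternOverride message above
 * belong to the distributed barrier added in this update. Assuming the usual
 * libomp barrier controls (KMP_PLAIN_BARRIER_PATTERN and
 * KMP_FORKJOIN_BARRIER_PATTERN, "gather,release" format; names taken from the
 * existing runtime, so treat them as an assumption here), a program could opt
 * in before the runtime initializes: */
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  /* Must be set before the first OpenMP construct; per the new message,
   * mixing other patterns with dist makes the runtime use dist everywhere. */
  setenv("KMP_FORKJOIN_BARRIER_PATTERN", "dist,dist", 1);
  setenv("KMP_PLAIN_BARRIER_PATTERN", "dist,dist", 1);
#pragma omp parallel
  {
#pragma omp single
    printf("running with %d threads\n", omp_get_num_threads());
  } /* the implicit barrier here uses the selected pattern */
  return 0;
}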
@@ -428,6 +440,7 @@ enum kmp_i18n_id { kmp_i18n_hnt_BadExeFormat, kmp_i18n_hnt_SystemLimitOnThreads, kmp_i18n_hnt_SetNewBound, + kmp_i18n_hnt_ValidValuesRange, kmp_i18n_hnt_last, kmp_i18n_xxx_lastest diff --git a/contrib/libs/cxxsupp/openmp/kmp_itt.cpp b/contrib/libs/cxxsupp/openmp/kmp_itt.cpp index a76c639625..f99b264da6 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_itt.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_itt.cpp @@ -24,12 +24,9 @@ #error #include "ittnotify_config.h" __itt_global __kmp_ittapi_clean_global; extern __itt_global __kmp_itt__ittapi_global; -kmp_int32 __kmp_barrier_domain_count; -kmp_int32 __kmp_region_domain_count; -__itt_domain *__kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS]; -__itt_domain *__kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS]; -__itt_domain *__kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS]; -kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS]; + +kmp_itthash_t __kmp_itt_barrier_domains = {{0}, 0}; +kmp_itthash_t __kmp_itt_region_domains = {{0}, 0}; __itt_domain *metadata_domain = NULL; __itt_string_handle *string_handle_imbl = NULL; __itt_string_handle *string_handle_loop = NULL; diff --git a/contrib/libs/cxxsupp/openmp/kmp_itt.h b/contrib/libs/cxxsupp/openmp/kmp_itt.h index 75a24540d4..c640e83b71 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_itt.h +++ b/contrib/libs/cxxsupp/openmp/kmp_itt.h @@ -278,15 +278,21 @@ __kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller); } /* if */ \ } while (0) -const int KMP_MAX_FRAME_DOMAINS = - 512; // Maximum number of frame domains to use (maps to +// Maximum number of frame domains to use (maps to // different OpenMP regions in the user source code). -extern kmp_int32 __kmp_barrier_domain_count; -extern kmp_int32 __kmp_region_domain_count; -extern __itt_domain *__kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS]; -extern __itt_domain *__kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS]; -extern __itt_domain *__kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS]; -extern kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS]; +const int KMP_MAX_FRAME_DOMAINS = 997; +typedef struct kmp_itthash_entry { + ident_t *loc; + int team_size; + __itt_domain *d; + struct kmp_itthash_entry *next_in_bucket; +} kmp_itthash_entry_t; +typedef struct kmp_itthash { + kmp_itthash_entry_t *buckets[KMP_MAX_FRAME_DOMAINS]; + int count; // just a heuristic to limit number of entries +} kmp_itthash_t; +extern kmp_itthash_t __kmp_itt_region_domains; +extern kmp_itthash_t __kmp_itt_barrier_domains; extern __itt_domain *metadata_domain; extern __itt_string_handle *string_handle_imbl; extern __itt_string_handle *string_handle_loop; diff --git a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp index 59726f2b9f..fff7305b57 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp @@ -96,12 +96,19 @@ __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) { } kmp_uint32 spins; + kmp_uint64 time; KMP_FSYNC_PREPARE(lck); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); kmp_backoff_t backoff = __kmp_spin_backoff_params; do { +#if !KMP_HAVE_UMWAIT __kmp_spin_backoff(&backoff); - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); +#else + if (!__kmp_tpause_enabled) + __kmp_spin_backoff(&backoff); +#endif + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free || !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)); KMP_FSYNC_ACQUIRED(lck); @@ -1344,14 +1351,15 @@ static int 
__kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck, } int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { - kmp_info_t *this_thr; volatile kmp_int32 *head_id_p = &lck->lk.head_id; volatile kmp_int32 *tail_id_p = &lck->lk.tail_id; KA_TRACE(1000, ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid)); KMP_DEBUG_ASSERT(gtid >= 0); - this_thr = __kmp_thread_from_gtid(gtid); +#if KMP_DEBUG || DEBUG_QUEUING_LOCKS + kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid); +#endif KMP_DEBUG_ASSERT(this_thr != NULL); #ifdef DEBUG_QUEUING_LOCKS TRACE_LOCK(gtid + 1, "rel ent"); @@ -2226,10 +2234,12 @@ __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) { // The current implementation of KMP_WAIT doesn't allow for mask // and poll to be re-read every spin iteration. kmp_uint32 spins; + kmp_uint64 time; KMP_FSYNC_PREPARE(lck); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); while (polls[ticket & mask] < ticket) { // atomic load - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); // Re-read the mask and the poll pointer from the lock structure. // // Make certain that "mask" is read before "polls" !!! @@ -2658,9 +2668,17 @@ void __kmp_spin_backoff(kmp_backoff_t *boff) { kmp_uint32 i; for (i = boff->step; i > 0; i--) { kmp_uint64 goal = __kmp_tsc() + boff->min_tick; - do { - KMP_CPU_PAUSE(); - } while (before(__kmp_tsc(), goal)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_tpause(0, boff->min_tick); + } else { +#endif + do { + KMP_CPU_PAUSE(); + } while (before(__kmp_tsc(), goal)); +#if KMP_HAVE_UMWAIT + } +#endif } boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1); } @@ -3103,7 +3121,7 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock, kmp_int32 gtid, kmp_indirect_locktag_t tag) { kmp_indirect_lock_t *lck; - kmp_lock_index_t idx; + kmp_lock_index_t idx, table_idx; __kmp_acquire_lock(&__kmp_global_lock, gtid); @@ -3116,26 +3134,41 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock, KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n", lck)); } else { - idx = __kmp_i_lock_table.next; - // Check capacity and double the size if it is full - if (idx == __kmp_i_lock_table.size) { - // Double up the space for block pointers - int row = __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; - kmp_indirect_lock_t **new_table = (kmp_indirect_lock_t **)__kmp_allocate( - 2 * row * sizeof(kmp_indirect_lock_t *)); - KMP_MEMCPY(new_table, __kmp_i_lock_table.table, - row * sizeof(kmp_indirect_lock_t *)); - kmp_indirect_lock_t **old_table = __kmp_i_lock_table.table; - __kmp_i_lock_table.table = new_table; - __kmp_free(old_table); - // Allocate new objects in the new blocks - for (int i = row; i < 2 * row; ++i) - *(__kmp_i_lock_table.table + i) = (kmp_indirect_lock_t *)__kmp_allocate( - KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t)); - __kmp_i_lock_table.size = 2 * idx; + kmp_uint32 row, col; + kmp_indirect_lock_table_t *lock_table = &__kmp_i_lock_table; + idx = 0; + // Find location in list of lock tables to put new lock + while (1) { + table_idx = lock_table->next; // index within this table + idx += lock_table->next; // global index within list of tables + if (table_idx < lock_table->nrow_ptrs * KMP_I_LOCK_CHUNK) { + row = table_idx / KMP_I_LOCK_CHUNK; + col = table_idx % KMP_I_LOCK_CHUNK; + // Allocate a new row of locks if necessary + if (!lock_table->table[row]) { + lock_table->table[row] = (kmp_indirect_lock_t *)__kmp_allocate( + 
sizeof(kmp_indirect_lock_t) * KMP_I_LOCK_CHUNK); + } + break; + } + // Allocate a new lock table if necessary with double the capacity + if (!lock_table->next_table) { + kmp_indirect_lock_table_t *next_table = + (kmp_indirect_lock_table_t *)__kmp_allocate( + sizeof(kmp_indirect_lock_table_t)); + next_table->table = (kmp_indirect_lock_t **)__kmp_allocate( + sizeof(kmp_indirect_lock_t *) * 2 * lock_table->nrow_ptrs); + next_table->nrow_ptrs = 2 * lock_table->nrow_ptrs; + next_table->next = 0; + next_table->next_table = nullptr; + lock_table->next_table = next_table; + } + lock_table = lock_table->next_table; + KMP_ASSERT(lock_table); } - __kmp_i_lock_table.next++; - lck = KMP_GET_I_LOCK(idx); + lock_table->next++; + + lck = &lock_table->table[row][col]; // Allocate a new base lock object lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]); KA_TRACE(20, @@ -3166,10 +3199,7 @@ __kmp_lookup_indirect_lock(void **user_lock, const char *func) { } if (OMP_LOCK_T_SIZE < sizeof(void *)) { kmp_lock_index_t idx = KMP_EXTRACT_I_INDEX(user_lock); - if (idx >= __kmp_i_lock_table.size) { - KMP_FATAL(LockIsUninitialized, func); - } - lck = KMP_GET_I_LOCK(idx); + lck = __kmp_get_i_lock(idx); } else { lck = *((kmp_indirect_lock_t **)user_lock); } @@ -3179,7 +3209,7 @@ __kmp_lookup_indirect_lock(void **user_lock, const char *func) { return lck; } else { if (OMP_LOCK_T_SIZE < sizeof(void *)) { - return KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(user_lock)); + return __kmp_get_i_lock(KMP_EXTRACT_I_INDEX(user_lock)); } else { return *((kmp_indirect_lock_t **)user_lock); } @@ -3189,13 +3219,13 @@ __kmp_lookup_indirect_lock(void **user_lock, const char *func) { static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock, kmp_dyna_lockseq_t seq) { #if KMP_USE_ADAPTIVE_LOCKS - if (seq == lockseq_adaptive && !__kmp_cpuinfo.rtm) { + if (seq == lockseq_adaptive && !__kmp_cpuinfo.flags.rtm) { KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive"); seq = lockseq_queuing; } #endif #if KMP_USE_TSX - if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.rtm) { + if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.flags.rtm) { seq = lockseq_queuing; } #endif @@ -3322,12 +3352,13 @@ void __kmp_init_dynamic_user_locks() { return; // Initialize lock index table - __kmp_i_lock_table.size = KMP_I_LOCK_CHUNK; - __kmp_i_lock_table.table = - (kmp_indirect_lock_t **)__kmp_allocate(sizeof(kmp_indirect_lock_t *)); + __kmp_i_lock_table.nrow_ptrs = KMP_I_LOCK_TABLE_INIT_NROW_PTRS; + __kmp_i_lock_table.table = (kmp_indirect_lock_t **)__kmp_allocate( + sizeof(kmp_indirect_lock_t *) * KMP_I_LOCK_TABLE_INIT_NROW_PTRS); *(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate( KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t)); __kmp_i_lock_table.next = 0; + __kmp_i_lock_table.next_table = nullptr; // Indirect lock size __kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t); @@ -3392,7 +3423,6 @@ void __kmp_init_dynamic_user_locks() { // Clean up the lock table. void __kmp_cleanup_indirect_user_locks() { - kmp_lock_index_t i; int k; // Clean up locks in the pools first (they were already destroyed before going @@ -3410,22 +3440,29 @@ void __kmp_cleanup_indirect_user_locks() { __kmp_indirect_lock_pool[k] = NULL; } // Clean up the remaining undestroyed locks. - for (i = 0; i < __kmp_i_lock_table.next; i++) { - kmp_indirect_lock_t *l = KMP_GET_I_LOCK(i); - if (l->lock != NULL) { - // Locks not destroyed explicitly need to be destroyed here. 
- KMP_I_LOCK_FUNC(l, destroy)(l->lock); - KA_TRACE( - 20, - ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p from table\n", - l)); - __kmp_free(l->lock); + kmp_indirect_lock_table_t *ptr = &__kmp_i_lock_table; + while (ptr) { + for (kmp_uint32 row = 0; row < ptr->nrow_ptrs; ++row) { + if (!ptr->table[row]) + continue; + for (kmp_uint32 col = 0; col < KMP_I_LOCK_CHUNK; ++col) { + kmp_indirect_lock_t *l = &ptr->table[row][col]; + if (l->lock) { + // Locks not destroyed explicitly need to be destroyed here. + KMP_I_LOCK_FUNC(l, destroy)(l->lock); + KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p " + "from table\n", + l)); + __kmp_free(l->lock); + } + } + __kmp_free(ptr->table[row]); } + kmp_indirect_lock_table_t *next_table = ptr->next_table; + if (ptr != &__kmp_i_lock_table) + __kmp_free(ptr); + ptr = next_table; } - // Free the table - for (i = 0; i < __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; i++) - __kmp_free(__kmp_i_lock_table.table[i]); - __kmp_free(__kmp_i_lock_table.table); __kmp_init_user_locks = FALSE; } diff --git a/contrib/libs/cxxsupp/openmp/kmp_lock.h b/contrib/libs/cxxsupp/openmp/kmp_lock.h index 4f6ad6414e..a19f4ca323 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_lock.h +++ b/contrib/libs/cxxsupp/openmp/kmp_lock.h @@ -651,12 +651,15 @@ extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck, if (lck->tas.lk.poll != 0 || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \ kmp_uint32 spins; \ + kmp_uint64 time; \ KMP_FSYNC_PREPARE(lck); \ KMP_INIT_YIELD(spins); \ + KMP_INIT_BACKOFF(time); \ do { \ - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \ - } while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq( \ - &lck->tas.lk.poll, 0, gtid + 1)); \ + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \ + } while ( \ + lck->tas.lk.poll != 0 || \ + !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \ } \ KMP_FSYNC_ACQUIRED(lck); \ } else { \ @@ -758,10 +761,12 @@ extern int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck, if ((lck->tas.lk.poll != 0) || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \ kmp_uint32 spins; \ + kmp_uint64 time; \ KMP_FSYNC_PREPARE(lck); \ KMP_INIT_YIELD(spins); \ + KMP_INIT_BACKOFF(time); \ do { \ - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \ + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \ } while ( \ (lck->tas.lk.poll != 0) || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \ @@ -1217,22 +1222,41 @@ extern kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])( ? __kmp_indirect_get_flags[(lck)->type]((lck)->lock) \ : NULL) -#define KMP_I_LOCK_CHUNK \ - 1024 // number of kmp_indirect_lock_t objects to be allocated together +// number of kmp_indirect_lock_t objects to be allocated together +#define KMP_I_LOCK_CHUNK 1024 +// Keep at a power of 2 since it is used in multiplication & division +KMP_BUILD_ASSERT(KMP_I_LOCK_CHUNK % 2 == 0); +// number of row entries in the initial lock table +#define KMP_I_LOCK_TABLE_INIT_NROW_PTRS 8 // Lock table for indirect locks. 
typedef struct kmp_indirect_lock_table { kmp_indirect_lock_t **table; // blocks of indirect locks allocated - kmp_lock_index_t size; // size of the indirect lock table + kmp_uint32 nrow_ptrs; // number *table pointer entries in table kmp_lock_index_t next; // index to the next lock to be allocated + struct kmp_indirect_lock_table *next_table; } kmp_indirect_lock_table_t; extern kmp_indirect_lock_table_t __kmp_i_lock_table; // Returns the indirect lock associated with the given index. -#define KMP_GET_I_LOCK(index) \ - (*(__kmp_i_lock_table.table + (index) / KMP_I_LOCK_CHUNK) + \ - (index) % KMP_I_LOCK_CHUNK) +// Returns nullptr if no lock at given index +static inline kmp_indirect_lock_t *__kmp_get_i_lock(kmp_lock_index_t idx) { + kmp_indirect_lock_table_t *lock_table = &__kmp_i_lock_table; + while (lock_table) { + kmp_lock_index_t max_locks = lock_table->nrow_ptrs * KMP_I_LOCK_CHUNK; + if (idx < max_locks) { + kmp_lock_index_t row = idx / KMP_I_LOCK_CHUNK; + kmp_lock_index_t col = idx % KMP_I_LOCK_CHUNK; + if (!lock_table->table[row] || idx >= lock_table->next) + break; + return &lock_table->table[row][col]; + } + idx -= max_locks; + lock_table = lock_table->next_table; + } + return nullptr; +} // Number of locks in a lock block, which is fixed to "1" now. // TODO: No lock block implementation now. If we do support, we need to manage @@ -1241,8 +1265,9 @@ extern int __kmp_num_locks_in_block; // Fast lock table lookup without consistency checking #define KMP_LOOKUP_I_LOCK(l) \ - ((OMP_LOCK_T_SIZE < sizeof(void *)) ? KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(l)) \ - : *((kmp_indirect_lock_t **)(l))) + ((OMP_LOCK_T_SIZE < sizeof(void *)) \ + ? __kmp_get_i_lock(KMP_EXTRACT_I_INDEX(l)) \ + : *((kmp_indirect_lock_t **)(l))) // Used once in kmp_error.cpp extern kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p, kmp_uint32); diff --git a/contrib/libs/cxxsupp/openmp/kmp_os.h b/contrib/libs/cxxsupp/openmp/kmp_os.h index 4437cf2518..d71e9aecb3 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_os.h +++ b/contrib/libs/cxxsupp/openmp/kmp_os.h @@ -1025,6 +1025,30 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v); #define KMP_MB() /* nothing to do */ #endif +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if KMP_COMPILER_ICC +#define KMP_MFENCE_() _mm_mfence() +#define KMP_SFENCE_() _mm_sfence() +#elif KMP_COMPILER_MSVC +#define KMP_MFENCE_() MemoryBarrier() +#define KMP_SFENCE_() MemoryBarrier() +#else +#define KMP_MFENCE_() __sync_synchronize() +#define KMP_SFENCE_() __sync_synchronize() +#endif +#define KMP_MFENCE() \ + if (UNLIKELY(!__kmp_cpuinfo.initialized)) { \ + __kmp_query_cpuid(&__kmp_cpuinfo); \ + } \ + if (__kmp_cpuinfo.flags.sse2) { \ + KMP_MFENCE_(); \ + } +#define KMP_SFENCE() KMP_SFENCE_() +#else +#define KMP_MFENCE() KMP_MB() +#define KMP_SFENCE() KMP_MB() +#endif + #ifndef KMP_IMB #define KMP_IMB() /* nothing to do */ #endif diff --git a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp index fe931bb157..34f8a01743 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp @@ -107,6 +107,10 @@ static int __kmp_unregister_root_other_thread(int gtid); static void __kmp_reap_thread(kmp_info_t *thread, int is_root); kmp_info_t *__kmp_thread_pool_insert_pt = NULL; +void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, + int new_nthreads); +void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); + /* Calculate the identifier of the current thread */ /* fast (and somewhat 
portable) way to get unique identifier of executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ @@ -910,7 +914,8 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, assured that there are enough threads available, because we checked on that earlier within critical section forkjoin */ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, - kmp_info_t *master_th, int master_gtid) { + kmp_info_t *master_th, int master_gtid, + int fork_teams_workers) { int i; int use_hot_team; @@ -999,7 +1004,12 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, } #if KMP_AFFINITY_SUPPORTED - __kmp_partition_places(team); + // Do not partition the places list for teams construct workers who + // haven't actually been forked to do real work yet. This partitioning + // will take place in the parallel region nested within the teams construct. + if (!fork_teams_workers) { + __kmp_partition_places(team); + } #endif } @@ -1204,7 +1214,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { this_thr->th.th_team = serial_team; serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; - KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, + KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, this_thr->th.th_current_task)); KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); this_thr->th.th_current_task->td_flags.executing = 0; @@ -1563,15 +1573,24 @@ int __kmp_fork_call(ident_t *loc, int gtid, /* Change number of threads in the team if requested */ if (master_set_numthreads) { // The parallel has num_threads clause - if (master_set_numthreads < master_th->th.th_teams_size.nth) { + if (master_set_numthreads <= master_th->th.th_teams_size.nth) { // AC: only can reduce number of threads dynamically, can't increase kmp_info_t **other_threads = parent_team->t.t_threads; + // NOTE: if using distributed barrier, we need to run this code block + // even when the team size appears not to have changed from the max. + int old_proc = master_th->th.th_teams_size.nth; + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == + bp_dist_bar) { + __kmp_resize_dist_barrier(parent_team, old_proc, + master_set_numthreads); + __kmp_add_threads_to_team(parent_team, master_set_numthreads); + } parent_team->t.t_nproc = master_set_numthreads; for (i = 0; i < master_set_numthreads; ++i) { other_threads[i]->th.th_team_nproc = master_set_numthreads; } - // Keep extra threads hot in the team for possible next parallels } + // Keep extra threads hot in the team for possible next parallels master_th->th.th_set_nproc = 0; } @@ -1584,6 +1603,41 @@ int __kmp_fork_call(ident_t *loc, int gtid, } #endif + // Figure out the proc_bind policy for the nested parallel within teams + kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; + // proc_bind_default means don't update + kmp_proc_bind_t proc_bind_icv = proc_bind_default; + if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { + proc_bind = proc_bind_false; + } else { + // No proc_bind clause specified; use current proc-bind-var + if (proc_bind == proc_bind_default) { + proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; + } + /* else: The proc_bind policy was specified explicitly on parallel + clause. + This overrides proc-bind-var for this parallel region, but does not + change proc-bind-var. */ + // Figure the value of proc-bind-var for the child threads. 
+ if ((level + 1 < __kmp_nested_proc_bind.used) && + (__kmp_nested_proc_bind.bind_types[level + 1] != + master_th->th.th_current_task->td_icvs.proc_bind)) { + proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; + } + } + KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); + // Need to change the bind-var ICV to correct value for each implicit task + if (proc_bind_icv != proc_bind_default && + master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { + kmp_info_t **other_threads = parent_team->t.t_threads; + for (i = 0; i < master_th->th.th_team_nproc; ++i) { + other_threads[i]->th.th_current_task->td_icvs.proc_bind = + proc_bind_icv; + } + } + // Reset for next parallel region + master_th->th.th_set_proc_bind = proc_bind_default; + #if USE_ITT_BUILD && USE_ITT_NOTIFY if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || KMP_ITT_DEBUG) && @@ -1600,6 +1654,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); } #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ +#if KMP_AFFINITY_SUPPORTED + __kmp_partition_places(parent_team); +#endif KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " "master_th=%p, gtid=%d\n", @@ -1635,6 +1692,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, } #endif + // Need this to happen before we determine the number of threads, not while + // we are allocating the team + //__kmp_push_current_task_to_thread(master_th, parent_team, 0); int enter_teams = 0; if (parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels) { @@ -1642,13 +1702,10 @@ int __kmp_fork_call(ident_t *loc, int gtid, } else { enter_teams = ((ap == NULL && active_level == 0) || (ap && teams_level > 0 && teams_level == level)); - nthreads = - master_set_numthreads - ? master_set_numthreads - : get__nproc_2( - parent_team, - master_tid); // TODO: get nproc directly from current task - + nthreads = master_set_numthreads + ? master_set_numthreads + // TODO: get nproc directly from current task + : get__nproc_2(parent_team, master_tid); // Check if we need to take forkjoin lock? (no need for serialized // parallel out of teams construct). This code moved here from // __kmp_reserve_threads() to speedup nested serialized parallels. @@ -1940,16 +1997,21 @@ int __kmp_fork_call(ident_t *loc, int gtid, // Figure out the proc_bind_policy for the new team. kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; - kmp_proc_bind_t proc_bind_icv = - proc_bind_default; // proc_bind_default means don't update + // proc_bind_default means don't update + kmp_proc_bind_t proc_bind_icv = proc_bind_default; if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { proc_bind = proc_bind_false; } else { + // No proc_bind clause specified; use current proc-bind-var for this + // parallel region if (proc_bind == proc_bind_default) { - // No proc_bind clause specified; use current proc-bind-var for this - // parallel region proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; } + // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND + if (master_th->th.th_teams_microtask && + microtask == (microtask_t)__kmp_teams_master) { + proc_bind = __kmp_teams_proc_bind; + } /* else: The proc_bind policy was specified explicitly on parallel clause. This overrides proc-bind-var for this parallel region, but does not change proc-bind-var. 
*/ @@ -1957,7 +2019,11 @@ int __kmp_fork_call(ident_t *loc, int gtid, if ((level + 1 < __kmp_nested_proc_bind.used) && (__kmp_nested_proc_bind.bind_types[level + 1] != master_th->th.th_current_task->td_icvs.proc_bind)) { - proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; + // Do not modify the proc bind icv for the two teams construct forks + // They just let the proc bind icv pass through + if (!master_th->th.th_teams_microtask || + !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) + proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; } } @@ -1983,6 +2049,8 @@ int __kmp_fork_call(ident_t *loc, int gtid, #endif proc_bind, &new_icvs, argc USE_NESTED_HOT_ARG(master_th)); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) + copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); } else { /* allocate a new parallel team */ KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); @@ -1993,6 +2061,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, proc_bind, &master_th->th.th_current_task->td_icvs, argc USE_NESTED_HOT_ARG(master_th)); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) + copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, + &master_th->th.th_current_task->td_icvs); } KF_TRACE( 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); @@ -2124,7 +2195,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong root->r.r_active = TRUE; - __kmp_fork_team_threads(root, team, master_th, gtid); + __kmp_fork_team_threads(root, team, master_th, gtid, !ap); __kmp_setup_icv_copy(team, nthreads, &master_th->th.th_current_task->td_icvs, loc); @@ -2359,6 +2430,12 @@ void __kmp_join_call(ident_t *loc, int gtid parent_team->t.t_stack_id = NULL; } #endif + + if (team->t.t_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + team->t.b->update_num_threads(team->t.t_nproc); + __kmp_add_threads_to_team(team, team->t.t_nproc); + } } KMP_MB(); @@ -2387,6 +2464,14 @@ void __kmp_join_call(ident_t *loc, int gtid } // active_level == 1 #endif /* USE_ITT_BUILD */ +#if KMP_AFFINITY_SUPPORTED + if (!exit_teams) { + // Restore master thread's partition. + master_th->th.th_first_place = team->t.t_first_place; + master_th->th.th_last_place = team->t.t_last_place; + } +#endif // KMP_AFFINITY_SUPPORTED + if (master_th->th.th_teams_microtask && !exit_teams && team->t.t_pkfn != (microtask_t)__kmp_teams_master && team->t.t_level == master_th->th.th_teams_level + 1) { @@ -2494,11 +2579,6 @@ void __kmp_join_call(ident_t *loc, int gtid master_th, team)); __kmp_pop_current_task_from_thread(master_th); -#if KMP_AFFINITY_SUPPORTED - // Restore master thread's partition. - master_th->th.th_first_place = team->t.t_first_place; - master_th->th.th_last_place = team->t.t_last_place; -#endif // KMP_AFFINITY_SUPPORTED master_th->th.th_def_allocator = team->t.t_def_allocator; #if OMPD_SUPPORT @@ -2646,6 +2726,9 @@ void __kmp_set_num_threads(int new_nth, int gtid) { __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); + } // Release the extra threads we don't need any more. 
for (f = new_nth; f < hot_team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); @@ -2665,6 +2748,11 @@ void __kmp_set_num_threads(int new_nth, int gtid) { } #endif + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + hot_team->t.b->update_num_threads(new_nth); + __kmp_add_threads_to_team(hot_team, new_nth); + } + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); // Update the t_nproc field in the threads that are still active. @@ -4018,7 +4106,8 @@ void __kmp_unregister_root_current_thread(int gtid) { kmp_task_team_t *task_team = thread->th.th_task_team; // we need to wait for the proxy tasks before finishing the thread - if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { + if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered)) { #if OMPT_SUPPORT // the runtime is shutting down so we won't report any events thread->th.ompt_thread_info.state = ompt_state_undefined; @@ -4112,7 +4201,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, this_thr->th.th_team_nproc = team->t.t_nproc; this_thr->th.th_team_master = master; this_thr->th.th_team_serialized = team->t.t_serialized; - TCW_PTR(this_thr->th.th_sleep_loc, NULL); KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); @@ -4281,6 +4369,12 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, new_thr->th.th_task_state_top = 0; new_thr->th.th_task_state_stack_sz = 4; + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Make sure pool thread has transitioned to waiting on own thread struct + KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); + // Thread activated in __kmp_allocate_team when increasing team size + } + #ifdef KMP_ADJUST_BLOCKTIME /* Adjust blocktime back to zero if necessary */ /* Middle initialization might not have occurred yet */ @@ -4448,6 +4542,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, balign[b].bb.use_oncore_barrier = 0; } + TCW_PTR(new_thr->th.th_sleep_loc, NULL); + new_thr->th.th_sleep_loc_type = flag_unset; + new_thr->th.th_spin_here = FALSE; new_thr->th.th_next_waiting = 0; #if KMP_OS_UNIX @@ -4976,6 +5073,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, kmp_team_t *team; int use_hot_team = !root->r.r_active; int level = 0; + int do_place_partition = 1; KA_TRACE(20, ("__kmp_allocate_team: called\n")); KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); @@ -4997,6 +5095,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, ++level; // not increment if #teams==1, or for outer fork of the teams; // increment otherwise } + // Do not perform the place partition if inner fork of the teams + // Wait until nested parallel region encountered inside teams construct + if ((master->th.th_teams_size.nteams == 1 && + master->th.th_teams_level >= team->t.t_level) || + (team->t.t_pkfn == (microtask_t)__kmp_teams_master)) + do_place_partition = 0; } hot_teams = master->th.th_hot_teams; if (level < __kmp_hot_teams_max_level && hot_teams && @@ -5027,6 +5131,17 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } #endif + if (team->t.t_nproc != new_nproc && + __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Distributed barrier may need a resize + int old_nthr = team->t.t_nproc; + __kmp_resize_dist_barrier(team, old_nthr, new_nproc); + } + + // If not doing the place partition, then reset the team's proc bind + // to indicate that 
partitioning of all threads still needs to take place + if (do_place_partition == 0) + team->t.t_proc_bind = proc_bind_default; // Has the number of threads changed? /* Let's assume the most common case is that the number of threads is unchanged, and put that case first. */ @@ -5056,16 +5171,20 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, if ((team->t.t_size_changed == 0) && (team->t.t_proc_bind == new_proc_bind)) { if (new_proc_bind == proc_bind_spread) { - __kmp_partition_places( - team, 1); // add flag to update only master for spread + if (do_place_partition) { + // add flag to update only master for spread + __kmp_partition_places(team, 1); + } } KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " "proc_bind = %d, partition = [%d,%d]\n", team->t.t_id, new_proc_bind, team->t.t_first_place, team->t.t_last_place)); } else { - KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); - __kmp_partition_places(team); + if (do_place_partition) { + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); + __kmp_partition_places(team); + } } #else KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); @@ -5076,6 +5195,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, new_nproc)); team->t.t_size_changed = 1; + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Barrier size already reduced earlier in this function + // Activate team threads via th_used_in_team + __kmp_add_threads_to_team(team, new_nproc); + } #if KMP_NESTED_HOT_TEAMS if (__kmp_hot_teams_mode == 0) { // AC: saved number of threads should correspond to team's value in this @@ -5137,10 +5261,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } #endif - KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); + if (do_place_partition) { + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #if KMP_AFFINITY_SUPPORTED - __kmp_partition_places(team); + __kmp_partition_places(team); #endif + } } else { // team->t.t_nproc < new_nproc #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *old_mask; @@ -5152,7 +5278,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, KA_TRACE(20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc)); - + int old_nproc = team->t.t_nproc; // save old value and use to update only team->t.t_size_changed = 1; #if KMP_NESTED_HOT_TEAMS @@ -5179,10 +5305,9 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); team->t.t_nproc = new_nproc; // just get reserved threads involved } else { - // we may have some threads in reserve, but not enough - team->t.t_nproc = - hot_teams[level] - .hot_team_nth; // get reserved threads involved if any + // We may have some threads in reserve, but not enough; + // get reserved threads involved if any. + team->t.t_nproc = hot_teams[level].hot_team_nth; hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size #endif // KMP_NESTED_HOT_TEAMS if (team->t.t_max_nproc < new_nproc) { @@ -5237,8 +5362,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #if KMP_NESTED_HOT_TEAMS } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth #endif // KMP_NESTED_HOT_TEAMS + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Barrier size already increased earlier in this function + // Activate team threads via th_used_in_team + __kmp_add_threads_to_team(team, new_nproc); + } /* make sure everyone is syncronized */ - int old_nproc = team->t.t_nproc; // save old value and use to update only // new threads below __kmp_initialize_team(team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident); @@ -5273,10 +5402,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } #endif - KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); + if (do_place_partition) { + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #if KMP_AFFINITY_SUPPORTED - __kmp_partition_places(team); + __kmp_partition_places(team); #endif + } } // Check changes in number of threads kmp_info_t *master = team->t.t_threads[0]; @@ -5342,6 +5473,13 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, /* take this team from the team pool */ __kmp_team_pool = team->t.t_next_pool; + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (!team->t.b) { // Allocate barrier structure + team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); + } + } + /* setup the team for fresh use */ __kmp_initialize_team(team, new_nproc, new_icvs, NULL); @@ -5397,6 +5535,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, /* and set it up */ team->t.t_max_nproc = max_nproc; + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Allocate barrier structure + team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); + } + /* NOTE well, for some reason allocating one big buffer and dividing it up seems to really hurt performance a lot on the P4, so, let's not use this */ __kmp_allocate_team_arrays(team, max_nproc); @@ -5469,7 +5613,6 @@ void __kmp_free_team(kmp_root_t *root, int use_hot_team = team == root->r.r_hot_team; #if KMP_NESTED_HOT_TEAMS int level; - kmp_hot_team_ptr_t *hot_teams; if (master) { level = team->t.t_active_level - 1; if (master->th.th_teams_microtask) { // in teams construct? 
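The __kmp_allocate_team hunks above tie the lifetime of the distributed barrier structure to the team object: when a team with more than one slot is taken from the pool or freshly allocated and the fork/join barrier pattern is bp_dist_bar, a distributedBarrier is allocated into team->t.b, resized as the team's thread count changes, and (as the hunks further below show) deallocated again when the team is freed. A minimal sketch of that allocate-on-demand lifecycle, using invented stand-in types (Team, Barrier, use_dist_bar) rather than the runtime's real ones:

#include <cstdio>

// Invented stand-ins for distributedBarrier / kmp_team_t; illustration only.
struct Barrier {
  int nthreads;
  explicit Barrier(int n) : nthreads(n) {}
  static Barrier *allocate(int max_threads) { return new Barrier(max_threads); }
  static void deallocate(Barrier *b) { delete b; }
  void update_num_threads(int n) { nthreads = n; }
};

struct Team {
  int max_nproc = 0;
  Barrier *b = nullptr; // plays the role of team->t.b
};

// Plays the role of checking __kmp_barrier_*_pattern[bs_forkjoin_barrier] == bp_dist_bar.
static bool use_dist_bar = true;

void acquire_team(Team &t, int new_nproc) {
  if (t.max_nproc > 1 && use_dist_bar && t.b == nullptr)
    t.b = Barrier::allocate(t.max_nproc); // the patch sizes it by __kmp_dflt_team_nth_ub
  if (t.b)
    t.b->update_num_threads(new_nproc);   // keep barrier size in sync with the team
}

void free_team(Team &t) {
  if (t.max_nproc > 1 && use_dist_bar && t.b) {
    Barrier::deallocate(t.b);
    t.b = nullptr;
  }
}

int main() {
  Team t;
  t.max_nproc = 8;
  acquire_team(t, 4);
  std::printf("barrier threads: %d\n", t.b ? t.b->nthreads : 0);
  free_team(t);
  return 0;
}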
@@ -5483,7 +5626,9 @@ void __kmp_free_team(kmp_root_t *root, // team_of_workers before the parallel } // team->t.t_level will be increased inside parallel } - hot_teams = master->th.th_hot_teams; +#if KMP_DEBUG + kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; +#endif if (level < __kmp_hot_teams_max_level) { KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); use_hot_team = 1; @@ -5553,10 +5698,43 @@ void __kmp_free_team(kmp_root_t *root, /* free the worker threads */ for (f = 1; f < team->t.t_nproc; ++f) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), + 1, 2); + } __kmp_free_thread(team->t.t_threads[f]); + } + + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (team->t.b) { + // wake up thread at old location + team->t.b->go_release(); + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + for (f = 1; f < team->t.t_nproc; ++f) { + if (team->t.b->sleep[f].sleep) { + __kmp_atomic_resume_64( + team->t.t_threads[f]->th.th_info.ds.ds_gtid, + (kmp_atomic_flag_64<> *)NULL); + } + } + } + // Wait for threads to be removed from team + for (int f = 1; f < team->t.t_nproc; ++f) { + while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) + KMP_CPU_PAUSE(); + } + } + } + + for (f = 1; f < team->t.t_nproc; ++f) { team->t.t_threads[f] = NULL; } + if (team->t.t_max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + distributedBarrier::deallocate(team->t.b); + team->t.b = NULL; + } /* put the team back in the team pool */ /* TODO limit size of team pool, call reap_team if pool too large */ team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); @@ -5955,11 +6133,18 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid)); - /* Need release fence here to prevent seg faults for tree forkjoin barrier - * (GEH) */ - kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, - thread); - __kmp_release_64(&flag); + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + while ( + !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) + KMP_CPU_PAUSE(); + __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL); + } else { + /* Need release fence here to prevent seg faults for tree forkjoin + barrier (GEH) */ + kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, + thread); + __kmp_release_64(&flag); + } } // Terminate OS thread. 
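Several of the hunks above coordinate through the per-thread th_used_in_team field rather than through the barrier itself. Reading the added code and comments, the values act as a small state machine: 0 = not part of the team, 1 = in the team, 2 = the primary asked the worker to leave (the worker answers by storing 0), 3 = the primary asked the worker to join (the worker answers by storing 1); a sleeping worker additionally gets a resume so it can notice the request. A toy version of that handshake, using std::atomic and invented names rather than the runtime's types, might look like:

#include <atomic>
#include <cstdio>
#include <thread>

// Stand-in for th_used_in_team; the numeric protocol mirrors the hunks above.
std::atomic<int> used_in_team{1}; // worker currently belongs to the team

void worker() {
  for (;;) {
    int s = used_in_team.load(std::memory_order_acquire);
    if (s == 2) {                 // primary asked us to leave: acknowledge with 0
      used_in_team.store(0, std::memory_order_release);
      return;
    }
    if (s == 3)                   // primary asked us to (re)join: acknowledge with 1
      used_in_team.store(1, std::memory_order_release);
    std::this_thread::yield();
  }
}

int main() {
  std::thread w(worker);
  // Primary side, as in __kmp_free_team / __kmp_resize_dist_barrier above:
  // flip 1 -> 2, wake the worker if it sleeps, then wait for it to report 0.
  int expected = 1;
  used_in_team.compare_exchange_strong(expected, 2);
  while (used_in_team.load(std::memory_order_acquire) != 0)
    std::this_thread::yield();
  std::printf("worker left the team, state=%d\n", used_in_team.load());
  w.join();
  return 0;
}

__kmp_add_threads_to_team runs the same handshake in the other direction: the primary flips 0 to 3 and waits until each worker has answered with 1.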
@@ -6054,6 +6239,31 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { } // __kmp_reap_thread +static void __kmp_itthash_clean(kmp_info_t *th) { +#if USE_ITT_NOTIFY + if (__kmp_itt_region_domains.count > 0) { + for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { + kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i]; + while (bucket) { + kmp_itthash_entry_t *next = bucket->next_in_bucket; + __kmp_thread_free(th, bucket); + bucket = next; + } + } + } + if (__kmp_itt_barrier_domains.count > 0) { + for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { + kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i]; + while (bucket) { + kmp_itthash_entry_t *next = bucket->next_in_bucket; + __kmp_thread_free(th, bucket); + bucket = next; + } + } + } +#endif +} + static void __kmp_internal_end(void) { int i; @@ -6240,6 +6450,7 @@ void __kmp_internal_end_library(int gtid_req) { gtid)); return; } else { + __kmp_itthash_clean(__kmp_threads[gtid]); KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); @@ -6486,7 +6697,7 @@ void __kmp_register_library_startup(void) { char *value = NULL; // Actual value of the environment variable. -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) char *shm_name = __kmp_str_format("/%s", name); int shm_preexist = 0; char *data1; @@ -6591,7 +6802,7 @@ void __kmp_register_library_startup(void) { } break; case 2: { // Neighbor is dead. -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) // close shared memory. shm_unlink(shm_name); // this removes file in /dev/shm #else @@ -6605,7 +6816,7 @@ void __kmp_register_library_startup(void) { } } KMP_INTERNAL_FREE((void *)value); -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) KMP_INTERNAL_FREE((void *)shm_name); #endif } // while @@ -6618,7 +6829,7 @@ void __kmp_unregister_library(void) { char *name = __kmp_reg_status_name(); char *value = NULL; -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) char *shm_name = __kmp_str_format("/%s", name); int fd1 = shm_open(shm_name, O_RDONLY, 0666); if (fd1 == -1) { @@ -6639,14 +6850,14 @@ void __kmp_unregister_library(void) { KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { // Ok, this is our variable. Delete it. 
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) shm_unlink(shm_name); // this removes file in /dev/shm #else __kmp_env_unset(name); #endif } -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) KMP_INTERNAL_FREE(shm_name); #endif @@ -6684,7 +6895,9 @@ static void __kmp_check_mic_type() { static void __kmp_user_level_mwait_init() { struct kmp_cpuid buf; __kmp_x86_cpuid(7, 0, &buf); - __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; + __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1); + __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait; + __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0); KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", __kmp_umwait_enabled)); } @@ -6844,8 +7057,8 @@ static void __kmp_do_serial_initialize(void) { #if KMP_FAST_REDUCTION_BARRIER #define kmp_reduction_barrier_gather_bb ((int)1) #define kmp_reduction_barrier_release_bb ((int)1) -#define kmp_reduction_barrier_gather_pat bp_hyper_bar -#define kmp_reduction_barrier_release_pat bp_hyper_bar +#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt +#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt #endif // KMP_FAST_REDUCTION_BARRIER for (i = bs_plain_barrier; i < bs_last_barrier; i++) { __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; @@ -7500,6 +7713,11 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, num_threads = 1; } } else { + if (num_threads < 0) { + __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1), + __kmp_msg_null); + num_threads = 1; + } // This thread will be the primary thread of the league primary threads // Store new thread limit; old limit is saved in th_cg_roots list thr->th.th_current_task->td_icvs.thread_limit = num_threads; @@ -7531,9 +7749,13 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, int num_threads) { kmp_info_t *thr = __kmp_threads[gtid]; - KMP_DEBUG_ASSERT(num_teams >= 0); - KMP_DEBUG_ASSERT(num_threads >= 0); - + if (num_teams < 0) { + // OpenMP specification requires requested values to be positive, + // but people can send us any value, so we'd better check + __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1), + __kmp_msg_null); + num_teams = 1; + } if (num_teams == 0) { if (__kmp_nteams > 0) { num_teams = __kmp_nteams; @@ -7590,7 +7812,7 @@ void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams num_teams = num_teams_ub; } else { // num_teams_lb <= num_teams <= num_teams_ub - if (num_threads == 0) { + if (num_threads <= 0) { if (num_teams_ub > __kmp_teams_max_nth) { num_teams = num_teams_lb; } else { @@ -8702,6 +8924,96 @@ void __kmp_omp_display_env(int verbose) { __kmp_release_bootstrap_lock(&__kmp_initz_lock); } +// The team size is changing, so distributed barrier must be modified +void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, + int new_nthreads) { + KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == + bp_dist_bar); + kmp_info_t **other_threads = team->t.t_threads; + + // We want all the workers to stop waiting on the barrier while we adjust the + // size of the team. 
+  for (int f = 1; f < old_nthreads; ++f) {
+    KMP_DEBUG_ASSERT(other_threads[f] != NULL);
+    // Ignore threads that are already inactive or not present in the team
+    if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
+      // teams construct causes thread_limit to get passed in, and some of
+      // those could be inactive; just ignore them
+      continue;
+    }
+    // If thread is transitioning still to in_use state, wait for it
+    if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
+      while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
+        KMP_CPU_PAUSE();
+    }
+    // The thread should be in_use now
+    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
+    // Transition to unused state
+    team->t.t_threads[f]->th.th_used_in_team.store(2);
+    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
+  }
+  // Release all the workers
+  kmp_uint64 new_value; // new value for go
+  new_value = team->t.b->go_release();
+
+  KMP_MFENCE();
+
+  // Workers should see transition status 2 and move to 0; but may need to be
+  // woken up first
+  size_t my_go_index;
+  int count = old_nthreads - 1;
+  while (count > 0) {
+    count = old_nthreads - 1;
+    for (int f = 1; f < old_nthreads; ++f) {
+      my_go_index = f / team->t.b->threads_per_go;
+      if (other_threads[f]->th.th_used_in_team.load() != 0) {
+        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
+          kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
+              void *, other_threads[f]->th.th_sleep_loc);
+          __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
+        }
+      } else {
+        KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
+        count--;
+      }
+    }
+  }
+  // Now update the barrier size
+  team->t.b->update_num_threads(new_nthreads);
+  team->t.b->go_reset();
+}
+
+void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
+  // Add the threads back to the team
+  KMP_DEBUG_ASSERT(team);
+  // Threads were paused and pointed at th_used_in_team temporarily during a
+  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
+  // the thread that it should transition itself back into the team. Then, if
+  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
+  // to wake it up.
+  for (int f = 1; f < new_nthreads; ++f) {
+    KMP_DEBUG_ASSERT(team->t.t_threads[f]);
+    KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
+                                3);
+    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
+      __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
+                      (kmp_flag_32<false, false> *)NULL);
+    }
+  }
+  // The threads should be transitioning to the team; when they are done, they
+  // should have set th_used_in_team to 1. This loop forces master to wait until
+  // all threads have moved into the team and are waiting in the barrier.
+ int count = new_nthreads - 1; + while (count > 0) { + count = new_nthreads - 1; + for (int f = 1; f < new_nthreads; ++f) { + if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { + count--; + } + } + } +} + // Globals and functions for hidden helper task kmp_info_t **__kmp_hidden_helper_threads; kmp_info_t *__kmp_hidden_helper_main_thread; diff --git a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp index 0b0973c766..112502fdce 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp @@ -164,7 +164,12 @@ int __kmp_convert_to_milliseconds(char const *data) { return (INT_MAX); value = (double)0.0; mult = '\0'; +#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT + // On Windows, each %c parameter needs additional size parameter for sscanf_s + nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, 1, &extra, 1); +#else nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, &extra); +#endif if (nvalues < 1) return (-1); if (nvalues == 1) @@ -297,8 +302,8 @@ void __kmp_check_stksize(size_t *val) { // if system stack size is too big then limit the size for worker threads if (*val > KMP_DEFAULT_STKSIZE * 16) // just a heuristics... *val = KMP_DEFAULT_STKSIZE * 16; - if (*val < KMP_MIN_STKSIZE) - *val = KMP_MIN_STKSIZE; + if (*val < __kmp_sys_min_stksize) + *val = __kmp_sys_min_stksize; if (*val > KMP_MAX_STKSIZE) *val = KMP_MAX_STKSIZE; // dead code currently, but may work in future #if KMP_OS_DARWIN @@ -426,6 +431,7 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, int *out_range, char *out_routine, char *out_file, int *out_lb, int *out_ub) { + const char *par_range_value; size_t len = KMP_STRLEN(value) + 1; par_range_to_print = (char *)KMP_INTERNAL_MALLOC(len + 1); KMP_STRNCPY_S(par_range_to_print, len + 1, value, len + 1); @@ -434,11 +440,14 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, __kmp_par_range_ub = INT_MAX; for (;;) { unsigned int len; - if (*value == '\0') { + if (!value || *value == '\0') { break; } if (!__kmp_strcasecmp_with_sentinel("routine", value, '=')) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; len = __kmp_readstr_with_sentinel(out_routine, value, KMP_PAR_RANGE_ROUTINE_LEN - 1, ','); if (len == 0) { @@ -451,7 +460,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, continue; } if (!__kmp_strcasecmp_with_sentinel("filename", value, '=')) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; len = __kmp_readstr_with_sentinel(out_file, value, KMP_PAR_RANGE_FILENAME_LEN - 1, ','); if (len == 0) { @@ -465,7 +477,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, } if ((!__kmp_strcasecmp_with_sentinel("range", value, '=')) || (!__kmp_strcasecmp_with_sentinel("incl_range", value, '='))) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) { goto par_range_error; } @@ -477,7 +492,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, continue; } if (!__kmp_strcasecmp_with_sentinel("excl_range", value, '=')) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if 
(!par_range_value) + goto par_range_error; + value = par_range_value; if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) { goto par_range_error; } @@ -1684,6 +1702,8 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value, const char *var; /* ---------- Barrier method control ------------ */ + static int dist_req = 0, non_dist_req = 0; + static bool warn = 1; for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { var = __kmp_barrier_pattern_env_name[i]; @@ -1695,6 +1715,11 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value, for (j = bp_linear_bar; j < bp_last_bar; j++) { if (__kmp_match_with_sentinel(__kmp_barrier_pattern_name[j], value, 1, ',')) { + if (j == bp_dist_bar) { + dist_req++; + } else { + non_dist_req++; + } __kmp_barrier_gather_pattern[i] = (kmp_bar_pat_e)j; break; } @@ -1709,6 +1734,11 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value, if (comma != NULL) { for (j = bp_linear_bar; j < bp_last_bar; j++) { if (__kmp_str_match(__kmp_barrier_pattern_name[j], 1, comma + 1)) { + if (j == bp_dist_bar) { + dist_req++; + } else { + non_dist_req++; + } __kmp_barrier_release_pattern[i] = (kmp_bar_pat_e)j; break; } @@ -1723,6 +1753,20 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value, } } } + if (dist_req != 0) { + // set all barriers to dist + if ((non_dist_req != 0) && warn) { + KMP_INFORM(BarrierPatternOverride, name, + __kmp_barrier_pattern_name[bp_dist_bar]); + warn = 0; + } + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + if (__kmp_barrier_release_pattern[i] != bp_dist_bar) + __kmp_barrier_release_pattern[i] = bp_dist_bar; + if (__kmp_barrier_gather_pattern[i] != bp_dist_bar) + __kmp_barrier_gather_pattern[i] = bp_dist_bar; + } + } } // __kmp_stg_parse_barrier_pattern static void __kmp_stg_print_barrier_pattern(kmp_str_buf_t *buffer, @@ -1739,7 +1783,7 @@ static void __kmp_stg_print_barrier_pattern(kmp_str_buf_t *buffer, __kmp_str_buf_print(buffer, " %s='", __kmp_barrier_pattern_env_name[i]); } - KMP_DEBUG_ASSERT(j < bs_last_barrier && k < bs_last_barrier); + KMP_DEBUG_ASSERT(j < bp_last_bar && k < bp_last_bar); __kmp_str_buf_print(buffer, "%s,%s'\n", __kmp_barrier_pattern_name[j], __kmp_barrier_pattern_name[k]); } @@ -3092,6 +3136,7 @@ static void __kmp_stg_parse_topology_method(char const *name, char const *value, } #if KMP_GROUP_AFFINITY else if (__kmp_str_match("group", 1, value)) { + KMP_WARNING(StgDeprecatedValue, name, value, "all"); __kmp_affinity_top_method = affinity_top_method_group; } #endif /* KMP_GROUP_AFFINITY */ @@ -3155,6 +3200,47 @@ static void __kmp_stg_print_topology_method(kmp_str_buf_t *buffer, } } // __kmp_stg_print_topology_method +// KMP_TEAMS_PROC_BIND +struct kmp_proc_bind_info_t { + const char *name; + kmp_proc_bind_t proc_bind; +}; +static kmp_proc_bind_info_t proc_bind_table[] = { + {"spread", proc_bind_spread}, + {"true", proc_bind_spread}, + {"close", proc_bind_close}, + // teams-bind = false means "replicate the primary thread's affinity" + {"false", proc_bind_primary}, + {"primary", proc_bind_primary}}; +static void __kmp_stg_parse_teams_proc_bind(char const *name, char const *value, + void *data) { + int valid; + const char *end; + valid = 0; + for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]); + ++i) { + if (__kmp_match_str(proc_bind_table[i].name, value, &end)) { + __kmp_teams_proc_bind = proc_bind_table[i].proc_bind; + valid = 1; + break; + } + } + if (!valid) { + 
KMP_WARNING(StgInvalidValue, name, value); + } +} +static void __kmp_stg_print_teams_proc_bind(kmp_str_buf_t *buffer, + char const *name, void *data) { + const char *value = KMP_I18N_STR(NotDefined); + for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]); + ++i) { + if (__kmp_teams_proc_bind == proc_bind_table[i].proc_bind) { + value = proc_bind_table[i].name; + break; + } + } + __kmp_stg_print_str(buffer, name, value); +} #endif /* KMP_AFFINITY_SUPPORTED */ // OMP_PROC_BIND / bind-var is functional on all 4.0 builds, including OS X* @@ -4415,7 +4501,7 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value, } #if KMP_USE_ADAPTIVE_LOCKS else if (__kmp_str_match("adaptive", 1, value)) { - if (__kmp_cpuinfo.rtm) { // ??? Is cpuinfo available here? + if (__kmp_cpuinfo.flags.rtm) { // ??? Is cpuinfo available here? __kmp_user_lock_kind = lk_adaptive; KMP_STORE_LOCK_SEQ(adaptive); } else { @@ -4427,7 +4513,7 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value, #endif // KMP_USE_ADAPTIVE_LOCKS #if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX else if (__kmp_str_match("rtm_queuing", 1, value)) { - if (__kmp_cpuinfo.rtm) { + if (__kmp_cpuinfo.flags.rtm) { __kmp_user_lock_kind = lk_rtm_queuing; KMP_STORE_LOCK_SEQ(rtm_queuing); } else { @@ -4436,7 +4522,7 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value, KMP_STORE_LOCK_SEQ(queuing); } } else if (__kmp_str_match("rtm_spin", 1, value)) { - if (__kmp_cpuinfo.rtm) { + if (__kmp_cpuinfo.flags.rtm) { __kmp_user_lock_kind = lk_rtm_spin; KMP_STORE_LOCK_SEQ(rtm_spin); } else { @@ -4875,28 +4961,85 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value, // Check each component for (int i = 0; i < level; ++i) { - int offset = 0; - int num = atoi(components[i]); // each component should start with a number - if (num <= 0) { - goto err; // only positive integers are valid for count - } - if ((pos = strchr(components[i], '@'))) { - offset = atoi(pos + 1); // save offset - *pos = '\0'; // cut the offset from the component - } - pos = components[i] + strspn(components[i], digits); - if (pos == components[i]) { - goto err; - } - // detect the component type - kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos); - if (type == KMP_HW_UNKNOWN) { - goto err; - } - if (__kmp_hw_subset->specified(type)) { - goto err; + int core_level = 0; + char *core_components[MAX_T_LEVEL]; + // Split possible core components by '&' delimiter + pos = components[i]; + core_components[core_level++] = pos; + while ((pos = strchr(pos, '&'))) { + if (core_level >= MAX_T_LEVEL) + goto err; // too many different core types + *pos = '\0'; // modify input and avoid more copying + core_components[core_level++] = ++pos; // expect something after '&' + } + + for (int j = 0; j < core_level; ++j) { + char *offset_ptr; + char *attr_ptr; + int offset = 0; + kmp_hw_attr_t attr; + int num; + // components may begin with an optional count of the number of resources + if (isdigit(*core_components[j])) { + num = atoi(core_components[j]); + if (num <= 0) { + goto err; // only positive integers are valid for count + } + pos = core_components[j] + strspn(core_components[j], digits); + } else if (*core_components[j] == '*') { + num = kmp_hw_subset_t::USE_ALL; + pos = core_components[j] + 1; + } else { + num = kmp_hw_subset_t::USE_ALL; + pos = core_components[j]; + } + + offset_ptr = strchr(core_components[j], '@'); + attr_ptr = strchr(core_components[j], ':'); + + if (offset_ptr) { + offset = 
atoi(offset_ptr + 1); // save offset + *offset_ptr = '\0'; // cut the offset from the component + } + if (attr_ptr) { + attr.clear(); + // save the attribute +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_str_match("intel_core", -1, attr_ptr + 1)) { + attr.set_core_type(KMP_HW_CORE_TYPE_CORE); + } else if (__kmp_str_match("intel_atom", -1, attr_ptr + 1)) { + attr.set_core_type(KMP_HW_CORE_TYPE_ATOM); + } +#endif + if (__kmp_str_match("eff", 3, attr_ptr + 1)) { + const char *number = attr_ptr + 1; + // skip the eff[iciency] token + while (isalpha(*number)) + number++; + if (!isdigit(*number)) { + goto err; + } + int efficiency = atoi(number); + attr.set_core_eff(efficiency); + } else { + goto err; + } + *attr_ptr = '\0'; // cut the attribute from the component + } + // detect the component type + kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos); + if (type == KMP_HW_UNKNOWN) { + goto err; + } + // Only the core type can have attributes + if (attr && type != KMP_HW_CORE) + goto err; + // Must allow core be specified more than once + if (type != KMP_HW_CORE && __kmp_hw_subset->specified(type)) { + goto err; + } + __kmp_hw_subset->push_back(num, type, offset, attr); } - __kmp_hw_subset->push_back(num, type, offset); } return; err: @@ -4908,6 +5051,21 @@ err: return; } +static inline const char * +__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "intel_atom"; + case KMP_HW_CORE_TYPE_CORE: + return "intel_core"; +#endif + } + return "unknown"; +} + static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, void *data) { kmp_str_buf_t buf; @@ -4923,10 +5081,20 @@ static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, depth = __kmp_hw_subset->get_depth(); for (int i = 0; i < depth; ++i) { const auto &item = __kmp_hw_subset->at(i); - __kmp_str_buf_print(&buf, "%s%d%s", (i > 0 ? "," : ""), item.num, - __kmp_hw_get_keyword(item.type)); - if (item.offset) - __kmp_str_buf_print(&buf, "@%d", item.offset); + if (i > 0) + __kmp_str_buf_print(&buf, "%c", ','); + for (int j = 0; j < item.num_attrs; ++j) { + __kmp_str_buf_print(&buf, "%s%d%s", (j > 0 ? 
"&" : ""), item.num[j], + __kmp_hw_get_keyword(item.type)); + if (item.attr[j].is_core_type_valid()) + __kmp_str_buf_print( + &buf, ":%s", + __kmp_hw_get_core_type_keyword(item.attr[j].get_core_type())); + if (item.attr[j].is_core_eff_valid()) + __kmp_str_buf_print(&buf, ":eff%d", item.attr[j].get_core_eff()); + if (item.offset[j]) + __kmp_str_buf_print(&buf, "@%d", item.offset[j]); + } } __kmp_str_buf_print(buffer, "%s'\n", buf.str); __kmp_str_buf_free(&buf); @@ -5003,6 +5171,27 @@ static void __kmp_stg_print_mwait_hints(kmp_str_buf_t *buffer, char const *name, #endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +#if KMP_HAVE_UMWAIT +// ----------------------------------------------------------------------------- +// KMP_TPAUSE +// 0 = don't use TPAUSE, 1 = use C0.1 state, 2 = use C0.2 state + +static void __kmp_stg_parse_tpause(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_tpause_state); + if (__kmp_tpause_state != 0) { + // The actual hint passed to tpause is: 0 for C0.2 and 1 for C0.1 + if (__kmp_tpause_state == 2) // use C0.2 + __kmp_tpause_hint = 0; // default was set to 1 for C0.1 + } +} // __kmp_stg_parse_tpause + +static void __kmp_stg_print_tpause(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_tpause_state); +} // __kmp_stg_print_tpause +#endif // KMP_HAVE_UMWAIT + // ----------------------------------------------------------------------------- // OMP_DISPLAY_ENV @@ -5260,6 +5449,8 @@ static kmp_setting_t __kmp_stg_table[] = { #endif /* KMP_GOMP_COMPAT */ {"OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind, NULL, 0, 0}, + {"KMP_TEAMS_PROC_BIND", __kmp_stg_parse_teams_proc_bind, + __kmp_stg_print_teams_proc_bind, NULL, 0, 0}, {"OMP_PLACES", __kmp_stg_parse_places, __kmp_stg_print_places, NULL, 0, 0}, {"KMP_TOPOLOGY_METHOD", __kmp_stg_parse_topology_method, __kmp_stg_print_topology_method, NULL, 0, 0}, @@ -5366,6 +5557,10 @@ static kmp_setting_t __kmp_stg_table[] = { {"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints, __kmp_stg_print_mwait_hints, NULL, 0, 0}, #endif + +#if KMP_HAVE_UMWAIT + {"KMP_TPAUSE", __kmp_stg_parse_tpause, __kmp_stg_print_tpause, NULL, 0, 0}, +#endif {"", NULL, NULL, NULL, 0, 0}}; // settings static int const __kmp_stg_count = @@ -5942,65 +6137,27 @@ void __kmp_env_initialize(char const *string) { // Handle the Win 64 group affinity stuff if there are multiple // processor groups, or if the user requested it, and OMP 4.0 // affinity is not in effect. - if (((__kmp_num_proc_groups > 1) && - (__kmp_affinity_type == affinity_default) && - (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default)) || - (__kmp_affinity_top_method == affinity_top_method_group)) { + if (__kmp_num_proc_groups > 1 && + __kmp_affinity_type == affinity_default && + __kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { + // Do not respect the initial processor affinity mask if it is assigned + // exactly one Windows Processor Group since this is interpreted as the + // default OS assignment. Not respecting the mask allows the runtime to + // use all the logical processors in all groups. if (__kmp_affinity_respect_mask == affinity_respect_mask_default && exactly_one_group) { __kmp_affinity_respect_mask = FALSE; } + // Use compact affinity with anticipation of pinning to at least the + // group granularity since threads can only be bound to one group. 
if (__kmp_affinity_type == affinity_default) { __kmp_affinity_type = affinity_compact; __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; } - if (__kmp_affinity_top_method == affinity_top_method_default) { - if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { - __kmp_affinity_top_method = affinity_top_method_group; - __kmp_affinity_gran = KMP_HW_PROC_GROUP; - } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) { - __kmp_affinity_top_method = affinity_top_method_group; - } else { - __kmp_affinity_top_method = affinity_top_method_all; - } - } else if (__kmp_affinity_top_method == affinity_top_method_group) { - if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { - __kmp_affinity_gran = KMP_HW_PROC_GROUP; - } else if ((__kmp_affinity_gran != KMP_HW_PROC_GROUP) && - (__kmp_affinity_gran != KMP_HW_THREAD)) { - const char *str = __kmp_hw_get_keyword(__kmp_affinity_gran); - KMP_WARNING(AffGranTopGroup, var, str); - __kmp_affinity_gran = KMP_HW_THREAD; - } - } else { - if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { - __kmp_affinity_gran = KMP_HW_CORE; - } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) { - const char *str = NULL; - switch (__kmp_affinity_type) { - case affinity_physical: - str = "physical"; - break; - case affinity_logical: - str = "logical"; - break; - case affinity_compact: - str = "compact"; - break; - case affinity_scatter: - str = "scatter"; - break; - case affinity_explicit: - str = "explicit"; - break; - // No MIC on windows, so no affinity_balanced case - default: - KMP_DEBUG_ASSERT(0); - } - KMP_WARNING(AffGranGroupType, var, str); - __kmp_affinity_gran = KMP_HW_CORE; - } - } + if (__kmp_affinity_top_method == affinity_top_method_default) + __kmp_affinity_top_method = affinity_top_method_all; + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) + __kmp_affinity_gran = KMP_HW_PROC_GROUP; } else #endif /* KMP_GROUP_AFFINITY */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_stats.h b/contrib/libs/cxxsupp/openmp/kmp_stats.h index 78bbb9068a..0e3ea3b9cf 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_stats.h +++ b/contrib/libs/cxxsupp/openmp/kmp_stats.h @@ -246,6 +246,8 @@ enum stats_state_e { // KMP_tree_release -- time in __kmp_tree_barrier_release // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather // KMP_hyper_release -- time in __kmp_hyper_barrier_release +// KMP_dist_gather -- time in __kmp_dist_barrier_gather +// KMP_dist_release -- time in __kmp_dist_barrier_release // clang-format off #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ macro(KMP_fork_call, 0, arg) \ @@ -255,6 +257,8 @@ enum stats_state_e { macro(KMP_hier_release, 0, arg) \ macro(KMP_hyper_gather, 0, arg) \ macro(KMP_hyper_release, 0, arg) \ + macro(KMP_dist_gather, 0, arg) \ + macro(KMP_dist_release, 0, arg) \ macro(KMP_linear_gather, 0, arg) \ macro(KMP_linear_release, 0, arg) \ macro(KMP_tree_gather, 0, arg) \ diff --git a/contrib/libs/cxxsupp/openmp/kmp_str.cpp b/contrib/libs/cxxsupp/openmp/kmp_str.cpp index ffce2b88ab..e64f989fbc 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_str.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_str.cpp @@ -515,6 +515,31 @@ int __kmp_str_match(char const *target, int len, char const *data) { return ((len > 0) ? i >= len : (!target[i] && (len || !data[i]))); } // __kmp_str_match +// If data contains all of target, returns true, otherwise returns false. 
+// len should be the length of target +bool __kmp_str_contains(char const *target, int len, char const *data) { + int i = 0, j = 0, start = 0; + if (target == NULL || data == NULL) { + return FALSE; + } + while (target[i]) { + if (!data[j]) + return FALSE; + if (TOLOWER(target[i]) != TOLOWER(data[j])) { + j = start + 1; + start = j; + i = 0; + } else { + if (i == 0) + start = j; + j++; + i++; + } + } + + return i == len; +} // __kmp_str_contains + int __kmp_str_match_false(char const *data) { int result = __kmp_str_match("false", 1, data) || __kmp_str_match("off", 2, data) || diff --git a/contrib/libs/cxxsupp/openmp/kmp_str.h b/contrib/libs/cxxsupp/openmp/kmp_str.h index ff6179908e..855b5df55d 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_str.h +++ b/contrib/libs/cxxsupp/openmp/kmp_str.h @@ -106,6 +106,7 @@ int __kmp_str_eqf(char const *lhs, char const *rhs); char *__kmp_str_format(char const *format, ...); void __kmp_str_free(char **str); int __kmp_str_match(char const *target, int len, char const *data); +bool __kmp_str_contains(char const *target, int len, char const *data); int __kmp_str_match_false(char const *data); int __kmp_str_match_true(char const *data); void __kmp_str_replace(char *str, char search_for, char replace_with); diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp index dd3e7688d3..501830eaa7 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp @@ -86,6 +86,7 @@ static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread, h->buckets = (kmp_dephash_entry **)(h + 1); h->generation = gen; h->nconflicts = 0; + h->last_all = current_dephash->last_all; // make sure buckets are properly initialized for (size_t i = 0; i < new_size; i++) { @@ -142,6 +143,7 @@ static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread, h->nelements = 0; h->nconflicts = 0; h->buckets = (kmp_dephash_entry **)(h + 1); + h->last_all = NULL; for (size_t i = 0; i < h_size; i++) h->buckets[i] = 0; @@ -174,7 +176,10 @@ static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread, thread, sizeof(kmp_dephash_entry_t)); #endif entry->addr = addr; - entry->last_out = NULL; + if (!h->last_all) // no predecessor task with omp_all_memory dependence + entry->last_out = NULL; + else // else link the omp_all_memory depnode to the new entry + entry->last_out = __kmp_node_ref(h->last_all); entry->last_set = NULL; entry->prev_set = NULL; entry->last_flag = 0; @@ -290,6 +295,63 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, return npredecessors; } +static inline kmp_int32 +__kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h, + bool dep_barrier, kmp_task_t *task) { + KA_TRACE(30, ("__kmp_process_dep_all: T#%d processing dep_all, " + "dep_barrier = %d\n", + gtid, dep_barrier)); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_int32 npredecessors = 0; + + // process previous omp_all_memory node if any + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, h->last_all); + __kmp_node_deref(thread, h->last_all); + if (!dep_barrier) { + h->last_all = __kmp_node_ref(node); + } else { + // if this is a sync point in the serial sequence, then the previous + // outputs are guaranteed to be completed after the execution of this + // task so the previous output nodes can be cleared. 
+ h->last_all = NULL; + } + + // process all regular dependences + for (size_t i = 0; i < h->size; i++) { + kmp_dephash_entry_t *info = h->buckets[i]; + if (!info) // skip empty slots in dephash + continue; + for (; info; info = info->next_in_bucket) { + // for each entry the omp_all_memory works as OUT dependence + kmp_depnode_t *last_out = info->last_out; + kmp_depnode_list_t *last_set = info->last_set; + kmp_depnode_list_t *prev_set = info->prev_set; + if (last_set) { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_set); + __kmp_depnode_list_free(thread, last_set); + __kmp_depnode_list_free(thread, prev_set); + info->last_set = NULL; + info->prev_set = NULL; + info->last_flag = 0; // no sets in this dephash entry + } else { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_out); + } + __kmp_node_deref(thread, last_out); + if (!dep_barrier) { + info->last_out = __kmp_node_ref(node); + } else { + info->last_out = NULL; + } + } + } + KA_TRACE(30, ("__kmp_process_dep_all: T#%d found %d predecessors\n", gtid, + npredecessors)); + return npredecessors; +} + template <bool filter> static inline kmp_int32 __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash, @@ -417,7 +479,7 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) { - int i, n_mtxs = 0; + int i, n_mtxs = 0, dep_all = 0; #if KMP_DEBUG kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); #endif @@ -429,7 +491,8 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, // Filter deps in dep_list // TODO: Different algorithm for large dep_list ( > 10 ? ) for (i = 0; i < ndeps; i++) { - if (dep_list[i].base_addr != 0) { + if (dep_list[i].base_addr != 0 && + dep_list[i].base_addr != (kmp_intptr_t)KMP_SIZE_T_MAX) { KMP_DEBUG_ASSERT( dep_list[i].flag == KMP_DEP_IN || dep_list[i].flag == KMP_DEP_OUT || dep_list[i].flag == KMP_DEP_INOUT || @@ -451,6 +514,13 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, dep_list[i].flag = KMP_DEP_OUT; // downgrade mutexinoutset to inout } } + } else if (dep_list[i].flag == KMP_DEP_ALL || + dep_list[i].base_addr == (kmp_intptr_t)KMP_SIZE_T_MAX) { + // omp_all_memory dependence can be marked by compiler by either + // (addr=0 && flag=0x80) (flag KMP_DEP_ALL), or (addr=-1). 
+ // omp_all_memory overrides all other dependences if any + dep_all = 1; + break; } } @@ -464,10 +534,14 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, // the end int npredecessors; - npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, ndeps, - dep_list, task); - npredecessors += __kmp_process_deps<false>( - gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task); + if (!dep_all) { // regular dependences + npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, + ndeps, dep_list, task); + npredecessors += __kmp_process_deps<false>( + gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task); + } else { // omp_all_memory dependence + npredecessors = __kmp_process_dep_all(gtid, node, *hash, dep_barrier, task); + } node->dn.task = task; KMP_MB(); @@ -755,8 +829,10 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, bool ignore = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final; - ignore = ignore && thread->th.th_task_team != NULL && - thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE; + ignore = + ignore && thread->th.th_task_team != NULL && + thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE && + thread->th.th_task_team->tt.tt_hidden_helper_task_encountered == FALSE; ignore = ignore || current_task->td_dephash == NULL; if (ignore) { diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h index 73abf07018..99f182bbd0 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h +++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h @@ -73,6 +73,8 @@ static inline void __kmp_dephash_free_entries(kmp_info_t *thread, h->buckets[i] = 0; } } + __kmp_node_deref(thread, h->last_all); + h->last_all = NULL; } static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) { @@ -144,9 +146,10 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) { // encountering thread's queue; otherwise, it can be pushed to its own // queue. if (!next_taskdata->td_flags.hidden_helper) { - __kmpc_give_task( - successor->dn.task, - __kmp_tid_from_gtid(next_taskdata->encountering_gtid)); + kmp_int32 encountering_gtid = + next_taskdata->td_alloc_thread->th.th_info.ds.ds_gtid; + kmp_int32 encountering_tid = __kmp_tid_from_gtid(encountering_gtid); + __kmpc_give_task(successor->dn.task, encountering_tid); } else { __kmp_omp_task(gtid, successor->dn.task, false); } diff --git a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp index 55e9c30763..e445438524 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp @@ -324,10 +324,16 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); - // We don't need to map to shadow gtid if it is already hidden helper thread - if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) { - gtid = KMP_GTID_TO_SHADOW_GTID(gtid); - thread = __kmp_threads[gtid]; + // If we encounter a hidden helper task, and the current thread is not a + // hidden helper thread, we have to give the task to any hidden helper thread + // starting from its shadow one. 
+ if (UNLIKELY(taskdata->td_flags.hidden_helper && + !KMP_HIDDEN_HELPER_THREAD(gtid))) { + kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid); + __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid)); + // Signal the hidden helper threads. + __kmp_hidden_helper_worker_thread_signal(); + return TASK_SUCCESSFULLY_PUSHED; } kmp_task_team_t *task_team = thread->th.th_task_team; @@ -434,16 +440,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); - auto hidden_helper = taskdata->td_flags.hidden_helper; - __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); - // Signal one worker thread to execute the task - if (UNLIKELY(hidden_helper)) { - // Wake hidden helper threads up if they're sleeping - __kmp_hidden_helper_worker_thread_signal(); - } - return TASK_SUCCESSFULLY_PUSHED; } @@ -809,6 +807,24 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid, gtid, taskdata, children)); } +// Only need to keep track of child task counts if any of the following: +// 1. team parallel and tasking not serialized; +// 2. it is a proxy or detachable or hidden helper task +// 3. the children counter of its parent task is greater than 0. +// The reason for the 3rd one is for serialized team that found detached task, +// hidden helper task, T. In this case, the execution of T is still deferred, +// and it is also possible that a regular task depends on T. In this case, if we +// don't track the children, task synchronization will be broken. +static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) { + kmp_tasking_flags_t flags = taskdata->td_flags; + bool ret = !(flags.team_serial || flags.tasking_ser); + ret = ret || flags.proxy == TASK_PROXY || + flags.detachable == TASK_DETACHABLE || flags.hidden_helper; + ret = ret || + KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0; + return ret; +} + // __kmp_task_finish: bookkeeping to do when a task finishes execution // // gtid: global thread ID for calling thread @@ -825,8 +841,9 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, kmp_info_t *thread = __kmp_threads[gtid]; kmp_task_team_t *task_team = thread->th.th_task_team; // might be NULL for serial teams... +#if KMP_DEBUG kmp_int32 children = 0; - +#endif KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming " "task %p\n", gtid, taskdata, resumed_task)); @@ -934,16 +951,15 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, if (ompt) __ompt_task_finish(task, resumed_task, ompt_task_complete); #endif - - // Only need to keep track of count if team parallel and tasking not - // serialized, or task is detachable and event has already been fulfilled - if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || - taskdata->td_flags.detachable == TASK_DETACHABLE || - taskdata->td_flags.hidden_helper) { + // TODO: What would be the balance between the conditions in the function + // and an atomic operation? 
+ if (__kmp_track_children_task(taskdata)) { __kmp_release_deps(gtid, taskdata); // Predecrement simulated by "- 1" calculation - children = - KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; +#if KMP_DEBUG + children = -1 + +#endif + KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); KMP_DEBUG_ASSERT(children >= 0); if (taskdata->td_taskgroup) KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); @@ -1189,7 +1205,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task; kmp_taskdata_t *taskdata; kmp_info_t *thread = __kmp_threads[gtid]; - kmp_info_t *encountering_thread = thread; kmp_team_t *team = thread->th.th_team; kmp_taskdata_t *parent_task = thread->th.th_current_task; size_t shareds_offset; @@ -1201,15 +1216,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, if (__kmp_enable_hidden_helper) { if (!TCR_4(__kmp_init_hidden_helper)) __kmp_hidden_helper_initialize(); - - // For a hidden helper task encountered by a regular thread, we will push - // the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper - // thread. - if (!KMP_HIDDEN_HELPER_THREAD(gtid)) { - thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)]; - // We don't change the parent-child relation for hidden helper task as - // we need that to do per-task-region synchronization. - } } else { // If the hidden helper task is not enabled, reset the flag to FALSE. flags->hidden_helper = FALSE; @@ -1232,8 +1238,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, // Untied task encountered causes the TSC algorithm to check entire deque of // the victim thread. If no untied task encountered, then checking the head // of the deque should be enough. - KMP_CHECK_UPDATE( - encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1); + KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1); } // Detachable tasks are not proxy tasks yet but could be in the future. Doing @@ -1247,32 +1252,30 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, } /* are we running in a sequential parallel or tskm_immediate_exec... 
we need tasking support enabled */ - if ((encountering_thread->th.th_task_team) == NULL) { + if ((thread->th.th_task_team) == NULL) { /* This should only happen if the team is serialized setup a task team and propagate it to the thread */ KMP_DEBUG_ASSERT(team->t.t_serialized); KA_TRACE(30, ("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid)); - __kmp_task_team_setup( - encountering_thread, team, - 1); // 1 indicates setup the current team regardless of nthreads - encountering_thread->th.th_task_team = - team->t.t_task_team[encountering_thread->th.th_task_state]; + // 1 indicates setup the current team regardless of nthreads + __kmp_task_team_setup(thread, team, 1); + thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state]; } - kmp_task_team_t *task_team = encountering_thread->th.th_task_team; + kmp_task_team_t *task_team = thread->th.th_task_team; /* tasking must be enabled now as the task might not be pushed */ if (!KMP_TASKING_ENABLED(task_team)) { KA_TRACE( 30, ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid)); - __kmp_enable_tasking(task_team, encountering_thread); - kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid; + __kmp_enable_tasking(task_team, thread); + kmp_int32 tid = thread->th.th_info.ds.ds_tid; kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; // No lock needed since only owner can allocate if (thread_data->td.td_deque == NULL) { - __kmp_alloc_task_deque(encountering_thread, thread_data); + __kmp_alloc_task_deque(thread, thread_data); } } @@ -1297,11 +1300,11 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, // Avoid double allocation here by combining shareds with taskdata #if USE_FAST_MEMORY - taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( - encountering_thread, shareds_offset + sizeof_shareds); + taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset + + sizeof_shareds); #else /* ! USE_FAST_MEMORY */ - taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( - encountering_thread, shareds_offset + sizeof_shareds); + taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset + + sizeof_shareds); #endif /* USE_FAST_MEMORY */ task = KMP_TASKDATA_TO_TASK(taskdata); @@ -1328,7 +1331,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, taskdata->td_task_id = KMP_GEN_TASK_ID(); taskdata->td_team = thread->th.th_team; - taskdata->td_alloc_thread = encountering_thread; + taskdata->td_alloc_thread = thread; taskdata->td_parent = parent_task; taskdata->td_level = parent_task->td_level + 1; // increment nesting level KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); @@ -1342,10 +1345,16 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs); taskdata->td_flags = *flags; - taskdata->encountering_gtid = gtid; taskdata->td_task_team = thread->th.th_task_team; taskdata->td_size_alloc = shareds_offset + sizeof_shareds; taskdata->td_flags.tasktype = TASK_EXPLICIT; + // If it is hidden helper task, we need to set the team and task team + // correspondingly. 
+ if (flags->hidden_helper) { + kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)]; + taskdata->td_team = shadow_thread->th.th_team; + taskdata->td_task_team = shadow_thread->th.th_task_team; + } // GEH - TODO: fix this to copy parent task's value of tasking_ser flag taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); @@ -1382,11 +1391,9 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, if (UNLIKELY(ompt_enabled.enabled)) __ompt_task_init(taskdata, gtid); #endif - // Only need to keep track of child task counts if team parallel and tasking - // not serialized or if it is a proxy or detachable or hidden helper task - if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE || - flags->hidden_helper || - !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { + // TODO: What would be the balance between the conditions in the function and + // an atomic operation? + if (__kmp_track_children_task(taskdata)) { KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); if (parent_task->td_taskgroup) KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); @@ -1438,11 +1445,12 @@ kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id) { - if (__kmp_enable_hidden_helper) { - auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags); + auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags); + // target task is untied defined in the specification + input_flags.tiedness = TASK_UNTIED; + + if (__kmp_enable_hidden_helper) input_flags.hidden_helper = TRUE; - input_flags.tiedness = TASK_UNTIED; - } return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t, sizeof_shareds, task_entry); @@ -1613,13 +1621,15 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task) #endif + if (task->routine != NULL) { #ifdef KMP_GOMP_COMPAT - if (taskdata->td_flags.native) { - ((void (*)(void *))(*(task->routine)))(task->shareds); - } else + if (taskdata->td_flags.native) { + ((void (*)(void *))(*(task->routine)))(task->shareds); + } else #endif /* KMP_GOMP_COMPAT */ - { - (*(task->routine))(gtid, task); + { + (*(task->routine))(gtid, task); + } } KMP_POP_PARTITIONED_TIMER(); @@ -2833,15 +2843,14 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, // We need to un-mark this victim as a finished victim. This must be done // before releasing the lock, or else other threads (starting with the // primary thread victim) might be prematurely released from the barrier!!! - kmp_int32 count; - - count = KMP_ATOMIC_INC(unfinished_threads); - +#if KMP_DEBUG + kmp_int32 count = +#endif + KMP_ATOMIC_INC(unfinished_threads); KA_TRACE( 20, ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n", gtid, count + 1, task_team)); - *thread_finished = FALSE; } TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1); @@ -2948,8 +2957,7 @@ static inline int __kmp_execute_tasks_template( (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != NULL)) { asleep = 1; - __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), - other_thread->th.th_sleep_loc); + __kmp_null_resume_wrapper(other_thread); // A sleeping thread should not have any tasks on it's queue. 
// There is a slight possibility that it resumes, steals a task // from another thread, which spawns more tasks, all in the time @@ -3034,9 +3042,10 @@ static inline int __kmp_execute_tasks_template( // done. This decrement might be to the spin location, and result in the // termination condition being satisfied. if (!*thread_finished) { - kmp_int32 count; - - count = KMP_ATOMIC_DEC(unfinished_threads) - 1; +#if KMP_DEBUG + kmp_int32 count = -1 + +#endif + KMP_ATOMIC_DEC(unfinished_threads); KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec " "unfinished_threads to %d task_team=%p\n", gtid, count, task_team)); @@ -3065,6 +3074,18 @@ static inline int __kmp_execute_tasks_template( return FALSE; } + // Check the flag again to see if it has already done in case to be trapped + // into infinite loop when a if0 task depends on a hidden helper task + // outside any parallel region. Detached tasks are not impacted in this case + // because the only thread executing this function has to execute the proxy + // task so it is in another code path that has the same check. + if (flag == NULL || (!final_spin && flag->done_check())) { + KA_TRACE(15, + ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", + gtid)); + return TRUE; + } + // We could be getting tasks from target constructs; if this is the only // thread, keep trying to execute tasks from own queue if (nthreads == 1 && @@ -3098,6 +3119,16 @@ int __kmp_execute_tasks_64( thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } +template <bool C, bool S> +int __kmp_atomic_execute_tasks_64( + kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag, + int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_execute_tasks_template( + thread, gtid, flag, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +} + int __kmp_execute_tasks_oncore( kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -3124,6 +3155,14 @@ template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32, int *USE_ITT_BUILD_ARG(void *), kmp_int32); +template int __kmp_atomic_execute_tasks_64<false, true>( + kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + +template int __kmp_atomic_execute_tasks_64<true, false>( + kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the // next barrier so they can assist in executing enqueued tasks. // First thread in allocates the task team atomically. @@ -3162,7 +3201,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, // tasks and execute them. In extra barrier mode, tasks do not sleep // at the separate tasking barrier, so this isn't a problem. 
for (i = 0; i < nthreads; i++) { - volatile void *sleep_loc; + void *sleep_loc; kmp_info_t *thread = threads_data[i].td.td_thr; if (i == this_thr->th.th_info.ds.ds_tid) { @@ -3179,7 +3218,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n", __kmp_gtid_from_thread(this_thr), __kmp_gtid_from_thread(thread))); - __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); + __kmp_null_resume_wrapper(thread); } else { KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n", __kmp_gtid_from_thread(this_thr), @@ -3451,6 +3490,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, TCW_4(task_team->tt.tt_found_tasks, FALSE); TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); task_team->tt.tt_nproc = nthreads = team->t.t_nproc; KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); @@ -3512,9 +3552,11 @@ void __kmp_reap_task_teams(void) { void __kmp_wait_to_unref_task_teams(void) { kmp_info_t *thread; kmp_uint32 spins; + kmp_uint64 time; int done; KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); for (;;) { done = TRUE; @@ -3547,7 +3589,7 @@ void __kmp_wait_to_unref_task_teams(void) { __kmp_gtid_from_thread(thread))); if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { - volatile void *sleep_loc; + void *sleep_loc; // If the thread is sleeping, awaken it. if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != NULL) { @@ -3555,7 +3597,7 @@ void __kmp_wait_to_unref_task_teams(void) { 10, ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); - __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); + __kmp_null_resume_wrapper(thread); } } } @@ -3564,7 +3606,7 @@ void __kmp_wait_to_unref_task_teams(void) { } // If oversubscribed or have waited a bit, yield. 
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } } @@ -3613,6 +3655,7 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); TCW_4(task_team->tt.tt_found_tasks, FALSE); TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team->t.t_nproc); TCW_4(task_team->tt.tt_active, TRUE); @@ -3705,8 +3748,10 @@ void __kmp_task_team_wait( "setting active to false, setting local and team's pointer to NULL\n", __kmp_gtid_from_thread(this_thr), task_team)); KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || - task_team->tt.tt_found_proxy_tasks == TRUE); + task_team->tt.tt_found_proxy_tasks == TRUE || + task_team->tt.tt_hidden_helper_task_encountered == TRUE); TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); TCW_SYNC_4(task_team->tt.tt_active, FALSE); KMP_MB(); @@ -3869,11 +3914,12 @@ static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) { } static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) { +#if KMP_DEBUG kmp_int32 children = 0; - // Predecrement simulated by "- 1" calculation - children = - KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; + children = -1 + +#endif + KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); KMP_DEBUG_ASSERT(children >= 0); // Remove the imaginary children @@ -3936,7 +3982,7 @@ void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) { // This should be similar to start_k = __kmp_get_random( thread ) % nthreads // but we cannot use __kmp_get_random here - kmp_int32 start_k = start; + kmp_int32 start_k = start % nthreads; kmp_int32 pass = 1; kmp_int32 k = start_k; diff --git a/contrib/libs/cxxsupp/openmp/kmp_utility.cpp b/contrib/libs/cxxsupp/openmp/kmp_utility.cpp index 6531536f5d..9465f720e0 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_utility.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_utility.cpp @@ -135,7 +135,7 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) { p->initialized = 1; - p->sse2 = 1; // Assume SSE2 by default. + p->flags.sse2 = 1; // Assume SSE2 by default. __kmp_x86_cpuid(0, 0, &buf); @@ -175,7 +175,7 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) { data[i] = (t & 0xff); } - p->sse2 = (buf.edx >> 26) & 1; + p->flags.sse2 = (buf.edx >> 26) & 1; #ifdef KMP_DEBUG @@ -253,15 +253,21 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) { i, buf.eax, buf.ebx, buf.ecx, buf.edx)); } #endif -#if KMP_USE_ADAPTIVE_LOCKS - p->rtm = 0; + p->flags.rtm = 0; + p->flags.hybrid = 0; if (max_arg > 7) { /* RTM bit CPUID.07:EBX, bit 11 */ + /* HYRBID bit CPUID.07:EDX, bit 15 */ __kmp_x86_cpuid(7, 0, &buf); - p->rtm = (buf.ebx >> 11) & 1; - KA_TRACE(trace_level, (" RTM")); + p->flags.rtm = (buf.ebx >> 11) & 1; + p->flags.hybrid = (buf.edx >> 15) & 1; + if (p->flags.rtm) { + KA_TRACE(trace_level, (" RTM")); + } + if (p->flags.hybrid) { + KA_TRACE(trace_level, (" HYBRID")); + } } -#endif } { // Parse CPU brand string for frequency, saving the string for later. 
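Aside on the kmp_utility.cpp hunk above: it moves the CPUID feature bits (SSE2, RTM, and the new hybrid-CPU indicator) into a packed flags structure and reads RTM from CPUID.07H:EBX bit 11 and the hybrid flag from CPUID.07H:EDX bit 15, guarded by the max_arg > 7 check. The following is a minimal standalone sketch of that detection on x86, using the compiler-provided <cpuid.h> helpers (recent GCC/Clang) in place of the runtime's __kmp_x86_cpuid wrapper; the struct and function names below are illustrative only and not part of the OpenMP runtime.

#include <cpuid.h>  // GCC/Clang x86 helper header; stands in for __kmp_x86_cpuid here
#include <cstdio>

// Hypothetical stand-in for the runtime's packed CPU-feature flags.
struct cpu_flags_sketch {
  unsigned sse2 : 1;
  unsigned rtm : 1;
  unsigned hybrid : 1;
};

static cpu_flags_sketch query_flags() {
  cpu_flags_sketch f = {};
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  // Leaf 1: EDX bit 26 reports SSE2 support.
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    f.sse2 = (edx >> 26) & 1;
  // Leaf 7, subleaf 0: EBX bit 11 is RTM, EDX bit 15 is the hybrid-part flag.
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
    f.rtm = (ebx >> 11) & 1;
    f.hybrid = (edx >> 15) & 1;
  }
  return f;
}

int main() {
  cpu_flags_sketch f = query_flags();
  std::printf("sse2=%d rtm=%d hybrid=%d\n", (int)f.sse2, (int)f.rtm, (int)f.hybrid);
  return 0;
}

Both helpers return 0 when the requested leaf is above the CPU's maximum supported leaf, which plays the same role as the max_arg > 7 guard in the hunk above.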
diff --git a/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp b/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp index cabb5722f4..d41ddf231e 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp @@ -33,6 +33,10 @@ template <bool C, bool S> void __kmp_mwait_64(int th_gtid, kmp_flag_64<C, S> *flag) { __kmp_mwait_template(th_gtid, flag); } +template <bool C, bool S> +void __kmp_atomic_mwait_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag) { + __kmp_mwait_template(th_gtid, flag); +} void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_mwait_template(th_gtid, flag); } @@ -40,4 +44,8 @@ void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) { template void __kmp_mwait_32<false, false>(int, kmp_flag_32<false, false> *); template void __kmp_mwait_64<false, true>(int, kmp_flag_64<false, true> *); template void __kmp_mwait_64<true, false>(int, kmp_flag_64<true, false> *); +template void +__kmp_atomic_mwait_64<false, true>(int, kmp_atomic_flag_64<false, true> *); +template void +__kmp_atomic_mwait_64<true, false>(int, kmp_atomic_flag_64<true, false> *); #endif diff --git a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h index d528ce9f18..b32cb15de1 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h +++ b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h @@ -33,96 +33,288 @@ higher level operations such as barriers and fork/join. @{ */ -/*! - * The flag_type describes the storage used for the flag. - */ -enum flag_type { - flag32, /**< 32 bit flags */ - flag64, /**< 64 bit flags */ - flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */ -}; - struct flag_properties { unsigned int type : 16; unsigned int reserved : 16; }; -/*! 
- * Base class for wait/release volatile flag - */ -template <typename P> class kmp_flag_native { - volatile P *loc; - flag_properties t; +template <enum flag_type FlagType> struct flag_traits {}; + +template <> struct flag_traits<flag32> { + typedef kmp_uint32 flag_t; + static const flag_type t = flag32; + static inline flag_t tcr(flag_t f) { return TCR_4(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR32(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND32(f, v); + } +}; + +template <> struct flag_traits<atomic_flag64> { + typedef kmp_uint64 flag_t; + static const flag_type t = atomic_flag64; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR64(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND64(f, v); + } +}; + +template <> struct flag_traits<flag64> { + typedef kmp_uint64 flag_t; + static const flag_type t = flag64; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR64(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND64(f, v); + } +}; + +template <> struct flag_traits<flag_oncore> { + typedef kmp_uint64 flag_t; + static const flag_type t = flag_oncore; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR64(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND64(f, v); + } +}; + +/*! Base class for all flags */ +template <flag_type FlagType> class kmp_flag { +protected: + flag_properties t; /**< "Type" of the flag in loc */ + kmp_info_t *waiting_threads[1]; /**< Threads sleeping on this thread. */ + kmp_uint32 num_waiting_threads; /**< #threads sleeping on this thread. */ + std::atomic<bool> *sleepLoc; public: - typedef P flag_t; - kmp_flag_native(volatile P *p, flag_type ft) - : loc(p), t({(short unsigned int)ft, 0U}) {} - volatile P *get() { return loc; } - void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); } - void set(volatile P *new_loc) { loc = new_loc; } + typedef flag_traits<FlagType> traits_type; + kmp_flag() : t({FlagType, 0U}), num_waiting_threads(0), sleepLoc(nullptr) {} + kmp_flag(int nwaiters) + : t({FlagType, 0U}), num_waiting_threads(nwaiters), sleepLoc(nullptr) {} + kmp_flag(std::atomic<bool> *sloc) + : t({FlagType, 0U}), num_waiting_threads(0), sleepLoc(sloc) {} + /*! @result the flag_type */ flag_type get_type() { return (flag_type)(t.type); } - P load() { return *loc; } - void store(P val) { *loc = val; } + + /*! 
param i in index into waiting_threads + * @result the thread that is waiting at index i */ + kmp_info_t *get_waiter(kmp_uint32 i) { + KMP_DEBUG_ASSERT(i < num_waiting_threads); + return waiting_threads[i]; + } + /*! @result num_waiting_threads */ + kmp_uint32 get_num_waiters() { return num_waiting_threads; } + /*! @param thr in the thread which is now waiting + * Insert a waiting thread at index 0. */ + void set_waiter(kmp_info_t *thr) { + waiting_threads[0] = thr; + num_waiting_threads = 1; + } + enum barrier_type get_bt() { return bs_last_barrier; } }; -/*! - * Base class for wait/release atomic flag - */ -template <typename P> class kmp_flag { - std::atomic<P> - *loc; /**< Pointer to the flag storage that is modified by another thread - */ - flag_properties t; /**< "Type" of the flag in loc */ +/*! Base class for wait/release volatile flag */ +template <typename PtrType, flag_type FlagType, bool Sleepable> +class kmp_flag_native : public kmp_flag<FlagType> { +protected: + volatile PtrType *loc; + PtrType checker; /**< When flag==checker, it has been released. */ + typedef flag_traits<FlagType> traits_type; + public: - typedef P flag_t; - kmp_flag(std::atomic<P> *p, flag_type ft) - : loc(p), t({(short unsigned int)ft, 0U}) {} - /*! - * @result the pointer to the actual flag - */ - std::atomic<P> *get() { return loc; } - /*! - * @result void* pointer to the actual flag - */ + typedef PtrType flag_t; + kmp_flag_native(volatile PtrType *p) : kmp_flag<FlagType>(), loc(p) {} + kmp_flag_native(volatile PtrType *p, kmp_info_t *thr) + : kmp_flag<FlagType>(1), loc(p) { + this->waiting_threads[0] = thr; + } + kmp_flag_native(volatile PtrType *p, PtrType c) + : kmp_flag<FlagType>(), loc(p), checker(c) {} + kmp_flag_native(volatile PtrType *p, PtrType c, std::atomic<bool> *sloc) + : kmp_flag<FlagType>(sloc), loc(p), checker(c) {} + virtual ~kmp_flag_native() {} + void *operator new(size_t size) { return __kmp_allocate(size); } + void operator delete(void *p) { __kmp_free(p); } + volatile PtrType *get() { return loc; } + void *get_void_p() { return RCAST(void *, CCAST(PtrType *, loc)); } + void set(volatile PtrType *new_loc) { loc = new_loc; } + PtrType load() { return *loc; } + void store(PtrType val) { *loc = val; } + /*! @result true if the flag object has been released. */ + virtual bool done_check() { + if (Sleepable && !(this->sleepLoc)) + return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) == + checker; + else + return traits_type::tcr(*(this->get())) == checker; + } + /*! @param old_loc in old value of flag + * @result true if the flag's old value indicates it was released. */ + virtual bool done_check_val(PtrType old_loc) { return old_loc == checker; } + /*! @result true if the flag object is not yet released. + * Used in __kmp_wait_template like: + * @code + * while (flag.notdone_check()) { pause(); } + * @endcode */ + virtual bool notdone_check() { + return traits_type::tcr(*(this->get())) != checker; + } + /*! @result Actual flag value before release was applied. + * Trigger all waiting threads to run by modifying flag to release state. */ + void internal_release() { + (void)traits_type::test_then_add4((volatile PtrType *)this->get()); + } + /*! @result Actual flag value before sleep bit(s) set. + * Notes that there is at least one thread sleeping on the flag by setting + * sleep bit(s). 
*/ + PtrType set_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(true); + return *(this->get()); + } + return traits_type::test_then_or((volatile PtrType *)this->get(), + KMP_BARRIER_SLEEP_STATE); + } + /*! @result Actual flag value before sleep bit(s) cleared. + * Notes that there are no longer threads sleeping on the flag by clearing + * sleep bit(s). */ + void unset_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(false); + return; + } + traits_type::test_then_and((volatile PtrType *)this->get(), + ~KMP_BARRIER_SLEEP_STATE); + } + /*! @param old_loc in old value of flag + * Test if there are threads sleeping on the flag's old value in old_loc. */ + bool is_sleeping_val(PtrType old_loc) { + if (this->sleepLoc) + return this->sleepLoc->load(); + return old_loc & KMP_BARRIER_SLEEP_STATE; + } + /*! Test whether there are threads sleeping on the flag. */ + bool is_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(*(this->get())); + } + bool is_any_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(*(this->get())); + } + kmp_uint8 *get_stolen() { return NULL; } +}; + +/*! Base class for wait/release atomic flag */ +template <typename PtrType, flag_type FlagType, bool Sleepable> +class kmp_flag_atomic : public kmp_flag<FlagType> { +protected: + std::atomic<PtrType> *loc; /**< Pointer to flag location to wait on */ + PtrType checker; /**< Flag == checker means it has been released. */ +public: + typedef flag_traits<FlagType> traits_type; + typedef PtrType flag_t; + kmp_flag_atomic(std::atomic<PtrType> *p) : kmp_flag<FlagType>(), loc(p) {} + kmp_flag_atomic(std::atomic<PtrType> *p, kmp_info_t *thr) + : kmp_flag<FlagType>(1), loc(p) { + this->waiting_threads[0] = thr; + } + kmp_flag_atomic(std::atomic<PtrType> *p, PtrType c) + : kmp_flag<FlagType>(), loc(p), checker(c) {} + kmp_flag_atomic(std::atomic<PtrType> *p, PtrType c, std::atomic<bool> *sloc) + : kmp_flag<FlagType>(sloc), loc(p), checker(c) {} + /*! @result the pointer to the actual flag */ + std::atomic<PtrType> *get() { return loc; } + /*! @result void* pointer to the actual flag */ void *get_void_p() { return RCAST(void *, loc); } - /*! - * @param new_loc in set loc to point at new_loc - */ - void set(std::atomic<P> *new_loc) { loc = new_loc; } - /*! - * @result the flag_type - */ - flag_type get_type() { return (flag_type)(t.type); } - /*! - * @result flag value - */ - P load() { return loc->load(std::memory_order_acquire); } - /*! - * @param val the new flag value to be stored - */ - void store(P val) { loc->store(val, std::memory_order_release); } - // Derived classes must provide the following: - /* - kmp_info_t * get_waiter(kmp_uint32 i); - kmp_uint32 get_num_waiters(); - bool done_check(); - bool done_check_val(P old_loc); - bool notdone_check(); - P internal_release(); - void suspend(int th_gtid); - void mwait(int th_gtid); - void resume(int th_gtid); - P set_sleeping(); - P unset_sleeping(); - bool is_sleeping(); - bool is_any_sleeping(); - bool is_sleeping_val(P old_loc); - int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, - int *thread_finished - USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 - is_constrained); - */ + /*! @param new_loc in set loc to point at new_loc */ + void set(std::atomic<PtrType> *new_loc) { loc = new_loc; } + /*! @result flag value */ + PtrType load() { return loc->load(std::memory_order_acquire); } + /*! 
@param val the new flag value to be stored */ + void store(PtrType val) { loc->store(val, std::memory_order_release); } + /*! @result true if the flag object has been released. */ + bool done_check() { + if (Sleepable && !(this->sleepLoc)) + return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker; + else + return this->load() == checker; + } + /*! @param old_loc in old value of flag + * @result true if the flag's old value indicates it was released. */ + bool done_check_val(PtrType old_loc) { return old_loc == checker; } + /*! @result true if the flag object is not yet released. + * Used in __kmp_wait_template like: + * @code + * while (flag.notdone_check()) { pause(); } + * @endcode */ + bool notdone_check() { return this->load() != checker; } + /*! @result Actual flag value before release was applied. + * Trigger all waiting threads to run by modifying flag to release state. */ + void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); } + /*! @result Actual flag value before sleep bit(s) set. + * Notes that there is at least one thread sleeping on the flag by setting + * sleep bit(s). */ + PtrType set_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(true); + return *(this->get()); + } + return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE); + } + /*! @result Actual flag value before sleep bit(s) cleared. + * Notes that there are no longer threads sleeping on the flag by clearing + * sleep bit(s). */ + void unset_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(false); + return; + } + KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE); + } + /*! @param old_loc in old value of flag + * Test whether there are threads sleeping on flag's old value in old_loc. */ + bool is_sleeping_val(PtrType old_loc) { + if (this->sleepLoc) + return this->sleepLoc->load(); + return old_loc & KMP_BARRIER_SLEEP_STATE; + } + /*! Test whether there are threads sleeping on the flag. */ + bool is_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(this->load()); + } + bool is_any_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(this->load()); + } + kmp_uint8 *get_stolen() { return NULL; } }; #if OMPT_SUPPORT @@ -185,6 +377,7 @@ __kmp_wait_template(kmp_info_t *this_thr, #else kmp_uint32 hibernate; #endif + kmp_uint64 time; KMP_FSYNC_SPIN_INIT(spin, NULL); if (flag->done_check()) { @@ -264,8 +457,9 @@ final_spin=FALSE) ompt_entry_state = this_thr->th.ompt_thread_info.state; if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit || KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) { - ompt_lw_taskteam_t *team = - this_thr->th.th_team->t.ompt_serialized_team_info; + ompt_lw_taskteam_t *team = NULL; + if (this_thr->th.th_team) + team = this_thr->th.th_team->t.ompt_serialized_team_info; if (team) { tId = &(team->ompt_task_info.task_data); } else { @@ -283,6 +477,7 @@ final_spin=FALSE) #endif KMP_INIT_YIELD(spins); // Setup for waiting + KMP_INIT_BACKOFF(time); if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || __kmp_pause_status == kmp_soft_paused) { @@ -340,11 +535,11 @@ final_spin=FALSE) disabled (KMP_TASKING=0). 
*/ if (task_team != NULL) { if (TCR_SYNC_4(task_team->tt.tt_active)) { - if (KMP_TASKING_ENABLED(task_team)) + if (KMP_TASKING_ENABLED(task_team)) { flag->execute_tasks( this_thr, th_gtid, final_spin, &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); - else + } else this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; } else { KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)); @@ -370,7 +565,7 @@ final_spin=FALSE) // If we are oversubscribed, or have waited a bit (and // KMP_LIBRARY=throughput), then yield - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); #if KMP_STATS_ENABLED // Check if thread has been signalled to idle state @@ -557,6 +752,7 @@ static inline void __kmp_mwait_template(int th_gtid, C *flag) { else { // if flag changes here, wake-up happens immediately TCW_PTR(th->th.th_sleep_loc, (void *)flag); + th->th.th_sleep_loc_type = flag->get_type(); __kmp_unlock_suspend_mx(th); KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid)); #if KMP_HAVE_UMWAIT @@ -574,6 +770,7 @@ static inline void __kmp_mwait_template(int th_gtid, C *flag) { if (flag->is_sleeping()) flag->unset_sleeping(); TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; } // Mark thread as active again th->th.th_active = TRUE; @@ -624,251 +821,15 @@ template <class C> static inline void __kmp_release_template(C *flag) { } } -template <typename FlagType> struct flag_traits {}; - -template <> struct flag_traits<kmp_uint32> { - typedef kmp_uint32 flag_t; - static const flag_type t = flag32; - static inline flag_t tcr(flag_t f) { return TCR_4(f); } - static inline flag_t test_then_add4(volatile flag_t *f) { - return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f)); - } - static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_OR32(f, v); - } - static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_AND32(f, v); - } -}; - -template <> struct flag_traits<kmp_uint64> { - typedef kmp_uint64 flag_t; - static const flag_type t = flag64; - static inline flag_t tcr(flag_t f) { return TCR_8(f); } - static inline flag_t test_then_add4(volatile flag_t *f) { - return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); - } - static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_OR64(f, v); - } - static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_AND64(f, v); - } -}; - -// Basic flag that does not use C11 Atomics -template <typename FlagType, bool Sleepable> -class kmp_basic_flag_native : public kmp_flag_native<FlagType> { - typedef flag_traits<FlagType> traits_type; - FlagType checker; /**< Value to compare flag to to check if flag has been - released. */ - kmp_info_t - *waiting_threads[1]; /**< Array of threads sleeping on this thread. */ - kmp_uint32 - num_waiting_threads; /**< Number of threads sleeping on this thread. */ -public: - kmp_basic_flag_native(volatile FlagType *p) - : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(0) {} - kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr) - : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(1) { - waiting_threads[0] = thr; - } - kmp_basic_flag_native(volatile FlagType *p, FlagType c) - : kmp_flag_native<FlagType>(p, traits_type::t), checker(c), - num_waiting_threads(0) {} - /*! 
- * param i in index into waiting_threads - * @result the thread that is waiting at index i - */ - kmp_info_t *get_waiter(kmp_uint32 i) { - KMP_DEBUG_ASSERT(i < num_waiting_threads); - return waiting_threads[i]; - } - /*! - * @result num_waiting_threads - */ - kmp_uint32 get_num_waiters() { return num_waiting_threads; } - /*! - * @param thr in the thread which is now waiting - * - * Insert a waiting thread at index 0. - */ - void set_waiter(kmp_info_t *thr) { - waiting_threads[0] = thr; - num_waiting_threads = 1; - } - /*! - * @result true if the flag object has been released. - */ - bool done_check() { - if (Sleepable) - return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) == - checker; - else - return traits_type::tcr(*(this->get())) == checker; - } - /*! - * @param old_loc in old value of flag - * @result true if the flag's old value indicates it was released. - */ - bool done_check_val(FlagType old_loc) { return old_loc == checker; } - /*! - * @result true if the flag object is not yet released. - * Used in __kmp_wait_template like: - * @code - * while (flag.notdone_check()) { pause(); } - * @endcode - */ - bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; } - /*! - * @result Actual flag value before release was applied. - * Trigger all waiting threads to run by modifying flag to release state. - */ - void internal_release() { - (void)traits_type::test_then_add4((volatile FlagType *)this->get()); - } - /*! - * @result Actual flag value before sleep bit(s) set. - * Notes that there is at least one thread sleeping on the flag by setting - * sleep bit(s). - */ - FlagType set_sleeping() { - return traits_type::test_then_or((volatile FlagType *)this->get(), - KMP_BARRIER_SLEEP_STATE); - } - /*! - * @result Actual flag value before sleep bit(s) cleared. - * Notes that there are no longer threads sleeping on the flag by clearing - * sleep bit(s). - */ - FlagType unset_sleeping() { - return traits_type::test_then_and((volatile FlagType *)this->get(), - ~KMP_BARRIER_SLEEP_STATE); - } - /*! - * @param old_loc in old value of flag - * Test whether there are threads sleeping on the flag's old value in old_loc. - */ - bool is_sleeping_val(FlagType old_loc) { - return old_loc & KMP_BARRIER_SLEEP_STATE; - } - /*! - * Test whether there are threads sleeping on the flag. - */ - bool is_sleeping() { return is_sleeping_val(*(this->get())); } - bool is_any_sleeping() { return is_sleeping_val(*(this->get())); } - kmp_uint8 *get_stolen() { return NULL; } - enum barrier_type get_bt() { return bs_last_barrier; } -}; - -template <typename FlagType, bool Sleepable> -class kmp_basic_flag : public kmp_flag<FlagType> { - typedef flag_traits<FlagType> traits_type; - FlagType checker; /**< Value to compare flag to to check if flag has been - released. */ - kmp_info_t - *waiting_threads[1]; /**< Array of threads sleeping on this thread. */ - kmp_uint32 - num_waiting_threads; /**< Number of threads sleeping on this thread. */ -public: - kmp_basic_flag(std::atomic<FlagType> *p) - : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {} - kmp_basic_flag(std::atomic<FlagType> *p, kmp_info_t *thr) - : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) { - waiting_threads[0] = thr; - } - kmp_basic_flag(std::atomic<FlagType> *p, FlagType c) - : kmp_flag<FlagType>(p, traits_type::t), checker(c), - num_waiting_threads(0) {} - /*! 
- * param i in index into waiting_threads - * @result the thread that is waiting at index i - */ - kmp_info_t *get_waiter(kmp_uint32 i) { - KMP_DEBUG_ASSERT(i < num_waiting_threads); - return waiting_threads[i]; - } - /*! - * @result num_waiting_threads - */ - kmp_uint32 get_num_waiters() { return num_waiting_threads; } - /*! - * @param thr in the thread which is now waiting - * - * Insert a waiting thread at index 0. - */ - void set_waiter(kmp_info_t *thr) { - waiting_threads[0] = thr; - num_waiting_threads = 1; - } - /*! - * @result true if the flag object has been released. - */ - bool done_check() { - if (Sleepable) - return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker; - else - return this->load() == checker; - } - /*! - * @param old_loc in old value of flag - * @result true if the flag's old value indicates it was released. - */ - bool done_check_val(FlagType old_loc) { return old_loc == checker; } - /*! - * @result true if the flag object is not yet released. - * Used in __kmp_wait_template like: - * @code - * while (flag.notdone_check()) { pause(); } - * @endcode - */ - bool notdone_check() { return this->load() != checker; } - /*! - * @result Actual flag value before release was applied. - * Trigger all waiting threads to run by modifying flag to release state. - */ - void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); } - /*! - * @result Actual flag value before sleep bit(s) set. - * Notes that there is at least one thread sleeping on the flag by setting - * sleep bit(s). - */ - FlagType set_sleeping() { - return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE); - } - /*! - * @result Actual flag value before sleep bit(s) cleared. - * Notes that there are no longer threads sleeping on the flag by clearing - * sleep bit(s). - */ - FlagType unset_sleeping() { - return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE); - } - /*! - * @param old_loc in old value of flag - * Test whether there are threads sleeping on the flag's old value in old_loc. - */ - bool is_sleeping_val(FlagType old_loc) { - return old_loc & KMP_BARRIER_SLEEP_STATE; - } - /*! - * Test whether there are threads sleeping on the flag. 
- */ - bool is_sleeping() { return is_sleeping_val(this->load()); } - bool is_any_sleeping() { return is_sleeping_val(this->load()); } - kmp_uint8 *get_stolen() { return NULL; } - enum barrier_type get_bt() { return bs_last_barrier; } -}; - template <bool Cancellable, bool Sleepable> -class kmp_flag_32 : public kmp_basic_flag<kmp_uint32, Sleepable> { +class kmp_flag_32 : public kmp_flag_atomic<kmp_uint32, flag32, Sleepable> { public: kmp_flag_32(std::atomic<kmp_uint32> *p) - : kmp_basic_flag<kmp_uint32, Sleepable>(p) {} + : kmp_flag_atomic<kmp_uint32, flag32, Sleepable>(p) {} kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_info_t *thr) - : kmp_basic_flag<kmp_uint32, Sleepable>(p, thr) {} + : kmp_flag_atomic<kmp_uint32, flag32, Sleepable>(p, thr) {} kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_uint32 c) - : kmp_basic_flag<kmp_uint32, Sleepable>(p, c) {} + : kmp_flag_atomic<kmp_uint32, flag32, Sleepable>(p, c) {} void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); } #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); } @@ -895,14 +856,16 @@ public: }; template <bool Cancellable, bool Sleepable> -class kmp_flag_64 : public kmp_basic_flag_native<kmp_uint64, Sleepable> { +class kmp_flag_64 : public kmp_flag_native<kmp_uint64, flag64, Sleepable> { public: kmp_flag_64(volatile kmp_uint64 *p) - : kmp_basic_flag_native<kmp_uint64, Sleepable>(p) {} + : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr) - : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, thr) {} + : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p, thr) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c) - : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, c) {} + : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p, c) {} + kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c, std::atomic<bool> *loc) + : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p, c, loc) {} void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); } #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); } @@ -928,20 +891,52 @@ public: flag_type get_ptr_type() { return flag64; } }; +template <bool Cancellable, bool Sleepable> +class kmp_atomic_flag_64 + : public kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable> { +public: + kmp_atomic_flag_64(std::atomic<kmp_uint64> *p) + : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p) {} + kmp_atomic_flag_64(std::atomic<kmp_uint64> *p, kmp_info_t *thr) + : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p, thr) {} + kmp_atomic_flag_64(std::atomic<kmp_uint64> *p, kmp_uint64 c) + : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p, c) {} + kmp_atomic_flag_64(std::atomic<kmp_uint64> *p, kmp_uint64 c, + std::atomic<bool> *loc) + : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p, c, loc) {} + void suspend(int th_gtid) { __kmp_atomic_suspend_64(th_gtid, this); } + void mwait(int th_gtid) { __kmp_atomic_mwait_64(th_gtid, this); } + void resume(int th_gtid) { __kmp_atomic_resume_64(th_gtid, this); } + int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_atomic_execute_tasks_64( + this_thr, gtid, this, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); + } + bool wait(kmp_info_t *this_thr, + int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + if (final_spin) + return __kmp_wait_template<kmp_atomic_flag_64, 
TRUE, Cancellable, + Sleepable>( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + else + return __kmp_wait_template<kmp_atomic_flag_64, FALSE, Cancellable, + Sleepable>( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + } + void release() { __kmp_release_template(this); } + flag_type get_ptr_type() { return atomic_flag64; } +}; + // Hierarchical 64-bit on-core barrier instantiation -class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> { - kmp_uint64 checker; - kmp_info_t *waiting_threads[1]; - kmp_uint32 num_waiting_threads; - kmp_uint32 - offset; /**< Portion of flag that is of interest for an operation. */ +class kmp_flag_oncore : public kmp_flag_native<kmp_uint64, flag_oncore, false> { + kmp_uint32 offset; /**< Portion of flag of interest for an operation. */ bool flag_switch; /**< Indicates a switch in flag location. */ enum barrier_type bt; /**< Barrier type. */ - kmp_info_t *this_thr; /**< Thread that may be redirected to different flag - location. */ + kmp_info_t *this_thr; /**< Thread to redirect to different flag location. */ #if USE_ITT_BUILD - void * - itt_sync_obj; /**< ITT object that must be passed to new flag location. */ + void *itt_sync_obj; /**< ITT object to pass to new flag location. */ #endif unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) { return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[offset]; @@ -949,31 +944,26 @@ class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> { public: kmp_flag_oncore(volatile kmp_uint64 *p) - : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0), - flag_switch(false) {} + : kmp_flag_native<kmp_uint64, flag_oncore, false>(p), flag_switch(false) { + } kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx) - : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0), - offset(idx), flag_switch(false) {} + : kmp_flag_native<kmp_uint64, flag_oncore, false>(p), offset(idx), + flag_switch(false), + bt(bs_last_barrier) USE_ITT_BUILD_ARG(itt_sync_obj(nullptr)) {} kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx, enum barrier_type bar_t, kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt)) - : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(c), - num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t), + : kmp_flag_native<kmp_uint64, flag_oncore, false>(p, c), offset(idx), + flag_switch(false), bt(bar_t), this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {} - kmp_info_t *get_waiter(kmp_uint32 i) { - KMP_DEBUG_ASSERT(i < num_waiting_threads); - return waiting_threads[i]; - } - kmp_uint32 get_num_waiters() { return num_waiting_threads; } - void set_waiter(kmp_info_t *thr) { - waiting_threads[0] = thr; - num_waiting_threads = 1; - } - bool done_check_val(kmp_uint64 old_loc) { + virtual ~kmp_flag_oncore() override {} + void *operator new(size_t size) { return __kmp_allocate(size); } + void operator delete(void *p) { __kmp_free(p); } + bool done_check_val(kmp_uint64 old_loc) override { return byteref(&old_loc, offset) == checker; } - bool done_check() { return done_check_val(*get()); } - bool notdone_check() { + bool done_check() override { return done_check_val(*get()); } + bool notdone_check() override { // Calculate flag_switch if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG) flag_switch = true; @@ -997,17 +987,6 @@ public: KMP_TEST_THEN_OR64(get(), mask); } } - kmp_uint64 set_sleeping() { - return KMP_TEST_THEN_OR64(get(), KMP_BARRIER_SLEEP_STATE); - } - kmp_uint64 unset_sleeping() { - return KMP_TEST_THEN_AND64(get(), 
~KMP_BARRIER_SLEEP_STATE); - } - bool is_sleeping_val(kmp_uint64 old_loc) { - return old_loc & KMP_BARRIER_SLEEP_STATE; - } - bool is_sleeping() { return is_sleeping_val(*get()); } - bool is_any_sleeping() { return is_sleeping_val(*get()); } void wait(kmp_info_t *this_thr, int final_spin) { if (final_spin) __kmp_wait_template<kmp_flag_oncore, TRUE>( @@ -1038,27 +1017,39 @@ public: thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); #endif } - kmp_uint8 *get_stolen() { return NULL; } enum barrier_type get_bt() { return bt; } flag_type get_ptr_type() { return flag_oncore; } }; -// Used to wake up threads, volatile void* flag is usually the th_sleep_loc -// associated with int gtid. -static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) { +static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) { + int gtid = __kmp_gtid_from_thread(thr); + void *flag = CCAST(void *, thr->th.th_sleep_loc); + flag_type type = thr->th.th_sleep_loc_type; if (!flag) return; - - switch (RCAST(kmp_flag_64<> *, CCAST(void *, flag))->get_type()) { + // Attempt to wake up a thread: examine its type and call appropriate template + switch (type) { case flag32: - __kmp_resume_32(gtid, (kmp_flag_32<> *)NULL); + __kmp_resume_32(gtid, RCAST(kmp_flag_32<> *, flag)); break; case flag64: - __kmp_resume_64(gtid, (kmp_flag_64<> *)NULL); + __kmp_resume_64(gtid, RCAST(kmp_flag_64<> *, flag)); + break; + case atomic_flag64: + __kmp_atomic_resume_64(gtid, RCAST(kmp_atomic_flag_64<> *, flag)); break; case flag_oncore: - __kmp_resume_oncore(gtid, (kmp_flag_oncore *)NULL); + __kmp_resume_oncore(gtid, RCAST(kmp_flag_oncore *, flag)); + break; +#ifdef KMP_DEBUG + case flag_unset: + KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d is unset\n", type)); break; + default: + KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d does not match any " + "known flag type\n", + type)); +#endif } } diff --git a/contrib/libs/cxxsupp/openmp/omp.h b/contrib/libs/cxxsupp/openmp/omp.h index cb2fe49599..2ddf4f630b 100644 --- a/contrib/libs/cxxsupp/openmp/omp.h +++ b/contrib/libs/cxxsupp/openmp/omp.h @@ -437,14 +437,23 @@ extern omp_allocator_handle_t __KAI_KMPC_CONVENTION omp_get_default_allocator(void); # ifdef __cplusplus extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a = omp_null_allocator); - extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, + omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, + omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_aligned_calloc(size_t align, size_t nmemb, size_t size, + omp_allocator_handle_t a = omp_null_allocator); extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator = omp_null_allocator, omp_allocator_handle_t free_allocator = omp_null_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator); # else extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, + omp_allocator_handle_t a); extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_aligned_calloc(size_t align, size_t 
nmemb, size_t size, + omp_allocator_handle_t a); extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, omp_allocator_handle_t free_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a); diff --git a/contrib/libs/cxxsupp/openmp/ompt-general.cpp b/contrib/libs/cxxsupp/openmp/ompt-general.cpp index 3d8ef041f7..c1468c0c32 100644 --- a/contrib/libs/cxxsupp/openmp/ompt-general.cpp +++ b/contrib/libs/cxxsupp/openmp/ompt-general.cpp @@ -295,9 +295,16 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) { OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success. \n"); OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... ", fname); + dlerror(); // Clear any existing error start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool"); if (!start_tool) { - OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", dlerror()); + char *error = dlerror(); + if (error != NULL) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", error); + } else { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", + "ompt_start_tool = NULL"); + } } else #elif KMP_OS_WINDOWS OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname); diff --git a/contrib/libs/cxxsupp/openmp/ompt-specific.cpp b/contrib/libs/cxxsupp/openmp/ompt-specific.cpp index 1ad0e17ed4..c28b9bd1a6 100644 --- a/contrib/libs/cxxsupp/openmp/ompt-specific.cpp +++ b/contrib/libs/cxxsupp/openmp/ompt-specific.cpp @@ -283,10 +283,6 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, link_lwt->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr); *OMPT_CUR_TEAM_INFO(thr) = tmp_team; - ompt_task_info_t tmp_task = lwt->ompt_task_info; - link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); - *OMPT_CUR_TASK_INFO(thr) = tmp_task; - // link the taskteam into the list of taskteams: ompt_lw_taskteam_t *my_parent = thr->th.th_team->t.ompt_serialized_team_info; @@ -297,6 +293,10 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, ompd_bp_parallel_begin(); } #endif + + ompt_task_info_t tmp_task = lwt->ompt_task_info; + link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); + *OMPT_CUR_TASK_INFO(thr) = tmp_task; } else { // this is the first serialized team, so we just store the values in the // team and drop the taskteam-object @@ -313,6 +313,9 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, void __ompt_lw_taskteam_unlink(kmp_info_t *thr) { ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info; if (lwtask) { + ompt_task_info_t tmp_task = lwtask->ompt_task_info; + lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); + *OMPT_CUR_TASK_INFO(thr) = tmp_task; #if OMPD_SUPPORT if (ompd_state & OMPD_ENABLE_BP) { ompd_bp_parallel_end(); @@ -324,10 +327,6 @@ void __ompt_lw_taskteam_unlink(kmp_info_t *thr) { lwtask->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr); *OMPT_CUR_TEAM_INFO(thr) = tmp_team; - ompt_task_info_t tmp_task = lwtask->ompt_task_info; - lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); - *OMPT_CUR_TASK_INFO(thr) = tmp_task; - if (lwtask->heap) { __kmp_free(lwtask); lwtask = NULL; @@ -365,13 +364,9 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type, if (team == NULL) return 0; ompt_lw_taskteam_t *lwt = NULL, - *next_lwt = LWT_FROM_TEAM(taskdata->td_team), - *prev_lwt = NULL; + *next_lwt = LWT_FROM_TEAM(taskdata->td_team); while (ancestor_level > 0) { - // needed for thread_num - prev_team = team; - prev_lwt = lwt; // next lightweight team (if any) if (lwt) lwt = lwt->parent; @@ -390,6 +385,7 
@@ int __ompt_get_task_info_internal(int ancestor_level, int *type, taskdata = taskdata->td_parent; if (team == NULL) return 0; + prev_team = team; team = team->t.t_parent; if (taskdata) { next_lwt = LWT_FROM_TEAM(taskdata->td_team); @@ -431,9 +427,18 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type, if (thread_num) { if (level == 0) *thread_num = __kmp_get_tid(); - else if (prev_lwt) + else if (lwt) *thread_num = 0; - else + else if (!prev_team) { + // The innermost parallel region contains at least one explicit task. + // The task at level > 0 is either an implicit task that + // corresponds to the mentioned region or one of the explicit tasks + // nested inside the same region. Note that the task isn't the + // innermost explicit tasks (because of condition level > 0). + // Since the task at this level still belongs to the innermost parallel + // region, thread_num is determined the same way as for level==0. + *thread_num = __kmp_get_tid(); + } else *thread_num = prev_team->t.t_master_tid; // *thread_num = team->t.t_master_tid; } diff --git a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp index 42ad1d56f9..5cd6ad6a03 100644 --- a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp +++ b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp @@ -1051,6 +1051,8 @@ void __kmp_reap_worker(kmp_info_t *th) { "exit_val = %p\n", th->th.th_info.ds.ds_gtid, exit_val)); } +#else + (void)status; // unused variable #endif /* KMP_DEBUG */ KA_TRACE(10, ("__kmp_reap_worker: done reaping T#%d\n", @@ -1232,7 +1234,7 @@ static void __kmp_atfork_child(void) { // affinity in the parent kmp_set_thread_affinity_mask_initial(); #endif - // Set default not to bind threads tightly in the child (we’re expecting + // Set default not to bind threads tightly in the child (we're expecting // over-subscription after the fork and this can improve things for // scripting languages that use OpenMP inside process-parallel code). __kmp_affinity_type = affinity_none; @@ -1407,9 +1409,13 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { /* TODO: shouldn't this use release semantics to ensure that __kmp_suspend_initialize_thread gets called first? 
*/ old_spin = flag->set_sleeping(); + TCW_PTR(th->th.th_sleep_loc, (void *)flag); + th->th.th_sleep_loc_type = flag->get_type(); if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) { flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; __kmp_unlock_suspend_mx(th); return; } @@ -1417,8 +1423,10 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { " was %x\n", th_gtid, flag->get(), flag->load(), old_spin)); - if (flag->done_check_val(old_spin)) { - old_spin = flag->unset_sleeping(); + if (flag->done_check_val(old_spin) || flag->done_check()) { + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; KF_TRACE(5, ("__kmp_suspend_template: T#%d false alarm, reset sleep bit " "for spin(%p)\n", th_gtid, flag->get())); @@ -1427,7 +1435,6 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { "with low probability" return when the condition variable has not been signaled or broadcast */ int deactivated = FALSE; - TCW_PTR(th->th.th_sleep_loc, (void *)flag); while (flag->is_sleeping()) { #ifdef DEBUG_SUSPEND @@ -1449,6 +1456,9 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { deactivated = TRUE; } + KMP_DEBUG_ASSERT(th->th.th_sleep_loc); + KMP_DEBUG_ASSERT(flag->get_type() == th->th.th_sleep_loc_type); + #if USE_SUSPEND_TIMEOUT struct timespec now; struct timeval tval; @@ -1478,6 +1488,18 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { if ((status != 0) && (status != EINTR) && (status != ETIMEDOUT)) { KMP_SYSFAIL("pthread_cond_wait", status); } + + KMP_DEBUG_ASSERT(flag->get_type() == flag->get_ptr_type()); + + if (!flag->is_sleeping() && + ((status == EINTR) || (status == ETIMEDOUT))) { + // if interrupt or timeout, and thread is no longer sleeping, we need to + // make sure sleep_loc gets reset; however, this shouldn't be needed if + // we woke up with resume + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + } #ifdef KMP_DEBUG if (status == ETIMEDOUT) { if (flag->is_sleeping()) { @@ -1487,6 +1509,8 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { KF_TRACE(2, ("__kmp_suspend_template: T#%d timeout wakeup, sleep bit " "not set!\n", th_gtid)); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; } } else if (flag->is_sleeping()) { KF_TRACE(100, @@ -1504,6 +1528,13 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { } } } + // We may have had the loop variable set before entering the loop body; + // so we need to reset sleep_loc. 
+ TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + + KMP_DEBUG_ASSERT(!flag->is_sleeping()); + KMP_DEBUG_ASSERT(!th->th.th_sleep_loc); #ifdef DEBUG_SUSPEND { char buffer[128]; @@ -1525,6 +1556,10 @@ template <bool C, bool S> void __kmp_suspend_64(int th_gtid, kmp_flag_64<C, S> *flag) { __kmp_suspend_template(th_gtid, flag); } +template <bool C, bool S> +void __kmp_atomic_suspend_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag) { + __kmp_suspend_template(th_gtid, flag); +} void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_suspend_template(th_gtid, flag); } @@ -1532,6 +1567,10 @@ void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { template void __kmp_suspend_32<false, false>(int, kmp_flag_32<false, false> *); template void __kmp_suspend_64<false, true>(int, kmp_flag_64<false, true> *); template void __kmp_suspend_64<true, false>(int, kmp_flag_64<true, false> *); +template void +__kmp_atomic_suspend_64<false, true>(int, kmp_atomic_flag_64<false, true> *); +template void +__kmp_atomic_suspend_64<true, false>(int, kmp_atomic_flag_64<true, false> *); /* This routine signals the thread specified by target_gtid to wake up after setting the sleep bit indicated by the flag argument to FALSE. @@ -1554,36 +1593,50 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { __kmp_lock_suspend_mx(th); - if (!flag) { // coming from __kmp_null_resume_wrapper + if (!flag || flag != th->th.th_sleep_loc) { + // coming from __kmp_null_resume_wrapper, or thread is now sleeping on a + // different location; wake up at new location flag = (C *)CCAST(void *, th->th.th_sleep_loc); } // First, check if the flag is null or its type has changed. If so, someone // else woke it up. - if (!flag || flag->get_type() != flag->get_ptr_type()) { // get_ptr_type - // simply shows what flag was cast to + if (!flag) { // Thread doesn't appear to be sleeping on anything KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag(%p)\n", - gtid, target_gtid, NULL)); + gtid, target_gtid, (void *)NULL)); __kmp_unlock_suspend_mx(th); return; + } else if (flag->get_type() != th->th.th_sleep_loc_type) { + // Flag type does not appear to match this function template; possibly the + // thread is sleeping on something else. Try null resume again. 
+ KF_TRACE( + 5, + ("__kmp_resume_template: T#%d retrying, thread T#%d Mismatch flag(%p), " + "spin(%p) type=%d ptr_type=%d\n", + gtid, target_gtid, flag, flag->get(), flag->get_type(), + th->th.th_sleep_loc_type)); + __kmp_unlock_suspend_mx(th); + __kmp_null_resume_wrapper(th); + return; } else { // if multiple threads are sleeping, flag should be internally // referring to a specific thread here - typename C::flag_t old_spin = flag->unset_sleeping(); - if (!flag->is_sleeping_val(old_spin)) { + if (!flag->is_sleeping()) { KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " - "awake: flag(%p): " - "%u => %u\n", - gtid, target_gtid, flag->get(), old_spin, flag->load())); + "awake: flag(%p): %u\n", + gtid, target_gtid, flag->get(), (unsigned int)flag->load())); __kmp_unlock_suspend_mx(th); return; } - KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset " - "sleep bit for flag's loc(%p): " - "%u => %u\n", - gtid, target_gtid, flag->get(), old_spin, flag->load())); } + KMP_DEBUG_ASSERT(flag); + flag->unset_sleeping(); TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + + KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset " + "sleep bit for flag's loc(%p): %u\n", + gtid, target_gtid, flag->get(), (unsigned int)flag->load())); #ifdef DEBUG_SUSPEND { @@ -1609,12 +1662,19 @@ template <bool C, bool S> void __kmp_resume_64(int target_gtid, kmp_flag_64<C, S> *flag) { __kmp_resume_template(target_gtid, flag); } +template <bool C, bool S> +void __kmp_atomic_resume_64(int target_gtid, kmp_atomic_flag_64<C, S> *flag) { + __kmp_resume_template(target_gtid, flag); +} void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { __kmp_resume_template(target_gtid, flag); } template void __kmp_resume_32<false, true>(int, kmp_flag_32<false, true> *); +template void __kmp_resume_32<false, false>(int, kmp_flag_32<false, false> *); template void __kmp_resume_64<false, true>(int, kmp_flag_64<false, true> *); +template void +__kmp_atomic_resume_64<false, true>(int, kmp_atomic_flag_64<false, true> *); #if KMP_USE_MONITOR void __kmp_resume_monitor() { @@ -1741,8 +1801,12 @@ static int __kmp_get_xproc(void) { int r = 0; -#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD || KMP_OS_HURD +#if KMP_OS_LINUX + + __kmp_type_convert(sysconf(_SC_NPROCESSORS_CONF), &(r)); + +#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || \ + KMP_OS_HURD __kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r)); |
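On the __kmp_get_xproc change at the end of this hunk: Linux now queries sysconf(_SC_NPROCESSORS_CONF), which counts processors configured in the system, while the other listed OSes keep sysconf(_SC_NPROCESSORS_ONLN), which counts only processors currently online; the two can differ when CPUs have been taken offline or are hot-pluggable. A small sketch of the distinction using plain POSIX-style sysconf follows; nothing in it is part of the runtime, and both _SC_ names are common extensions rather than strict POSIX, so availability may vary by platform.

#include <unistd.h>
#include <cstdio>

int main() {
  // _SC_NPROCESSORS_CONF: processors configured in the system.
  // _SC_NPROCESSORS_ONLN: processors currently online.
  long conf = sysconf(_SC_NPROCESSORS_CONF);
  long onln = sysconf(_SC_NPROCESSORS_ONLN);
  if (conf < 0 || onln < 0) {
    std::perror("sysconf");
    return 1;
  }
  std::printf("configured=%ld online=%ld\n", conf, onln);
  return 0;
}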