author     thegeorg <thegeorg@yandex-team.ru>  2022-06-03 10:53:07 +0300
committer  thegeorg <thegeorg@yandex-team.ru>  2022-06-03 10:53:07 +0300
commit     a1d4361e379e2c72a469ad1bd64569cbc2db131f (patch)
tree       0caddb240a10132376e4653a31578e117d33f9fd
parent     41f55a521834080d9d703c099c0418cfff3a0546 (diff)
download   ydb-a1d4361e379e2c72a469ad1bd64569cbc2db131f.tar.gz
Update contrib/libs/cxxsupp/openmp to 14.0.4
ref:77c6cdda99b217d50c4deadca11f5611fa0dc168
-rw-r--r--  contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report  |    4
-rw-r--r--  contrib/libs/cxxsupp/openmp/exports_so.txt  |    2
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp.h  |  302
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_affinity.cpp  |  718
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_affinity.h  |  158
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_alloc.cpp  |  115
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_atomic.cpp  |  194
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_atomic.h  |   82
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_barrier.cpp  |  567
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_barrier.h  |  141
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_config.h  |   15
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_csupport.cpp  |   83
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp  |   17
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_dispatch.h  |    4
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h  |  114
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_ftn_os.h  |   35
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_global.cpp  |   13
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp  |  107
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc  |   17
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc  |   13
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_itt.cpp  |    9
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_itt.h  |   22
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_lock.cpp  |  141
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_lock.h  |   49
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_os.h  |   24
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_runtime.cpp  |  430
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_settings.cpp  |  331
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_stats.h  |    4
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_str.cpp  |   25
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_str.h  |    1
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp  |   94
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_taskdeps.h  |    9
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_tasking.cpp  |  212
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_utility.cpp  |   20
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp  |    8
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_wait_release.h  |  741
-rw-r--r--  contrib/libs/cxxsupp/openmp/omp.h  |   11
-rw-r--r--  contrib/libs/cxxsupp/openmp/ompt-general.cpp  |    9
-rw-r--r--  contrib/libs/cxxsupp/openmp/ompt-specific.cpp  |   35
-rw-r--r--  contrib/libs/cxxsupp/openmp/z_Linux_util.cpp  |  102
40 files changed, 3980 insertions, 998 deletions
diff --git a/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report b/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report
index 7e4845f2b7..7fc086467b 100644
--- a/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report
+++ b/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report
@@ -126,6 +126,7 @@ BELONGS ya.make
kmp_atomic.cpp [7:8]
kmp_atomic.h [7:8]
kmp_barrier.cpp [7:8]
+ kmp_barrier.h [7:8]
kmp_cancel.cpp [4:5]
kmp_config.h [6:7]
kmp_csupport.cpp [7:8]
@@ -193,6 +194,7 @@ BELONGS ya.make
kmp_atomic.cpp [7:8]
kmp_atomic.h [7:8]
kmp_barrier.cpp [7:8]
+ kmp_barrier.h [7:8]
kmp_cancel.cpp [4:5]
kmp_config.h [6:7]
kmp_csupport.cpp [7:8]
@@ -326,6 +328,7 @@ BELONGS ya.make
kmp_atomic.cpp [9:9]
kmp_atomic.h [9:9]
kmp_barrier.cpp [9:9]
+ kmp_barrier.h [9:9]
kmp_cancel.cpp [6:6]
kmp_config.h [8:8]
kmp_csupport.cpp [9:9]
@@ -393,6 +396,7 @@ BELONGS ya.make
kmp_atomic.cpp [9:9]
kmp_atomic.h [9:9]
kmp_barrier.cpp [9:9]
+ kmp_barrier.h [9:9]
kmp_cancel.cpp [6:6]
kmp_config.h [8:8]
kmp_csupport.cpp [9:9]
diff --git a/contrib/libs/cxxsupp/openmp/exports_so.txt b/contrib/libs/cxxsupp/openmp/exports_so.txt
index cb79ae72e6..ac188af310 100644
--- a/contrib/libs/cxxsupp/openmp/exports_so.txt
+++ b/contrib/libs/cxxsupp/openmp/exports_so.txt
@@ -120,5 +120,7 @@ GOMP_4.5 {
} GOMP_4.0;
GOMP_5.0 {
} GOMP_4.5;
+GOMP_5.0.1 {
+} GOMP_5.0;
# end of file #
diff --git a/contrib/libs/cxxsupp/openmp/kmp.h b/contrib/libs/cxxsupp/openmp/kmp.h
index 0652080277..9502167474 100644
--- a/contrib/libs/cxxsupp/openmp/kmp.h
+++ b/contrib/libs/cxxsupp/openmp/kmp.h
@@ -115,6 +115,7 @@ typedef unsigned int kmp_hwloc_depth_t;
#include "kmp_debug.h"
#include "kmp_lock.h"
#include "kmp_version.h"
+#include "kmp_barrier.h"
#if USE_DEBUGGER
#error #include "kmp_debugger.h"
#endif
@@ -263,6 +264,7 @@ typedef union kmp_root kmp_root_p;
template <bool C = false, bool S = true> class kmp_flag_32;
template <bool C = false, bool S = true> class kmp_flag_64;
+template <bool C = false, bool S = true> class kmp_atomic_flag_64;
class kmp_flag_oncore;
#ifdef __cplusplus
@@ -616,6 +618,19 @@ enum kmp_hw_t : int {
KMP_HW_LAST
};
+typedef enum kmp_hw_core_type_t {
+ KMP_HW_CORE_TYPE_UNKNOWN = 0x0,
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ KMP_HW_CORE_TYPE_ATOM = 0x20,
+ KMP_HW_CORE_TYPE_CORE = 0x40,
+ KMP_HW_MAX_NUM_CORE_TYPES = 3,
+#else
+ KMP_HW_MAX_NUM_CORE_TYPES = 1,
+#endif
+} kmp_hw_core_type_t;
+
+#define KMP_HW_MAX_NUM_CORE_EFFS 8
+
#define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type) \
KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST)
#define KMP_ASSERT_VALID_HW_TYPE(type) \
@@ -627,6 +642,7 @@ enum kmp_hw_t : int {
const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false);
const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false);
+const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type);
/* Only Linux* OS and Windows* OS support thread affinity. */
#if KMP_AFFINITY_SUPPORTED
@@ -847,6 +863,7 @@ typedef struct kmp_nested_proc_bind_t {
} kmp_nested_proc_bind_t;
extern kmp_nested_proc_bind_t __kmp_nested_proc_bind;
+extern kmp_proc_bind_t __kmp_teams_proc_bind;
extern int __kmp_display_affinity;
extern char *__kmp_affinity_format;
@@ -987,7 +1004,7 @@ typedef omp_memspace_handle_t kmp_memspace_t; // placeholder
typedef struct kmp_allocator_t {
omp_memspace_handle_t memspace;
void **memkind; // pointer to memkind
- int alignment;
+ size_t alignment;
omp_alloctrait_value_t fb;
kmp_allocator_t *fb_data;
kmp_uint64 pool_size;
@@ -1001,13 +1018,25 @@ extern omp_allocator_handle_t __kmpc_init_allocator(int gtid,
extern void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t al);
extern void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t al);
extern omp_allocator_handle_t __kmpc_get_default_allocator(int gtid);
+// external interfaces, may be used by compiler
extern void *__kmpc_alloc(int gtid, size_t sz, omp_allocator_handle_t al);
+extern void *__kmpc_aligned_alloc(int gtid, size_t align, size_t sz,
+ omp_allocator_handle_t al);
extern void *__kmpc_calloc(int gtid, size_t nmemb, size_t sz,
omp_allocator_handle_t al);
extern void *__kmpc_realloc(int gtid, void *ptr, size_t sz,
omp_allocator_handle_t al,
omp_allocator_handle_t free_al);
extern void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al);
+// internal interfaces, contain real implementation
+extern void *__kmp_alloc(int gtid, size_t align, size_t sz,
+ omp_allocator_handle_t al);
+extern void *__kmp_calloc(int gtid, size_t align, size_t nmemb, size_t sz,
+ omp_allocator_handle_t al);
+extern void *__kmp_realloc(int gtid, void *ptr, size_t sz,
+ omp_allocator_handle_t al,
+ omp_allocator_handle_t free_al);
+extern void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al);
extern void __kmp_init_memkind();
extern void __kmp_fini_memkind();
@@ -1066,7 +1095,9 @@ extern void __kmp_init_target_mem();
#define KMP_MIN_BLOCKTIME (0)
#define KMP_MAX_BLOCKTIME \
(INT_MAX) /* Must be this for "infinite" setting the work */
-#define KMP_DEFAULT_BLOCKTIME (200) /* __kmp_blocktime is in milliseconds */
+
+/* __kmp_blocktime is in milliseconds */
+#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200))
#if KMP_USE_MONITOR
#define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(64 * 1024))
@@ -1204,6 +1235,13 @@ typedef struct kmp_cpuid {
kmp_uint32 edx;
} kmp_cpuid_t;
+typedef struct kmp_cpuinfo_flags_t {
+ unsigned sse2 : 1; // 0 if SSE2 instructions are not supported, 1 otherwise.
+ unsigned rtm : 1; // 0 if RTM instructions are not supported, 1 otherwise.
+ unsigned hybrid : 1;
+ unsigned reserved : 29; // Ensure size of 32 bits
+} kmp_cpuinfo_flags_t;
+
typedef struct kmp_cpuinfo {
int initialized; // If 0, other fields are not initialized.
int signature; // CPUID(1).EAX
@@ -1211,8 +1249,7 @@ typedef struct kmp_cpuinfo {
int model; // ( CPUID(1).EAX[19:16] << 4 ) + CPUID(1).EAX[7:4] ( ( Extended
// Model << 4 ) + Model)
int stepping; // CPUID(1).EAX[3:0] ( Stepping )
- int sse2; // 0 if SSE2 instructions are not supported, 1 otherwise.
- int rtm; // 0 if RTM instructions are not supported, 1 otherwise.
+ kmp_cpuinfo_flags_t flags;
int apic_id;
int physical_id;
int logical_id;
@@ -1278,6 +1315,82 @@ static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); }
#define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */
+// User-level Monitor/Mwait
+#if KMP_HAVE_UMWAIT
+// We always try for UMWAIT first
+#if KMP_HAVE_WAITPKG_INTRINSICS
+#if KMP_HAVE_IMMINTRIN_H
+#include <immintrin.h>
+#elif KMP_HAVE_INTRIN_H
+#include <intrin.h>
+#endif
+#endif // KMP_HAVE_WAITPKG_INTRINSICS
+
+KMP_ATTRIBUTE_TARGET_WAITPKG
+static inline int __kmp_tpause(uint32_t hint, uint64_t counter) {
+#if !KMP_HAVE_WAITPKG_INTRINSICS
+ uint32_t timeHi = uint32_t(counter >> 32);
+ uint32_t timeLo = uint32_t(counter & 0xffffffff);
+ char flag;
+ __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n"
+ "setb %0"
+ : "=r"(flag)
+ : "a"(timeLo), "d"(timeHi), "c"(hint)
+ :);
+ return flag;
+#else
+ return _tpause(hint, counter);
+#endif
+}
+KMP_ATTRIBUTE_TARGET_WAITPKG
+static inline void __kmp_umonitor(void *cacheline) {
+#if !KMP_HAVE_WAITPKG_INTRINSICS
+ __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 "
+ :
+ : "a"(cacheline)
+ :);
+#else
+ _umonitor(cacheline);
+#endif
+}
+KMP_ATTRIBUTE_TARGET_WAITPKG
+static inline int __kmp_umwait(uint32_t hint, uint64_t counter) {
+#if !KMP_HAVE_WAITPKG_INTRINSICS
+ uint32_t timeHi = uint32_t(counter >> 32);
+ uint32_t timeLo = uint32_t(counter & 0xffffffff);
+ char flag;
+ __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n"
+ "setb %0"
+ : "=r"(flag)
+ : "a"(timeLo), "d"(timeHi), "c"(hint)
+ :);
+ return flag;
+#else
+ return _umwait(hint, counter);
+#endif
+}
+#elif KMP_HAVE_MWAIT
+#if KMP_OS_UNIX
+#include <pmmintrin.h>
+#else
+#include <intrin.h>
+#endif
+#if KMP_OS_UNIX
+__attribute__((target("sse3")))
+#endif
+static inline void
+__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) {
+ _mm_monitor(cacheline, extensions, hints);
+}
+#if KMP_OS_UNIX
+__attribute__((target("sse3")))
+#endif
+static inline void
+__kmp_mm_mwait(unsigned extensions, unsigned hints) {
+ _mm_mwait(extensions, hints);
+}
+#endif // KMP_HAVE_UMWAIT
+
#if KMP_ARCH_X86
extern void __kmp_x86_pause(void);
#elif KMP_MIC
@@ -1307,6 +1420,9 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
#define KMP_INIT_YIELD(count) \
{ (count) = __kmp_yield_init; }
+#define KMP_INIT_BACKOFF(time) \
+ { (time) = __kmp_pause_init; }
+
#define KMP_OVERSUBSCRIBED \
(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))
@@ -1344,7 +1460,36 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
} \
}
-#define KMP_YIELD_OVERSUB_ELSE_SPIN(count) \
+// If TPAUSE is available & enabled, use it. If oversubscribed, use the slower
+// (C0.2) state, which improves performance of other SMT threads on the same
+// core, otherwise, use the fast (C0.1) default state, or whatever the user has
+// requested. Uses a timed TPAUSE, and exponential backoff. If TPAUSE isn't
+// available, fall back to the regular CPU pause and yield combination.
+#if KMP_HAVE_UMWAIT
+#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \
+ { \
+ if (__kmp_tpause_enabled) { \
+ if (KMP_OVERSUBSCRIBED) { \
+ __kmp_tpause(0, (time)); \
+ } else { \
+ __kmp_tpause(__kmp_tpause_hint, (time)); \
+ } \
+ (time) *= 2; \
+ } else { \
+ KMP_CPU_PAUSE(); \
+ if ((KMP_TRY_YIELD_OVERSUB)) { \
+ __kmp_yield(); \
+ } else if (__kmp_use_yield == 1) { \
+ (count) -= 2; \
+ if (!(count)) { \
+ __kmp_yield(); \
+ (count) = __kmp_yield_next; \
+ } \
+ } \
+ } \
+ }
+#else
+#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \
{ \
KMP_CPU_PAUSE(); \
if ((KMP_TRY_YIELD_OVERSUB)) \
@@ -1357,86 +1502,14 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
} \
} \
}
-
-// User-level Monitor/Mwait
-#if KMP_HAVE_UMWAIT
-// We always try for UMWAIT first
-#if KMP_HAVE_WAITPKG_INTRINSICS
-#if KMP_HAVE_IMMINTRIN_H
-#include <immintrin.h>
-#elif KMP_HAVE_INTRIN_H
-#include <intrin.h>
-#endif
-#endif // KMP_HAVE_WAITPKG_INTRINSICS
-KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline int __kmp_tpause(uint32_t hint, uint64_t counter) {
-#if !KMP_HAVE_WAITPKG_INTRINSICS
- uint32_t timeHi = uint32_t(counter >> 32);
- uint32_t timeLo = uint32_t(counter & 0xffffffff);
- char flag;
- __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n"
- "setb %0"
- : "=r"(flag)
- : "a"(timeLo), "d"(timeHi), "c"(hint)
- :);
- return flag;
-#else
- return _tpause(hint, counter);
-#endif
-}
-KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline void __kmp_umonitor(void *cacheline) {
-#if !KMP_HAVE_WAITPKG_INTRINSICS
- __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 "
- :
- : "a"(cacheline)
- :);
-#else
- _umonitor(cacheline);
-#endif
-}
-KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline int __kmp_umwait(uint32_t hint, uint64_t counter) {
-#if !KMP_HAVE_WAITPKG_INTRINSICS
- uint32_t timeHi = uint32_t(counter >> 32);
- uint32_t timeLo = uint32_t(counter & 0xffffffff);
- char flag;
- __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n"
- "setb %0"
- : "=r"(flag)
- : "a"(timeLo), "d"(timeHi), "c"(hint)
- :);
- return flag;
-#else
- return _umwait(hint, counter);
-#endif
-}
-#elif KMP_HAVE_MWAIT
-#if KMP_OS_UNIX
-#include <pmmintrin.h>
-#else
-#include <intrin.h>
-#endif
-#if KMP_OS_UNIX
-__attribute__((target("sse3")))
-#endif
-static inline void
-__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) {
- _mm_monitor(cacheline, extensions, hints);
-}
-#if KMP_OS_UNIX
-__attribute__((target("sse3")))
-#endif
-static inline void
-__kmp_mm_mwait(unsigned extensions, unsigned hints) {
- _mm_mwait(extensions, hints);
-}
#endif // KMP_HAVE_UMWAIT
/* ------------------------------------------------------------------------ */
/* Support datatypes for the orphaned construct nesting checks. */
/* ------------------------------------------------------------------------ */
+/* When adding to this enum, add its corresponding string in cons_text_c[]
+ * array in kmp_error.cpp */
enum cons_type {
ct_none,
ct_parallel,
@@ -1879,6 +1952,15 @@ typedef struct kmp_disp {
0 // Thread th_reap_state: not safe to reap (tasking)
#define KMP_SAFE_TO_REAP 1 // Thread th_reap_state: safe to reap (not tasking)
+// The flag_type describes the storage used for the flag.
+enum flag_type {
+ flag32, /**< atomic 32 bit flags */
+ flag64, /**< 64 bit flags */
+ atomic_flag64, /**< atomic 64 bit flags */
+ flag_oncore, /**< special 64-bit flag for on-core barrier (hierarchical) */
+ flag_unset
+};
+
enum barrier_type {
bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction
barriers if enabled) */
@@ -1902,6 +1984,7 @@ typedef enum kmp_bar_pat { /* Barrier communication patterns */
bp_hyper_bar = 2, /* Hypercube-embedded tree with min
branching factor 2^n */
bp_hierarchical_bar = 3, /* Machine hierarchy tree */
+ bp_dist_bar = 4, /* Distributed barrier */
bp_last_bar /* Placeholder to mark the end */
} kmp_bar_pat_e;
@@ -2241,22 +2324,26 @@ typedef union kmp_depnode kmp_depnode_t;
typedef struct kmp_depnode_list kmp_depnode_list_t;
typedef struct kmp_dephash_entry kmp_dephash_entry_t;
+// macros for checking dep flag as an integer
#define KMP_DEP_IN 0x1
#define KMP_DEP_OUT 0x2
#define KMP_DEP_INOUT 0x3
#define KMP_DEP_MTX 0x4
#define KMP_DEP_SET 0x8
+#define KMP_DEP_ALL 0x80
// Compiler sends us this info:
typedef struct kmp_depend_info {
kmp_intptr_t base_addr;
size_t len;
union {
- kmp_uint8 flag;
- struct {
+ kmp_uint8 flag; // flag as an unsigned char
+ struct { // flag as a set of 8 bits
unsigned in : 1;
unsigned out : 1;
unsigned mtx : 1;
unsigned set : 1;
+ unsigned unused : 3;
+ unsigned all : 1;
} flags;
};
} kmp_depend_info_t;
@@ -2302,6 +2389,7 @@ struct kmp_dephash_entry {
typedef struct kmp_dephash {
kmp_dephash_entry_t **buckets;
size_t size;
+ kmp_depnode_t *last_all;
size_t generation;
kmp_uint32 nelements;
kmp_uint32 nconflicts;
@@ -2409,13 +2497,6 @@ struct kmp_taskdata { /* aligned during dynamic allocation */
kmp_depnode_t
*td_depnode; // Pointer to graph node if this task has dependencies
kmp_task_team_t *td_task_team;
- // The global thread id of the encountering thread. We need it because when a
- // regular task depends on a hidden helper task, and the hidden helper task
- // is finished on a hidden helper thread, it will call __kmp_release_deps to
- // release all dependences. If now the task is a regular task, we need to pass
- // the encountering gtid such that the task will be picked up and executed by
- // its encountering team instead of hidden helper team.
- kmp_int32 encountering_gtid;
size_t td_size_alloc; // Size of task structure, including shareds etc.
#if defined(KMP_GOMP_COMPAT)
// 4 or 8 byte integers for the loop bounds in GOMP_taskloop
@@ -2626,6 +2707,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
/* while awaiting queuing lock acquire */
volatile void *th_sleep_loc; // this points at a kmp_flag<T>
+ flag_type th_sleep_loc_type; // enum type of flag stored in th_sleep_loc
ident_t *th_ident;
unsigned th_x; // Random number generator data
@@ -2646,6 +2728,9 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
written by the worker thread) */
kmp_uint8 th_active_in_pool; // included in count of #active threads in pool
int th_active; // ! sleeping; 32 bits for TCR/TCW
+ std::atomic<kmp_uint32> th_used_in_team; // Flag indicating use in team
+ // 0 = not used in team; 1 = used in team;
+ // 2 = transitioning to not used in team; 3 = transitioning to used in team
struct cons_header *th_cons; // used for consistency check
#if KMP_USE_HIER_SCHED
// used for hierarchical scheduling
@@ -2825,6 +2910,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
#if USE_ITT_BUILD
void *t_stack_id; // team specific stack stitching id (for ittnotify)
#endif /* USE_ITT_BUILD */
+ distributedBarrier *b; // Distributed barrier data associated with team
} kmp_base_team_t;
union KMP_ALIGN_CACHE kmp_team {
@@ -2949,6 +3035,9 @@ extern int __kmp_storage_map_verbose_specified;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
extern kmp_cpuinfo_t __kmp_cpuinfo;
+static inline bool __kmp_is_hybrid_cpu() { return __kmp_cpuinfo.flags.hybrid; }
+#else
+static inline bool __kmp_is_hybrid_cpu() { return false; }
#endif
extern volatile int __kmp_init_serial;
@@ -3033,6 +3122,7 @@ extern kmp_int32 __kmp_use_yield;
extern kmp_int32 __kmp_use_yield_exp_set;
extern kmp_uint32 __kmp_yield_init;
extern kmp_uint32 __kmp_yield_next;
+extern kmp_uint64 __kmp_pause_init;
/* ------------------------------------------------------------------------- */
extern int __kmp_allThreadsSpecified;
@@ -3235,6 +3325,13 @@ extern int __kmp_mwait_enabled; // Runtime check if ring3 mwait is enabled
extern int __kmp_mwait_hints; // Hints to pass in to mwait
#endif
+#if KMP_HAVE_UMWAIT
+extern int __kmp_waitpkg_enabled; // Runtime check if waitpkg exists
+extern int __kmp_tpause_state; // 0 (default), 1=C0.1, 2=C0.2; from KMP_TPAUSE
+extern int __kmp_tpause_hint; // 1=C0.1 (default), 0=C0.2; from KMP_TPAUSE
+extern int __kmp_tpause_enabled; // 0 (default), 1 (KMP_TPAUSE is non-zero)
+#endif
+
/* ------------------------------------------------------------------------- */
extern kmp_global_t __kmp_global; /* global status */
@@ -4118,6 +4215,10 @@ typedef enum kmp_severity_t {
} kmp_severity_t;
extern void __kmpc_error(ident_t *loc, int severity, const char *message);
+// Support for scope directive
+KMP_EXPORT void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved);
+KMP_EXPORT void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved);
+
#ifdef __cplusplus
}
#endif
@@ -4126,18 +4227,26 @@ template <bool C, bool S>
extern void __kmp_suspend_32(int th_gtid, kmp_flag_32<C, S> *flag);
template <bool C, bool S>
extern void __kmp_suspend_64(int th_gtid, kmp_flag_64<C, S> *flag);
+template <bool C, bool S>
+extern void __kmp_atomic_suspend_64(int th_gtid,
+ kmp_atomic_flag_64<C, S> *flag);
extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag);
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
template <bool C, bool S>
extern void __kmp_mwait_32(int th_gtid, kmp_flag_32<C, S> *flag);
template <bool C, bool S>
extern void __kmp_mwait_64(int th_gtid, kmp_flag_64<C, S> *flag);
+template <bool C, bool S>
+extern void __kmp_atomic_mwait_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag);
extern void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag);
#endif
template <bool C, bool S>
extern void __kmp_resume_32(int target_gtid, kmp_flag_32<C, S> *flag);
template <bool C, bool S>
extern void __kmp_resume_64(int target_gtid, kmp_flag_64<C, S> *flag);
+template <bool C, bool S>
+extern void __kmp_atomic_resume_64(int target_gtid,
+ kmp_atomic_flag_64<C, S> *flag);
extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag);
template <bool C, bool S>
@@ -4156,6 +4265,14 @@ int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid,
void *itt_sync_obj,
#endif /* USE_ITT_BUILD */
kmp_int32 is_constrained);
+template <bool C, bool S>
+int __kmp_atomic_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid,
+ kmp_atomic_flag_64<C, S> *flag,
+ int final_spin, int *thread_finished,
+#if USE_ITT_BUILD
+ void *itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+ kmp_int32 is_constrained);
int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid,
kmp_flag_oncore *flag, int final_spin,
int *thread_finished,
@@ -4213,6 +4330,15 @@ public:
}
}
}
+ /// Instead of erroring out, return non-zero when
+ /// unsuccessful fopen() for any reason
+ int try_open(const char *filename, const char *mode) {
+ KMP_ASSERT(!f);
+ f = fopen(filename, mode);
+ if (!f)
+ return errno;
+ return 0;
+ }
/// Set the FILE* object to stdout and output there
/// No open call should happen before this call.
void set_stdout() {
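
Note on the reworked KMP_YIELD_OVERSUB_ELSE_SPIN above: when TPAUSE is available and enabled, the macro issues a timed TPAUSE (C0.2 when oversubscribed, otherwise C0.1 or the user-requested state) and doubles the backoff value each time; otherwise it keeps the old pause/yield loop. Below is a minimal stand-alone sketch of that idea, not the runtime's code: it assumes an x86-64 CPU with WAITPKG and a build flag such as -mwaitpkg, and the initial/maximum backoff values are invented for the example.

// Sketch only: spin-wait with TPAUSE-based exponential backoff, in the spirit
// of the new KMP_YIELD_OVERSUB_ELSE_SPIN. Build with e.g. g++ -O2 -mwaitpkg.
#include <immintrin.h>   // _tpause (WAITPKG)
#include <x86intrin.h>   // __rdtsc
#include <atomic>
#include <cstdint>

// hint = 1 requests the lighter C0.1 state, hint = 0 the deeper C0.2 state
// (the macro picks C0.2 when the machine is oversubscribed).
static void wait_until_set(const std::atomic<int> &flag, std::uint32_t hint = 1) {
  std::uint64_t backoff = 1024;                 // assumed starting backoff, in TSC ticks
  const std::uint64_t max_backoff = 1u << 20;   // assumed cap
  while (flag.load(std::memory_order_acquire) == 0) {
    // TPAUSE sleeps until the TSC reaches the deadline or another wake event occurs.
    _tpause(hint, __rdtsc() + backoff);
    if (backoff < max_backoff)
      backoff *= 2;                             // exponential backoff, as in the macro
  }
}

A real caller also needs a runtime capability check; the diff gates the TPAUSE path behind __kmp_tpause_enabled and falls back to the pause/yield branch otherwise.
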
diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
index 8b40bd7ecd..414a27fb05 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
@@ -26,6 +26,7 @@
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
#endif
+#include <ctype.h>
// The machine topology
kmp_topology_t *__kmp_topology = nullptr;
@@ -123,6 +124,20 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
return ((plural) ? "unknowns" : "unknown");
}
+const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
+ switch (type) {
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ return "unknown";
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return "Intel Atom(R) processor";
+ case KMP_HW_CORE_TYPE_CORE:
+ return "Intel(R) Core(TM) processor";
+#endif
+ }
+ return "unknown";
+}
+
////////////////////////////////////////////////////////////////////////////////
// kmp_hw_thread_t methods
int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
@@ -174,20 +189,94 @@ void kmp_hw_thread_t::print() const {
for (int i = 0; i < depth; ++i) {
printf("%4d ", ids[i]);
}
+ if (attrs) {
+ if (attrs.is_core_type_valid())
+ printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type()));
+ if (attrs.is_core_eff_valid())
+ printf(" (eff=%d)", attrs.get_core_eff());
+ }
printf("\n");
}
////////////////////////////////////////////////////////////////////////////////
// kmp_topology_t methods
+// Add a layer to the topology based on the ids. Assume the topology
+// is perfectly nested (i.e., so no object has more than one parent)
+void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
+ // Figure out where the layer should go by comparing the ids of the current
+ // layers with the new ids
+ int target_layer;
+ int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
+ int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;
+
+ // Start from the highest layer and work down to find target layer
+ // If new layer is equal to another layer then put the new layer above
+ for (target_layer = 0; target_layer < depth; ++target_layer) {
+ bool layers_equal = true;
+ bool strictly_above_target_layer = false;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ int id = hw_threads[i].ids[target_layer];
+ int new_id = ids[i];
+ if (id != previous_id && new_id == previous_new_id) {
+ // Found the layer we are strictly above
+ strictly_above_target_layer = true;
+ layers_equal = false;
+ break;
+ } else if (id == previous_id && new_id != previous_new_id) {
+ // Found a layer we are below. Move to next layer and check.
+ layers_equal = false;
+ break;
+ }
+ previous_id = id;
+ previous_new_id = new_id;
+ }
+ if (strictly_above_target_layer || layers_equal)
+ break;
+ }
+
+ // Found the layer we are above. Now move everything to accommodate the new
+ // layer. And put the new ids and type into the topology.
+ for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
+ types[j] = types[i];
+ types[target_layer] = type;
+ for (int k = 0; k < num_hw_threads; ++k) {
+ for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
+ hw_threads[k].ids[j] = hw_threads[k].ids[i];
+ hw_threads[k].ids[target_layer] = ids[k];
+ }
+ equivalent[type] = type;
+ depth++;
+}
+
+#if KMP_GROUP_AFFINITY
+// Insert the Windows Processor Group structure into the topology
+void kmp_topology_t::_insert_windows_proc_groups() {
+ // Do not insert the processor group structure for a single group
+ if (__kmp_num_proc_groups == 1)
+ return;
+ kmp_affin_mask_t *mask;
+ int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
+ KMP_CPU_ALLOC(mask);
+ for (int i = 0; i < num_hw_threads; ++i) {
+ KMP_CPU_ZERO(mask);
+ KMP_CPU_SET(hw_threads[i].os_id, mask);
+ ids[i] = __kmp_get_proc_group(mask);
+ }
+ KMP_CPU_FREE(mask);
+ _insert_layer(KMP_HW_PROC_GROUP, ids);
+ __kmp_free(ids);
+}
+#endif
+
// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void kmp_topology_t::_remove_radix1_layers() {
int preference[KMP_HW_LAST];
int top_index1, top_index2;
// Set up preference associative array
- preference[KMP_HW_PROC_GROUP] = 110;
- preference[KMP_HW_SOCKET] = 100;
+ preference[KMP_HW_SOCKET] = 110;
+ preference[KMP_HW_PROC_GROUP] = 100;
preference[KMP_HW_CORE] = 95;
preference[KMP_HW_THREAD] = 90;
preference[KMP_HW_NUMA] = 85;
@@ -305,6 +394,7 @@ void kmp_topology_t::_gather_enumeration_information() {
count[i] = 0;
ratio[i] = 0;
}
+ int core_level = get_level(KMP_HW_CORE);
for (int i = 0; i < num_hw_threads; ++i) {
kmp_hw_thread_t &hw_thread = hw_threads[i];
for (int layer = 0; layer < depth; ++layer) {
@@ -320,6 +410,29 @@ void kmp_topology_t::_gather_enumeration_information() {
ratio[l] = max[l];
max[l] = 1;
}
+ // Figure out the number of different core types
+ // and efficiencies for hybrid CPUs
+ if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) {
+ if (hw_thread.attrs.is_core_eff_valid() &&
+ hw_thread.attrs.core_eff >= num_core_efficiencies) {
+ // Because efficiencies can range from 0 to max efficiency - 1,
+ // the number of efficiencies is max efficiency + 1
+ num_core_efficiencies = hw_thread.attrs.core_eff + 1;
+ }
+ if (hw_thread.attrs.is_core_type_valid()) {
+ bool found = false;
+ for (int j = 0; j < num_core_types; ++j) {
+ if (hw_thread.attrs.get_core_type() == core_types[j]) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES);
+ core_types[num_core_types++] = hw_thread.attrs.get_core_type();
+ }
+ }
+ }
break;
}
}
@@ -333,6 +446,42 @@ void kmp_topology_t::_gather_enumeration_information() {
}
}
+int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr,
+ int above_level,
+ bool find_all) const {
+ int current, current_max;
+ int previous_id[KMP_HW_LAST];
+ for (int i = 0; i < depth; ++i)
+ previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ int core_level = get_level(KMP_HW_CORE);
+ if (find_all)
+ above_level = -1;
+ KMP_ASSERT(above_level < core_level);
+ current_max = 0;
+ current = 0;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ kmp_hw_thread_t &hw_thread = hw_threads[i];
+ if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) {
+ if (current > current_max)
+ current_max = current;
+ current = hw_thread.attrs.contains(attr);
+ } else {
+ for (int level = above_level + 1; level <= core_level; ++level) {
+ if (hw_thread.ids[level] != previous_id[level]) {
+ if (hw_thread.attrs.contains(attr))
+ current++;
+ break;
+ }
+ }
+ }
+ for (int level = 0; level < depth; ++level)
+ previous_id[level] = hw_thread.ids[level];
+ }
+ if (current > current_max)
+ current_max = current;
+ return current_max;
+}
+
// Find out if the topology is uniform
void kmp_topology_t::_discover_uniformity() {
int num = 1;
@@ -406,7 +555,7 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
kmp_topology_t *retval;
// Allocate all data in one large allocation
size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
- sizeof(int) * ndepth * 3;
+ sizeof(int) * (size_t)KMP_HW_LAST * 3;
char *bytes = (char *)__kmp_allocate(size);
retval = (kmp_topology_t *)bytes;
if (nproc > 0) {
@@ -419,8 +568,12 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
int *arr =
(int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
retval->types = (kmp_hw_t *)arr;
- retval->ratio = arr + ndepth;
- retval->count = arr + 2 * ndepth;
+ retval->ratio = arr + (size_t)KMP_HW_LAST;
+ retval->count = arr + 2 * (size_t)KMP_HW_LAST;
+ retval->num_core_efficiencies = 0;
+ retval->num_core_types = 0;
+ for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
+ retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
for (int i = 0; i < ndepth; ++i) {
retval->types[i] = types[i];
@@ -478,6 +631,13 @@ void kmp_topology_t::dump() const {
}
printf("\n");
+ printf("* num_core_eff: %d\n", num_core_efficiencies);
+ printf("* num_core_types: %d\n", num_core_types);
+ printf("* core_types: ");
+ for (int i = 0; i < num_core_types; ++i)
+ printf("%3d ", core_types[i]);
+ printf("\n");
+
printf("* equivalent map:\n");
KMP_FOREACH_HW_TYPE(i) {
const char *key = __kmp_hw_get_keyword(i);
@@ -571,6 +731,29 @@ void kmp_topology_t::print(const char *env_var) const {
}
KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
+ // Hybrid topology information
+ if (__kmp_is_hybrid_cpu()) {
+ for (int i = 0; i < num_core_types; ++i) {
+ kmp_hw_core_type_t core_type = core_types[i];
+ kmp_hw_attr_t attr;
+ attr.clear();
+ attr.set_core_type(core_type);
+ int ncores = get_ncores_with_attr(attr);
+ if (ncores > 0) {
+ KMP_INFORM(TopologyHybrid, env_var, ncores,
+ __kmp_hw_get_core_type_string(core_type));
+ KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS)
+ for (int eff = 0; eff < num_core_efficiencies; ++eff) {
+ attr.set_core_eff(eff);
+ int ncores_with_eff = get_ncores_with_attr(attr);
+ if (ncores_with_eff > 0) {
+ KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff);
+ }
+ }
+ }
+ }
+ }
+
if (num_hw_threads <= 0) {
__kmp_str_buf_free(&buf);
return;
@@ -585,6 +768,10 @@ void kmp_topology_t::print(const char *env_var) const {
__kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
__kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
}
+ if (__kmp_is_hybrid_cpu())
+ __kmp_str_buf_print(
+ &buf, "(%s)",
+ __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type()));
KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
}
@@ -592,6 +779,9 @@ void kmp_topology_t::print(const char *env_var) const {
}
void kmp_topology_t::canonicalize() {
+#if KMP_GROUP_AFFINITY
+ _insert_windows_proc_groups();
+#endif
_remove_radix1_layers();
_gather_enumeration_information();
_discover_uniformity();
@@ -640,6 +830,25 @@ void kmp_topology_t::canonicalize() {
__kmp_hw_get_catalog_string(gran_type));
__kmp_affinity_gran = gran_type;
}
+#if KMP_GROUP_AFFINITY
+ // If more than one processor group exists, and the level of
+ // granularity specified by the user is too coarse, then the
+ // granularity must be adjusted "down" to processor group affinity
+ // because threads can only exist within one processor group.
+ // For example, if a user sets granularity=socket and there are two
+ // processor groups that cover a socket, then the runtime must
+ // restrict the granularity down to the processor group level.
+ if (__kmp_num_proc_groups > 1) {
+ int gran_depth = __kmp_topology->get_level(gran_type);
+ int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP);
+ if (gran_depth >= 0 && proc_group_depth >= 0 &&
+ gran_depth < proc_group_depth) {
+ KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY",
+ __kmp_hw_get_catalog_string(__kmp_affinity_gran));
+ __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP;
+ }
+ }
+#endif
__kmp_affinity_gran_levels = 0;
for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
__kmp_affinity_gran_levels++;
@@ -673,6 +882,56 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
_discover_uniformity();
}
+// Represents running sub IDs for a single core attribute where
+// attribute values have SIZE possibilities.
+template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
+ int last_level; // last level in topology to consider for sub_ids
+ int sub_id[SIZE]; // The sub ID for a given attribute value
+ int prev_sub_id[KMP_HW_LAST];
+ IndexFunc indexer;
+
+public:
+ kmp_sub_ids_t(int last_level) : last_level(last_level) {
+ KMP_ASSERT(last_level < KMP_HW_LAST);
+ for (size_t i = 0; i < SIZE; ++i)
+ sub_id[i] = -1;
+ for (size_t i = 0; i < KMP_HW_LAST; ++i)
+ prev_sub_id[i] = -1;
+ }
+ void update(const kmp_hw_thread_t &hw_thread) {
+ int idx = indexer(hw_thread);
+ KMP_ASSERT(idx < (int)SIZE);
+ for (int level = 0; level <= last_level; ++level) {
+ if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
+ if (level < last_level)
+ sub_id[idx] = -1;
+ sub_id[idx]++;
+ break;
+ }
+ }
+ for (int level = 0; level <= last_level; ++level)
+ prev_sub_id[level] = hw_thread.sub_ids[level];
+ }
+ int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
+ return sub_id[indexer(hw_thread)];
+ }
+};
+
+static kmp_str_buf_t *
+__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
+ bool plural) {
+ __kmp_str_buf_init(buf);
+ if (attr.is_core_type_valid())
+ __kmp_str_buf_print(buf, "%s %s",
+ __kmp_hw_get_core_type_string(attr.get_core_type()),
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, plural));
+ else
+ __kmp_str_buf_print(buf, "%s eff=%d",
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, plural),
+ attr.get_core_eff());
+ return buf;
+}
+
// Apply the KMP_HW_SUBSET envirable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
@@ -681,18 +940,27 @@ bool kmp_topology_t::filter_hw_subset() {
if (!__kmp_hw_subset)
return false;
+ // First, sort the KMP_HW_SUBSET items by the machine topology
+ __kmp_hw_subset->sort();
+
// Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
+ bool using_core_types = false;
+ bool using_core_effs = false;
int hw_subset_depth = __kmp_hw_subset->get_depth();
kmp_hw_t specified[KMP_HW_LAST];
+ int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
KMP_ASSERT(hw_subset_depth > 0);
KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
+ int core_level = get_level(KMP_HW_CORE);
for (int i = 0; i < hw_subset_depth; ++i) {
int max_count;
- int num = __kmp_hw_subset->at(i).num;
- int offset = __kmp_hw_subset->at(i).offset;
- kmp_hw_t type = __kmp_hw_subset->at(i).type;
+ const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
+ int num = item.num[0];
+ int offset = item.offset[0];
+ kmp_hw_t type = item.type;
kmp_hw_t equivalent_type = equivalent[type];
int level = get_level(type);
+ topology_levels[i] = level;
// Check to see if current layer is in detected machine topology
if (equivalent_type != KMP_HW_UNKNOWN) {
@@ -703,8 +971,8 @@ bool kmp_topology_t::filter_hw_subset() {
return false;
}
- // Check to see if current layer has already been specified
- // either directly or through an equivalent type
+ // Check to see if current layer has already been
+ // specified either directly or through an equivalent type
if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
__kmp_hw_get_catalog_string(specified[equivalent_type]));
@@ -712,66 +980,247 @@ bool kmp_topology_t::filter_hw_subset() {
}
specified[equivalent_type] = type;
- // Check to see if layers are in order
- if (i + 1 < hw_subset_depth) {
- kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type);
- if (next_type == KMP_HW_UNKNOWN) {
- KMP_WARNING(
- AffHWSubsetNotExistGeneric,
- __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type));
- return false;
- }
- int next_topology_level = get_level(next_type);
- if (level > next_topology_level) {
- KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type),
- __kmp_hw_get_catalog_string(next_type));
- return false;
- }
- }
-
// Check to see if each layer's num & offset parameters are valid
max_count = get_ratio(level);
- if (max_count < 0 || num + offset > max_count) {
+ if (max_count < 0 ||
+ (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
bool plural = (num > 1);
KMP_WARNING(AffHWSubsetManyGeneric,
__kmp_hw_get_catalog_string(type, plural));
return false;
}
+
+ // Check to see if core attributes are consistent
+ if (core_level == level) {
+ // Determine which core attributes are specified
+ for (int j = 0; j < item.num_attrs; ++j) {
+ if (item.attr[j].is_core_type_valid())
+ using_core_types = true;
+ if (item.attr[j].is_core_eff_valid())
+ using_core_effs = true;
+ }
+
+ // Check if using a single core attribute on non-hybrid arch.
+ // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
+ //
+ // Check if using multiple core attributes on non-hybrid arch.
+ // Ignore all of KMP_HW_SUBSET if this is the case.
+ if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
+ if (item.num_attrs == 1) {
+ if (using_core_effs) {
+ KMP_WARNING(AffHWSubsetIgnoringAttr, "efficiency");
+ } else {
+ KMP_WARNING(AffHWSubsetIgnoringAttr, "core_type");
+ }
+ using_core_effs = false;
+ using_core_types = false;
+ } else {
+ KMP_WARNING(AffHWSubsetAttrsNonHybrid);
+ return false;
+ }
+ }
+
+ // Check if using both core types and core efficiencies together
+ if (using_core_types && using_core_effs) {
+ KMP_WARNING(AffHWSubsetIncompat, "core_type", "efficiency");
+ return false;
+ }
+
+ // Check that core efficiency values are valid
+ if (using_core_effs) {
+ for (int j = 0; j < item.num_attrs; ++j) {
+ if (item.attr[j].is_core_eff_valid()) {
+ int core_eff = item.attr[j].get_core_eff();
+ if (core_eff < 0 || core_eff >= num_core_efficiencies) {
+ kmp_str_buf_t buf;
+ __kmp_str_buf_init(&buf);
+ __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff());
+ __kmp_msg(kmp_ms_warning,
+ KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str),
+ KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1),
+ __kmp_msg_null);
+ __kmp_str_buf_free(&buf);
+ return false;
+ }
+ }
+ }
+ }
+
+ // Check that the number of requested cores with attributes is valid
+ if (using_core_types || using_core_effs) {
+ for (int j = 0; j < item.num_attrs; ++j) {
+ int num = item.num[j];
+ int offset = item.offset[j];
+ int level_above = core_level - 1;
+ if (level_above >= 0) {
+ max_count = get_ncores_with_attr_per(item.attr[j], level_above);
+ if (max_count <= 0 ||
+ (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
+ kmp_str_buf_t buf;
+ __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
+ KMP_WARNING(AffHWSubsetManyGeneric, buf.str);
+ __kmp_str_buf_free(&buf);
+ return false;
+ }
+ }
+ }
+ }
+
+ if ((using_core_types || using_core_effs) && item.num_attrs > 1) {
+ for (int j = 0; j < item.num_attrs; ++j) {
+ // Ambiguous use of specific core attribute + generic core
+ // e.g., 4c & 3c:intel_core or 4c & 3c:eff1
+ if (!item.attr[j]) {
+ kmp_hw_attr_t other_attr;
+ for (int k = 0; k < item.num_attrs; ++k) {
+ if (item.attr[k] != item.attr[j]) {
+ other_attr = item.attr[k];
+ break;
+ }
+ }
+ kmp_str_buf_t buf;
+ __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
+ KMP_WARNING(AffHWSubsetIncompat,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
+ __kmp_str_buf_free(&buf);
+ return false;
+ }
+ // Allow specifying a specific core type or core eff exactly once
+ for (int k = 0; k < j; ++k) {
+ if (!item.attr[j] || !item.attr[k])
+ continue;
+ if (item.attr[k] == item.attr[j]) {
+ kmp_str_buf_t buf;
+ __kmp_hw_get_catalog_core_string(item.attr[j], &buf,
+ item.num[j] > 0);
+ KMP_WARNING(AffHWSubsetAttrRepeat, buf.str);
+ __kmp_str_buf_free(&buf);
+ return false;
+ }
+ }
+ }
+ }
+ }
}
- // Apply the filtered hardware subset
- int new_index = 0;
+ struct core_type_indexer {
+ int operator()(const kmp_hw_thread_t &t) const {
+ switch (t.attrs.get_core_type()) {
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return 1;
+ case KMP_HW_CORE_TYPE_CORE:
+ return 2;
+#endif
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ return 0;
+ }
+ KMP_ASSERT(0);
+ return 0;
+ }
+ };
+ struct core_eff_indexer {
+ int operator()(const kmp_hw_thread_t &t) const {
+ return t.attrs.get_core_eff();
+ }
+ };
+
+ kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids(
+ core_level);
+ kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids(
+ core_level);
+
+ // Determine which hardware threads should be filtered.
+ int num_filtered = 0;
+ bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads);
for (int i = 0; i < num_hw_threads; ++i) {
kmp_hw_thread_t &hw_thread = hw_threads[i];
+ // Update type_sub_id
+ if (using_core_types)
+ core_type_sub_ids.update(hw_thread);
+ if (using_core_effs)
+ core_eff_sub_ids.update(hw_thread);
+
// Check to see if this hardware thread should be filtered
bool should_be_filtered = false;
- for (int level = 0, hw_subset_index = 0;
- level < depth && hw_subset_index < hw_subset_depth; ++level) {
- kmp_hw_t topology_type = types[level];
- auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
- kmp_hw_t hw_subset_type = hw_subset_item.type;
- if (topology_type != hw_subset_type)
+ for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth;
+ ++hw_subset_index) {
+ const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
+ int level = topology_levels[hw_subset_index];
+ if (level == -1)
continue;
- int num = hw_subset_item.num;
- int offset = hw_subset_item.offset;
- hw_subset_index++;
- if (hw_thread.sub_ids[level] < offset ||
- hw_thread.sub_ids[level] >= offset + num) {
- should_be_filtered = true;
- break;
+ if ((using_core_effs || using_core_types) && level == core_level) {
+ // Look for the core attribute in KMP_HW_SUBSET which corresponds
+ // to this hardware thread's core attribute. Use this num,offset plus
+ // the running sub_id for the particular core attribute of this hardware
+ // thread to determine if the hardware thread should be filtered or not.
+ int attr_idx;
+ kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type();
+ int core_eff = hw_thread.attrs.get_core_eff();
+ for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) {
+ if (using_core_types &&
+ hw_subset_item.attr[attr_idx].get_core_type() == core_type)
+ break;
+ if (using_core_effs &&
+ hw_subset_item.attr[attr_idx].get_core_eff() == core_eff)
+ break;
+ }
+ // This core attribute isn't in the KMP_HW_SUBSET so always filter it.
+ if (attr_idx == hw_subset_item.num_attrs) {
+ should_be_filtered = true;
+ break;
+ }
+ int sub_id;
+ int num = hw_subset_item.num[attr_idx];
+ int offset = hw_subset_item.offset[attr_idx];
+ if (using_core_types)
+ sub_id = core_type_sub_ids.get_sub_id(hw_thread);
+ else
+ sub_id = core_eff_sub_ids.get_sub_id(hw_thread);
+ if (sub_id < offset ||
+ (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
+ should_be_filtered = true;
+ break;
+ }
+ } else {
+ int num = hw_subset_item.num[0];
+ int offset = hw_subset_item.offset[0];
+ if (hw_thread.sub_ids[level] < offset ||
+ (num != kmp_hw_subset_t::USE_ALL &&
+ hw_thread.sub_ids[level] >= offset + num)) {
+ should_be_filtered = true;
+ break;
+ }
}
}
- if (!should_be_filtered) {
+ // Collect filtering information
+ filtered[i] = should_be_filtered;
+ if (should_be_filtered)
+ num_filtered++;
+ }
+
+ // One last check that we shouldn't allow filtering entire machine
+ if (num_filtered == num_hw_threads) {
+ KMP_WARNING(AffHWSubsetAllFiltered);
+ __kmp_free(filtered);
+ return false;
+ }
+
+ // Apply the filter
+ int new_index = 0;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ if (!filtered[i]) {
if (i != new_index)
- hw_threads[new_index] = hw_thread;
+ hw_threads[new_index] = hw_threads[i];
new_index++;
} else {
#if KMP_AFFINITY_SUPPORTED
- KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
+ KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask);
#endif
__kmp_avail_proc--;
}
}
+
KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
num_hw_threads = new_index;
@@ -780,6 +1229,7 @@ bool kmp_topology_t::filter_hw_subset() {
_discover_uniformity();
_set_globals();
_set_last_level_cache();
+ __kmp_free(filtered);
return true;
}
@@ -986,7 +1436,67 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
return buf;
}
-void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
+// Return (possibly empty) affinity mask representing the offline CPUs
+// Caller must free the mask
+kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
+ kmp_affin_mask_t *offline;
+ KMP_CPU_ALLOC(offline);
+ KMP_CPU_ZERO(offline);
+#if KMP_OS_LINUX
+ int n, begin_cpu, end_cpu;
+ kmp_safe_raii_file_t offline_file;
+ auto skip_ws = [](FILE *f) {
+ int c;
+ do {
+ c = fgetc(f);
+ } while (isspace(c));
+ if (c != EOF)
+ ungetc(c, f);
+ };
+ // File contains CSV of integer ranges representing the offline CPUs
+ // e.g., 1,2,4-7,9,11-15
+ int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
+ if (status != 0)
+ return offline;
+ while (!feof(offline_file)) {
+ skip_ws(offline_file);
+ n = fscanf(offline_file, "%d", &begin_cpu);
+ if (n != 1)
+ break;
+ skip_ws(offline_file);
+ int c = fgetc(offline_file);
+ if (c == EOF || c == ',') {
+ // Just single CPU
+ end_cpu = begin_cpu;
+ } else if (c == '-') {
+ // Range of CPUs
+ skip_ws(offline_file);
+ n = fscanf(offline_file, "%d", &end_cpu);
+ if (n != 1)
+ break;
+ skip_ws(offline_file);
+ c = fgetc(offline_file); // skip ','
+ } else {
+ // Syntax problem
+ break;
+ }
+ // Ensure a valid range of CPUs
+ if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
+ end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
+ continue;
+ }
+ // Insert [begin_cpu, end_cpu] into offline mask
+ for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
+ KMP_CPU_SET(cpu, offline);
+ }
+ }
+#endif
+ return offline;
+}
+
+// Return the number of available procs
+int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
+ int avail_proc = 0;
KMP_CPU_ZERO(mask);
#if KMP_GROUP_AFFINITY
@@ -999,6 +1509,7 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
int num = __kmp_GetActiveProcessorCount(group);
for (i = 0; i < num; i++) {
KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
+ avail_proc++;
}
}
} else
@@ -1007,10 +1518,18 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
{
int proc;
+ kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
for (proc = 0; proc < __kmp_xproc; proc++) {
+ // Skip offline CPUs
+ if (KMP_CPU_ISSET(proc, offline_cpus))
+ continue;
KMP_CPU_SET(proc, mask);
+ avail_proc++;
}
+ KMP_CPU_FREE(offline_cpus);
}
+
+ return avail_proc;
}
// All of the __kmp_affinity_create_*_map() routines should allocate the
@@ -1156,6 +1675,45 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
return true;
}
+ // Handle multiple types of cores if they exist on the system
+ int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
+
+ typedef struct kmp_hwloc_cpukinds_info_t {
+ int efficiency;
+ kmp_hw_core_type_t core_type;
+ hwloc_bitmap_t mask;
+ } kmp_hwloc_cpukinds_info_t;
+ kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;
+
+ if (nr_cpu_kinds > 0) {
+ unsigned nr_infos;
+ struct hwloc_info_s *infos;
+ cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
+ sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
+ for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
+ cpukinds[idx].efficiency = -1;
+ cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
+ cpukinds[idx].mask = hwloc_bitmap_alloc();
+ if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
+ &cpukinds[idx].efficiency, &nr_infos, &infos,
+ 0) == 0) {
+ for (unsigned i = 0; i < nr_infos; ++i) {
+ if (__kmp_str_match("CoreType", 8, infos[i].name)) {
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
+ cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
+ break;
+ } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
+ cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
+ break;
+ }
+#endif
+ }
+ }
+ }
+ }
+ }
+
root = hwloc_get_root_obj(tp);
// Figure out the depth and types in the topology
@@ -1215,6 +1773,20 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread.clear();
hw_thread.ids[index] = pu->logical_index;
hw_thread.os_id = pu->os_index;
+ // If multiple core types, then set that attribute for the hardware thread
+ if (cpukinds) {
+ int cpukind_index = -1;
+ for (int i = 0; i < nr_cpu_kinds; ++i) {
+ if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
+ cpukind_index = i;
+ break;
+ }
+ }
+ if (cpukind_index >= 0) {
+ hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type);
+ hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
+ }
+ }
index--;
}
obj = pu;
@@ -1258,6 +1830,13 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
if (included)
hw_thread_index++;
}
+
+ // Free the core types information
+ if (cpukinds) {
+ for (int idx = 0; idx < nr_cpu_kinds; ++idx)
+ hwloc_bitmap_free(cpukinds[idx].mask);
+ __kmp_free(cpukinds);
+ }
__kmp_topology->sort_ids();
return true;
}
@@ -1782,6 +2361,26 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
return true;
}
+// Hybrid cpu detection using CPUID.1A
+// Thread should be pinned to processor already
+static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
+ unsigned *native_model_id) {
+ kmp_cpuid buf;
+ __kmp_x86_cpuid(0x1a, 0, &buf);
+ *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
+ switch (*type) {
+ case KMP_HW_CORE_TYPE_ATOM:
+ *efficiency = 0;
+ break;
+ case KMP_HW_CORE_TYPE_CORE:
+ *efficiency = 1;
+ break;
+ default:
+ *efficiency = 0;
+ }
+ *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
+}
+
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on CPUID.B or CPUID.1F
@@ -2051,6 +2650,15 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
}
}
+ // Hybrid information
+ if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
+ kmp_hw_core_type_t type;
+ unsigned native_model_id;
+ int efficiency;
+ __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
+ hw_thread.attrs.set_core_type(type);
+ hw_thread.attrs.set_core_eff(efficiency);
+ }
hw_thread_index++;
}
KMP_ASSERT(hw_thread_index > 0);
@@ -2386,7 +2994,10 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
unsigned val;
if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
goto no_val;
- KMP_ASSERT(nodeIdIndex + level <= maxIndex);
+ // validate the input before using level:
+ if (level > (unsigned)__kmp_xproc) { // level is too big
+ level = __kmp_xproc;
+ }
if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
goto dup_field;
threadInfo[num_avail][nodeIdIndex + level] = val;
@@ -3497,8 +4108,8 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affin_fullMask);
KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
}
- __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
- __kmp_avail_proc = __kmp_xproc;
+ __kmp_avail_proc =
+ __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
#if KMP_OS_WINDOWS
// Set the process affinity mask since threads' affinity
// masks must be subset of process mask in Windows* OS
@@ -4145,14 +4756,19 @@ int __kmp_aux_set_affinity(void **mask) {
int __kmp_aux_get_affinity(void **mask) {
int gtid;
int retval;
+#if KMP_OS_WINDOWS || KMP_DEBUG
kmp_info_t *th;
-
+#endif
if (!KMP_AFFINITY_CAPABLE()) {
return -1;
}
gtid = __kmp_entry_gtid();
+#if KMP_OS_WINDOWS || KMP_DEBUG
th = __kmp_threads[gtid];
+#else
+ (void)gtid; // unused variable
+#endif
KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
KA_TRACE(
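
The new __kmp_affinity_get_offline_cpus() above reads /sys/devices/system/cpu/offline, which holds comma-separated CPU numbers and ranges such as 1,2,4-7,9,11-15, and clears those CPUs from the full affinity mask. The sketch below parses the same format using only the standard library; the function name and the vector return type are choices made for this example, not part of the runtime.

// Sketch only: read the Linux offline-CPU list the way the new helper does,
// but into a std::vector instead of a kmp_affin_mask_t.
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// The file contains comma-separated integers and ranges, e.g. "1,2,4-7,9,11-15".
// When every CPU is online the file is effectively empty and we return nothing.
static std::vector<int> read_offline_cpus() {
  std::vector<int> offline;
  std::ifstream f("/sys/devices/system/cpu/offline");
  if (!f)
    return offline;          // missing file: treat as "no offline CPUs"
  std::string token;
  while (std::getline(f, token, ',')) {
    int begin = 0, end = 0;
    char dash = 0;
    std::istringstream range(token);
    if (!(range >> begin))
      break;                 // syntax problem: stop quietly, like the runtime does
    if (range >> dash >> end && dash == '-') {
      // "4-7": a range of CPUs, keep begin and end as parsed
    } else {
      end = begin;           // "9": a single CPU
    }
    for (int cpu = begin; cpu <= end; ++cpu)
      offline.push_back(cpu);
  }
  return offline;
}

The runtime additionally range-checks begin/end against __kmp_xproc before inserting them into the mask, which a production version of this sketch would want as well.
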
diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.h b/contrib/libs/cxxsupp/openmp/kmp_affinity.h
index 8e72922d2c..ce00362f04 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_affinity.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.h
@@ -15,6 +15,7 @@
#include "kmp.h"
#include "kmp_os.h"
+#include <limits>
#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
@@ -598,6 +599,63 @@ class KMPNativeAffinity : public KMPAffinity {
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */
+// Describe an attribute for a level in the machine topology
+struct kmp_hw_attr_t {
+ int core_type : 8;
+ int core_eff : 8;
+ unsigned valid : 1;
+ unsigned reserved : 15;
+
+ static const int UNKNOWN_CORE_EFF = -1;
+
+ kmp_hw_attr_t()
+ : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
+ valid(0), reserved(0) {}
+ void set_core_type(kmp_hw_core_type_t type) {
+ valid = 1;
+ core_type = type;
+ }
+ void set_core_eff(int eff) {
+ valid = 1;
+ core_eff = eff;
+ }
+ kmp_hw_core_type_t get_core_type() const {
+ return (kmp_hw_core_type_t)core_type;
+ }
+ int get_core_eff() const { return core_eff; }
+ bool is_core_type_valid() const {
+ return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
+ }
+ bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
+ operator bool() const { return valid; }
+ void clear() {
+ core_type = KMP_HW_CORE_TYPE_UNKNOWN;
+ core_eff = UNKNOWN_CORE_EFF;
+ valid = 0;
+ }
+ bool contains(const kmp_hw_attr_t &other) const {
+ if (!valid && !other.valid)
+ return true;
+ if (valid && other.valid) {
+ if (other.is_core_type_valid()) {
+ if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
+ return false;
+ }
+ if (other.is_core_eff_valid()) {
+ if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
+ return false;
+ }
+ return true;
+ }
+ return false;
+ }
+ bool operator==(const kmp_hw_attr_t &rhs) const {
+ return (rhs.valid == valid && rhs.core_eff == core_eff &&
+ rhs.core_type == core_type);
+ }
+ bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
+};
+
class kmp_hw_thread_t {
public:
static const int UNKNOWN_ID = -1;
@@ -607,11 +665,14 @@ public:
int sub_ids[KMP_HW_LAST];
bool leader;
int os_id;
+ kmp_hw_attr_t attrs;
+
void print() const;
void clear() {
for (int i = 0; i < (int)KMP_HW_LAST; ++i)
ids[i] = UNKNOWN_ID;
leader = false;
+ attrs.clear();
}
};
@@ -624,7 +685,9 @@ class kmp_topology_t {
int depth;
- // The following arrays are all 'depth' long
+ // The following arrays are all 'depth' long and have been
+ // allocated to hold up to KMP_HW_LAST number of objects if
+ // needed so layers can be added without reallocation of any array
// Ordered array of the types in the topology
kmp_hw_t *types;
@@ -637,6 +700,12 @@ class kmp_topology_t {
// Storage containing the absolute number of each topology layer
int *count;
+ // The number of core efficiencies. This is only useful for hybrid
+ // topologies. Core efficiencies will range from 0 to num efficiencies - 1
+ int num_core_efficiencies;
+ int num_core_types;
+ kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
+
// The hardware threads array
// hw_threads is num_hw_threads long
// Each hw_thread's ids and sub_ids are depth deep
@@ -652,6 +721,14 @@ class kmp_topology_t {
// Flags describing the topology
flags_t flags;
+ // Insert a new topology layer after allocation
+ void _insert_layer(kmp_hw_t type, const int *ids);
+
+#if KMP_GROUP_AFFINITY
+ // Insert topology information about Windows Processor groups
+ void _insert_windows_proc_groups();
+#endif
+
// Count each item & get the num x's per y
// e.g., get the number of cores and the number of threads per core
// for each (x, y) in (KMP_HW_* , KMP_HW_*)
@@ -675,6 +752,12 @@ class kmp_topology_t {
// Set the last level cache equivalent type
void _set_last_level_cache();
+ // Return the number of cores with a particular attribute, 'attr'.
+ // If 'find_all' is true, then find all cores on the machine, otherwise find
+ // all cores per the layer 'above'
+ int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
+ bool find_all = false) const;
+
public:
// Force use of allocate()/deallocate()
kmp_topology_t() = delete;
@@ -764,6 +847,16 @@ public:
KMP_DEBUG_ASSERT(level >= 0 && level < depth);
return count[level];
}
+ // Return the total number of cores with attribute 'attr'
+ int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
+ return _get_ncores_with_attr(attr, -1, true);
+ }
+ // Return the number of cores with attribute
+ // 'attr' per topology level 'above'
+ int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
+ return _get_ncores_with_attr(attr, above, false);
+ }
+
#if KMP_AFFINITY_SUPPORTED
void sort_compact() {
qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
@@ -773,14 +866,22 @@ public:
void print(const char *env_var = "KMP_AFFINITY") const;
void dump() const;
};
+extern kmp_topology_t *__kmp_topology;
class kmp_hw_subset_t {
+ const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
+
public:
+ // Describe a machine topology item in KMP_HW_SUBSET
struct item_t {
- int num;
kmp_hw_t type;
- int offset;
+ int num_attrs;
+ int num[MAX_ATTRS];
+ int offset[MAX_ATTRS];
+ kmp_hw_attr_t attr[MAX_ATTRS];
};
+  // Put parentheses around max to avoid accidental use of the Windows max macro.
+ const static int USE_ALL = (std::numeric_limits<int>::max)();
private:
int depth;
@@ -790,6 +891,15 @@ private:
bool absolute;
// The set must be able to handle up to KMP_HW_LAST number of layers
KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
+ // Sorting the KMP_HW_SUBSET items to follow topology order
+ // All unknown topology types will be at the beginning of the subset
+ static int hw_subset_compare(const void *i1, const void *i2) {
+ kmp_hw_t type1 = ((const item_t *)i1)->type;
+ kmp_hw_t type2 = ((const item_t *)i2)->type;
+ int level1 = __kmp_topology->get_level(type1);
+ int level2 = __kmp_topology->get_level(type2);
+ return level1 - level2;
+ }
public:
// Force use of allocate()/deallocate()
@@ -816,7 +926,20 @@ public:
}
void set_absolute() { absolute = true; }
bool is_absolute() const { return absolute; }
- void push_back(int num, kmp_hw_t type, int offset) {
+ void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
+ for (int i = 0; i < depth; ++i) {
+ // Found an existing item for this layer type
+ // Add the num, offset, and attr to this item
+ if (items[i].type == type) {
+ int idx = items[i].num_attrs++;
+ if ((size_t)idx >= MAX_ATTRS)
+ return;
+ items[i].num[idx] = num;
+ items[i].offset[idx] = offset;
+ items[i].attr[idx] = attr;
+ return;
+ }
+ }
if (depth == capacity - 1) {
capacity *= 2;
item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
@@ -825,9 +948,11 @@ public:
__kmp_free(items);
items = new_items;
}
- items[depth].num = num;
+ items[depth].num_attrs = 1;
items[depth].type = type;
- items[depth].offset = offset;
+ items[depth].num[0] = num;
+ items[depth].offset[0] = offset;
+ items[depth].attr[0] = attr;
depth++;
set |= (1ull << type);
}
@@ -848,6 +973,10 @@ public:
}
depth--;
}
+ void sort() {
+ KMP_DEBUG_ASSERT(__kmp_topology);
+ qsort(items, depth, sizeof(item_t), hw_subset_compare);
+ }
bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
void dump() const {
printf("**********************\n");
@@ -855,16 +984,25 @@ public:
printf("* depth: %d\n", depth);
printf("* items:\n");
for (int i = 0; i < depth; ++i) {
- printf("num: %d, type: %s, offset: %d\n", items[i].num,
- __kmp_hw_get_keyword(items[i].type), items[i].offset);
+ printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
+ for (int j = 0; j < items[i].num_attrs; ++j) {
+ printf(" num: %d, offset: %d, attr: ", items[i].num[j],
+ items[i].offset[j]);
+ if (!items[i].attr[j]) {
+ printf(" (none)\n");
+ } else {
+ printf(
+ " core_type = %s, core_eff = %d\n",
+ __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
+ items[i].attr[j].get_core_eff());
+ }
+ }
}
printf("* set: 0x%llx\n", set);
printf("* absolute: %d\n", absolute);
printf("**********************\n");
}
};
-
-extern kmp_topology_t *__kmp_topology;
extern kmp_hw_subset_t *__kmp_hw_subset;
/* A structure for holding machine-specific hierarchy info to be computed once
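
The new kmp_hw_attr_t packs an optional core type and core efficiency into a small bitfield, and its contains() method is what lets a KMP_HW_SUBSET filter match a discovered core: an attribute satisfies a filter when every field the filter specifies is present and equal. A self-contained sketch of that containment rule, with simplified field widths and invented enum values standing in for kmp_hw_core_type_t:

// Standalone sketch of the containment rule used by kmp_hw_attr_t::contains.
#include <cassert>

enum CoreType { CORE_UNKNOWN = 0, CORE_EFFICIENT = 1, CORE_PERFORMANCE = 2 };

struct HwAttr {
  CoreType type = CORE_UNKNOWN;
  int eff = -1; // -1 means "efficiency not specified"

  bool has_type() const { return type != CORE_UNKNOWN; }
  bool has_eff() const { return eff != -1; }
  bool valid() const { return has_type() || has_eff(); }

  // true if this attribute satisfies every constraint 'other' specifies
  bool contains(const HwAttr &other) const {
    if (!valid() && !other.valid())
      return true;                 // two empty filters trivially match
    if (valid() && other.valid()) {
      if (other.has_type() && (!has_type() || type != other.type))
        return false;
      if (other.has_eff() && (!has_eff() || eff != other.eff))
        return false;
      return true;
    }
    return false;                  // one side empty, the other constrained
  }
};

int main() {
  HwAttr core;                       // attributes of a discovered core
  core.type = CORE_PERFORMANCE;
  core.eff = 1;

  HwAttr filter;                     // KMP_HW_SUBSET-style filter
  filter.type = CORE_PERFORMANCE;
  assert(core.contains(filter));     // type matches, efficiency unconstrained

  filter.eff = 0;
  assert(!core.contains(filter));    // efficiency mismatch now rejects the core
  return 0;
}
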
diff --git a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp
index b373353ddd..120cad17c2 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp
@@ -895,7 +895,7 @@ static void bpool(kmp_info_t *th, void *buf, bufsize len) {
__kmp_bget_dequeue(th); /* Release any queued buffers */
#ifdef SizeQuant
- len &= ~(SizeQuant - 1);
+ len &= ~((bufsize)(SizeQuant - 1));
#endif
if (thr->pool_len == 0) {
thr->pool_len = len;
@@ -1496,31 +1496,74 @@ typedef struct kmp_mem_desc { // Memory block descriptor
void *ptr_align; // Pointer to aligned memory, returned
kmp_allocator_t *allocator; // allocator
} kmp_mem_desc_t;
-static int alignment = sizeof(void *); // let's align to pointer size
+static int alignment = sizeof(void *); // align to pointer size by default
+// external interfaces are wrappers over internal implementation
void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
+ KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator));
+ void *ptr = __kmp_alloc(gtid, 0, size, allocator);
+ KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", ptr, gtid));
+ return ptr;
+}
+
+void *__kmpc_aligned_alloc(int gtid, size_t algn, size_t size,
+ omp_allocator_handle_t allocator) {
+ KE_TRACE(25, ("__kmpc_aligned_alloc: T#%d (%d, %d, %p)\n", gtid, (int)algn,
+ (int)size, allocator));
+ void *ptr = __kmp_alloc(gtid, algn, size, allocator);
+ KE_TRACE(25, ("__kmpc_aligned_alloc returns %p, T#%d\n", ptr, gtid));
+ return ptr;
+}
+
+void *__kmpc_calloc(int gtid, size_t nmemb, size_t size,
+ omp_allocator_handle_t allocator) {
+ KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb,
+ (int)size, allocator));
+ void *ptr = __kmp_calloc(gtid, 0, nmemb, size, allocator);
+ KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid));
+ return ptr;
+}
+
+void *__kmpc_realloc(int gtid, void *ptr, size_t size,
+ omp_allocator_handle_t allocator,
+ omp_allocator_handle_t free_allocator) {
+ KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size,
+ allocator, free_allocator));
+ void *nptr = __kmp_realloc(gtid, ptr, size, allocator, free_allocator);
+ KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid));
+ return nptr;
+}
+
+void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
+ KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator));
+ ___kmpc_free(gtid, ptr, allocator);
+ KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, ptr, allocator));
+ return;
+}
+
+// internal implementation, called from inside the library
+void *__kmp_alloc(int gtid, size_t algn, size_t size,
+ omp_allocator_handle_t allocator) {
void *ptr = NULL;
kmp_allocator_t *al;
KMP_DEBUG_ASSERT(__kmp_init_serial);
-
if (size == 0)
return NULL;
-
if (allocator == omp_null_allocator)
allocator = __kmp_threads[gtid]->th.th_def_allocator;
- KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator));
- al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
+ al = RCAST(kmp_allocator_t *, allocator);
int sz_desc = sizeof(kmp_mem_desc_t);
kmp_mem_desc_t desc;
kmp_uintptr_t addr; // address returned by allocator
kmp_uintptr_t addr_align; // address to return to caller
kmp_uintptr_t addr_descr; // address of memory block descriptor
- int align = alignment; // default alignment
- if (allocator > kmp_max_mem_alloc && al->alignment > 0) {
- align = al->alignment; // alignment requested by user
- }
+ size_t align = alignment; // default alignment
+ if (allocator > kmp_max_mem_alloc && al->alignment > align)
+ align = al->alignment; // alignment required by allocator trait
+ if (align < algn)
+ align = algn; // max of allocator trait, parameter and sizeof(void*)
desc.size_orig = size;
desc.size_a = size + sz_desc + align;
@@ -1549,7 +1592,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
} else if (al->fb == omp_atv_allocator_fb) {
KMP_ASSERT(al != al->fb_data);
al = al->fb_data;
- return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
+ return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
} // else ptr == NULL;
} else {
// pool has enough space
@@ -1563,7 +1606,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
} else if (al->fb == omp_atv_allocator_fb) {
KMP_ASSERT(al != al->fb_data);
al = al->fb_data;
- return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
+ return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
}
}
}
@@ -1579,7 +1622,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
} else if (al->fb == omp_atv_allocator_fb) {
KMP_ASSERT(al != al->fb_data);
al = al->fb_data;
- return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
+ return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
}
}
}
@@ -1635,7 +1678,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
} else if (al->fb == omp_atv_allocator_fb) {
KMP_ASSERT(al != al->fb_data);
al = al->fb_data;
- return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
+ return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
} // else ptr == NULL;
} else {
// pool has enough space
@@ -1651,7 +1694,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
KMP_ASSERT(0); // abort fallback requested
} // no sense to look for another fallback because of same internal alloc
}
- KE_TRACE(10, ("__kmpc_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a));
+ KE_TRACE(10, ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a));
if (ptr == NULL)
return NULL;
@@ -1665,12 +1708,11 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
*((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents
KMP_MB();
- KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", desc.ptr_align, gtid));
return desc.ptr_align;
}
-void *__kmpc_calloc(int gtid, size_t nmemb, size_t size,
- omp_allocator_handle_t allocator) {
+void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size,
+ omp_allocator_handle_t allocator) {
void *ptr = NULL;
kmp_allocator_t *al;
KMP_DEBUG_ASSERT(__kmp_init_serial);
@@ -1678,10 +1720,7 @@ void *__kmpc_calloc(int gtid, size_t nmemb, size_t size,
if (allocator == omp_null_allocator)
allocator = __kmp_threads[gtid]->th.th_def_allocator;
- KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb,
- (int)size, allocator));
-
- al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
+ al = RCAST(kmp_allocator_t *, allocator);
if (nmemb == 0 || size == 0)
return ptr;
@@ -1693,31 +1732,27 @@ void *__kmpc_calloc(int gtid, size_t nmemb, size_t size,
return ptr;
}
- ptr = __kmpc_alloc(gtid, nmemb * size, allocator);
+ ptr = __kmp_alloc(gtid, algn, nmemb * size, allocator);
if (ptr) {
memset(ptr, 0x00, nmemb * size);
}
- KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid));
return ptr;
}
-void *__kmpc_realloc(int gtid, void *ptr, size_t size,
- omp_allocator_handle_t allocator,
- omp_allocator_handle_t free_allocator) {
+void *__kmp_realloc(int gtid, void *ptr, size_t size,
+ omp_allocator_handle_t allocator,
+ omp_allocator_handle_t free_allocator) {
void *nptr = NULL;
KMP_DEBUG_ASSERT(__kmp_init_serial);
if (size == 0) {
if (ptr != NULL)
- __kmpc_free(gtid, ptr, free_allocator);
+ ___kmpc_free(gtid, ptr, free_allocator);
return nptr;
}
- KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size,
- allocator, free_allocator));
-
- nptr = __kmpc_alloc(gtid, size, allocator);
+ nptr = __kmp_alloc(gtid, 0, size, allocator);
if (nptr != NULL && ptr != NULL) {
kmp_mem_desc_t desc;
@@ -1736,15 +1771,13 @@ void *__kmpc_realloc(int gtid, void *ptr, size_t size,
}
if (nptr != NULL) {
- __kmpc_free(gtid, ptr, free_allocator);
+ ___kmpc_free(gtid, ptr, free_allocator);
}
- KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid));
return nptr;
}
-void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) {
- KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator));
+void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
if (ptr == NULL)
return;
@@ -1804,8 +1837,6 @@ void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) {
}
__kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
}
- KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, desc.ptr_alloc,
- allocator));
}
/* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
@@ -1939,9 +1970,10 @@ void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
In debug mode, fill the memory block with 0xEF before call to free(). */
void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
kmp_mem_descr_t descr;
+#if KMP_DEBUG
kmp_uintptr_t addr_allocated; // Address returned by malloc().
kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
-
+#endif
KE_TRACE(25,
("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
KMP_ASSERT(ptr != NULL);
@@ -1953,18 +1985,15 @@ void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
"ptr_aligned=%p, size_aligned=%d\n",
descr.ptr_allocated, (int)descr.size_allocated,
descr.ptr_aligned, (int)descr.size_aligned));
-
+#if KMP_DEBUG
addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
-
KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
addr_allocated + descr.size_allocated);
-
-#ifdef KMP_DEBUG
memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
// Fill memory block with 0xEF, it helps catch using freed memory.
#endif
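
With this change the internal __kmp_alloc takes the requested alignment as a parameter and uses the largest of the default pointer alignment, the allocator-trait alignment, and the caller's value, then stores a descriptor immediately below the pointer it hands back so the free path can recover the original allocation. A standalone sketch of that over-allocate/align/stash scheme over plain malloc, with a simplified descriptor standing in for kmp_mem_desc_t; the real code also routes through allocator traits and fallback allocators, which are omitted here:

// Sketch only: alignment values must be powers of two.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct MemDesc {     // simplified stand-in for kmp_mem_desc_t
  void *ptr_alloc;   // pointer returned by the underlying allocator
  size_t size_a;     // total bytes actually requested from it
};

static void *aligned_user_alloc(size_t size, size_t algn, size_t trait_algn) {
  size_t align = sizeof(void *);              // default: pointer-size alignment
  if (trait_algn > align) align = trait_algn; // allocator-trait alignment
  if (algn > align) align = algn;             // caller-requested alignment

  size_t total = size + sizeof(MemDesc) + align;
  void *raw = std::malloc(total);
  if (!raw) return nullptr;

  uintptr_t addr = (uintptr_t)raw + sizeof(MemDesc);
  uintptr_t user = (addr + align - 1) & ~(uintptr_t)(align - 1);

  MemDesc *desc = (MemDesc *)(user - sizeof(MemDesc)); // descriptor just below
  desc->ptr_alloc = raw;
  desc->size_a = total;
  return (void *)user;
}

static void aligned_user_free(void *p) {
  if (!p) return;
  MemDesc *desc = (MemDesc *)((uintptr_t)p - sizeof(MemDesc));
  std::free(desc->ptr_alloc);
}

int main() {
  void *p = aligned_user_alloc(100, 64, 16);
  std::printf("aligned to 64: %d\n", (int)(((uintptr_t)p % 64) == 0));
  aligned_user_free(p);
  return 0;
}
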
diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp
index fcc06216a4..0bd7b1a41a 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp
@@ -732,7 +732,7 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs,
#define OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) \
__kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \
- (*lhs) = (TYPE)((*lhs)OP((TYPE)rhs)); \
+ (*lhs) = (TYPE)((*lhs)OP rhs); \
__kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
// ------------------------------------------------------------------------
@@ -791,14 +791,14 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs,
{ \
TYPE old_value, new_value; \
old_value = *(TYPE volatile *)lhs; \
- new_value = (TYPE)(old_value OP((TYPE)rhs)); \
+ new_value = (TYPE)(old_value OP rhs); \
while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \
(kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \
*VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \
KMP_DO_PAUSE; \
\
old_value = *(TYPE volatile *)lhs; \
- new_value = (TYPE)(old_value OP((TYPE)rhs)); \
+ new_value = (TYPE)(old_value OP rhs); \
} \
}
@@ -1235,6 +1235,12 @@ MIN_MAX_COMPXCHG(float8, max, kmp_real64, 64, <, 8r, 7,
KMP_ARCH_X86) // __kmpc_atomic_float8_max
MIN_MAX_COMPXCHG(float8, min, kmp_real64, 64, >, 8r, 7,
KMP_ARCH_X86) // __kmpc_atomic_float8_min
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+MIN_MAX_CRITICAL(float10, max, long double, <, 10r,
+ 1) // __kmpc_atomic_float10_max
+MIN_MAX_CRITICAL(float10, min, long double, >, 10r,
+ 1) // __kmpc_atomic_float10_min
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
#if KMP_HAVE_QUAD
MIN_MAX_CRITICAL(float16, max, QUAD_LEGACY, <, 16r,
1) // __kmpc_atomic_float16_max
@@ -1313,6 +1319,7 @@ ATOMIC_CMPX_EQV(fixed8, eqv, kmp_int64, 64, ^~, 8i, 7,
}
/* ------------------------------------------------------------------------- */
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// routines for long double type
ATOMIC_CRITICAL(float10, add, long double, +, 10r,
1) // __kmpc_atomic_float10_add
@@ -1322,6 +1329,7 @@ ATOMIC_CRITICAL(float10, mul, long double, *, 10r,
1) // __kmpc_atomic_float10_mul
ATOMIC_CRITICAL(float10, div, long double, /, 10r,
1) // __kmpc_atomic_float10_div
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
#if KMP_HAVE_QUAD
// routines for _Quad type
ATOMIC_CRITICAL(float16, add, QUAD_LEGACY, +, 16r,
@@ -1367,6 +1375,7 @@ ATOMIC_CRITICAL(cmplx8, add, kmp_cmplx64, +, 16c, 1) // __kmpc_atomic_cmplx8_add
ATOMIC_CRITICAL(cmplx8, sub, kmp_cmplx64, -, 16c, 1) // __kmpc_atomic_cmplx8_sub
ATOMIC_CRITICAL(cmplx8, mul, kmp_cmplx64, *, 16c, 1) // __kmpc_atomic_cmplx8_mul
ATOMIC_CRITICAL(cmplx8, div, kmp_cmplx64, /, 16c, 1) // __kmpc_atomic_cmplx8_div
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
ATOMIC_CRITICAL(cmplx10, add, kmp_cmplx80, +, 20c,
1) // __kmpc_atomic_cmplx10_add
ATOMIC_CRITICAL(cmplx10, sub, kmp_cmplx80, -, 20c,
@@ -1375,6 +1384,7 @@ ATOMIC_CRITICAL(cmplx10, mul, kmp_cmplx80, *, 20c,
1) // __kmpc_atomic_cmplx10_mul
ATOMIC_CRITICAL(cmplx10, div, kmp_cmplx80, /, 20c,
1) // __kmpc_atomic_cmplx10_div
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
#if KMP_HAVE_QUAD
ATOMIC_CRITICAL(cmplx16, add, CPLX128_LEG, +, 32c,
1) // __kmpc_atomic_cmplx16_add
@@ -1793,6 +1803,7 @@ ATOMIC_CMPXCHG_MIX(float8, kmp_real64, mul, 64, *, fp, _Quad, 8r, 7,
ATOMIC_CMPXCHG_MIX(float8, kmp_real64, div, 64, /, fp, _Quad, 8r, 7,
KMP_ARCH_X86) // __kmpc_atomic_float8_div_fp
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
ATOMIC_CRITICAL_FP(float10, long double, add, +, fp, _Quad, 10r,
1) // __kmpc_atomic_float10_add_fp
ATOMIC_CRITICAL_FP(float10, long double, sub, -, fp, _Quad, 10r,
@@ -1802,7 +1813,6 @@ ATOMIC_CRITICAL_FP(float10, long double, mul, *, fp, _Quad, 10r,
ATOMIC_CRITICAL_FP(float10, long double, div, /, fp, _Quad, 10r,
1) // __kmpc_atomic_float10_div_fp
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Reverse operations
ATOMIC_CMPXCHG_REV_MIX(fixed1, char, sub_rev, 8, -, fp, _Quad, 1i, 0,
KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_rev_fp
@@ -2717,6 +2727,10 @@ MIN_MAX_COMPXCHG_CPT(float8, max_cpt, kmp_real64, 64, <,
KMP_ARCH_X86) // __kmpc_atomic_float8_max_cpt
MIN_MAX_COMPXCHG_CPT(float8, min_cpt, kmp_real64, 64, >,
KMP_ARCH_X86) // __kmpc_atomic_float8_min_cpt
+MIN_MAX_CRITICAL_CPT(float10, max_cpt, long double, <, 10r,
+ 1) // __kmpc_atomic_float10_max_cpt
+MIN_MAX_CRITICAL_CPT(float10, min_cpt, long double, >, 10r,
+ 1) // __kmpc_atomic_float10_min_cpt
#if KMP_HAVE_QUAD
MIN_MAX_CRITICAL_CPT(float16, max_cpt, QUAD_LEGACY, <, 16r,
1) // __kmpc_atomic_float16_max_cpt
@@ -3586,7 +3600,7 @@ void __kmpc_atomic_8(ident_t *id_ref, int gtid, void *lhs, void *rhs,
__kmp_release_atomic_lock(&__kmp_atomic_lock_8i, gtid);
}
}
-
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs,
void (*f)(void *, void *, void *)) {
KMP_DEBUG_ASSERT(__kmp_init_serial);
@@ -3607,6 +3621,7 @@ void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs,
#endif /* KMP_GOMP_COMPAT */
__kmp_release_atomic_lock(&__kmp_atomic_lock_10r, gtid);
}
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs,
void (*f)(void *, void *, void *)) {
@@ -3628,7 +3643,7 @@ void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs,
#endif /* KMP_GOMP_COMPAT */
__kmp_release_atomic_lock(&__kmp_atomic_lock_16c, gtid);
}
-
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs,
void (*f)(void *, void *, void *)) {
KMP_DEBUG_ASSERT(__kmp_init_serial);
@@ -3649,7 +3664,7 @@ void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs,
#endif /* KMP_GOMP_COMPAT */
__kmp_release_atomic_lock(&__kmp_atomic_lock_20c, gtid);
}
-
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
void __kmpc_atomic_32(ident_t *id_ref, int gtid, void *lhs, void *rhs,
void (*f)(void *, void *, void *)) {
KMP_DEBUG_ASSERT(__kmp_init_serial);
@@ -3686,6 +3701,171 @@ void __kmpc_atomic_end(void) {
__kmp_release_atomic_lock(&__kmp_atomic_lock, gtid);
}
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// OpenMP 5.1 compare and swap
+
+/*!
+@param loc Source code location
+@param gtid Global thread id
+@param x Memory location to operate on
+@param e Expected value
+@param d Desired value
+@return Result of comparison
+
+Implements Compare And Swap atomic operation.
+
+Sample code:
+#pragma omp atomic compare update capture
+ { r = x == e; if(r) { x = d; } }
+*/
+bool __kmpc_atomic_bool_1_cas(ident_t *loc, int gtid, char *x, char e, char d) {
+ return KMP_COMPARE_AND_STORE_ACQ8(x, e, d);
+}
+bool __kmpc_atomic_bool_2_cas(ident_t *loc, int gtid, short *x, short e,
+ short d) {
+ return KMP_COMPARE_AND_STORE_ACQ16(x, e, d);
+}
+bool __kmpc_atomic_bool_4_cas(ident_t *loc, int gtid, kmp_int32 *x, kmp_int32 e,
+ kmp_int32 d) {
+ return KMP_COMPARE_AND_STORE_ACQ32(x, e, d);
+}
+bool __kmpc_atomic_bool_8_cas(ident_t *loc, int gtid, kmp_int64 *x, kmp_int64 e,
+ kmp_int64 d) {
+ return KMP_COMPARE_AND_STORE_ACQ64(x, e, d);
+}
+
+/*!
+@param loc Source code location
+@param gtid Global thread id
+@param x Memory location to operate on
+@param e Expected value
+@param d Desired value
+@return Old value of x
+
+Implements Compare And Swap atomic operation.
+
+Sample code:
+#pragma omp atomic compare update capture
+ { v = x; if (x == e) { x = d; } }
+*/
+char __kmpc_atomic_val_1_cas(ident_t *loc, int gtid, char *x, char e, char d) {
+ return KMP_COMPARE_AND_STORE_RET8(x, e, d);
+}
+short __kmpc_atomic_val_2_cas(ident_t *loc, int gtid, short *x, short e,
+ short d) {
+ return KMP_COMPARE_AND_STORE_RET16(x, e, d);
+}
+kmp_int32 __kmpc_atomic_val_4_cas(ident_t *loc, int gtid, kmp_int32 *x,
+ kmp_int32 e, kmp_int32 d) {
+ return KMP_COMPARE_AND_STORE_RET32(x, e, d);
+}
+kmp_int64 __kmpc_atomic_val_8_cas(ident_t *loc, int gtid, kmp_int64 *x,
+ kmp_int64 e, kmp_int64 d) {
+ return KMP_COMPARE_AND_STORE_RET64(x, e, d);
+}
+
+/*!
+@param loc Source code location
+@param gtid Global thread id
+@param x Memory location to operate on
+@param e Expected value
+@param d Desired value
+@param pv Captured value location
+@return Result of comparison
+
+Implements Compare And Swap + Capture atomic operation.
+
+v gets old value of x if comparison failed, untouched otherwise.
+Sample code:
+#pragma omp atomic compare update capture
+ { r = x == e; if(r) { x = d; } else { v = x; } }
+*/
+bool __kmpc_atomic_bool_1_cas_cpt(ident_t *loc, int gtid, char *x, char e,
+ char d, char *pv) {
+ char old = KMP_COMPARE_AND_STORE_RET8(x, e, d);
+ if (old == e)
+ return true;
+ KMP_ASSERT(pv != NULL);
+ *pv = old;
+ return false;
+}
+bool __kmpc_atomic_bool_2_cas_cpt(ident_t *loc, int gtid, short *x, short e,
+ short d, short *pv) {
+ short old = KMP_COMPARE_AND_STORE_RET16(x, e, d);
+ if (old == e)
+ return true;
+ KMP_ASSERT(pv != NULL);
+ *pv = old;
+ return false;
+}
+bool __kmpc_atomic_bool_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x,
+ kmp_int32 e, kmp_int32 d, kmp_int32 *pv) {
+ kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(x, e, d);
+ if (old == e)
+ return true;
+ KMP_ASSERT(pv != NULL);
+ *pv = old;
+ return false;
+}
+bool __kmpc_atomic_bool_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x,
+ kmp_int64 e, kmp_int64 d, kmp_int64 *pv) {
+ kmp_int64 old = KMP_COMPARE_AND_STORE_RET64(x, e, d);
+ if (old == e)
+ return true;
+ KMP_ASSERT(pv != NULL);
+ *pv = old;
+ return false;
+}
+
+/*!
+@param loc Source code location
+@param gtid Global thread id
+@param x Memory location to operate on
+@param e Expected value
+@param d Desired value
+@param pv Captured value location
+@return Old value of x
+
+Implements Compare And Swap + Capture atomic operation.
+
+v gets new value of x.
+Sample code:
+#pragma omp atomic compare update capture
+ { if (x == e) { x = d; }; v = x; }
+*/
+char __kmpc_atomic_val_1_cas_cpt(ident_t *loc, int gtid, char *x, char e,
+ char d, char *pv) {
+ char old = KMP_COMPARE_AND_STORE_RET8(x, e, d);
+ KMP_ASSERT(pv != NULL);
+ *pv = old == e ? d : old;
+ return old;
+}
+short __kmpc_atomic_val_2_cas_cpt(ident_t *loc, int gtid, short *x, short e,
+ short d, short *pv) {
+ short old = KMP_COMPARE_AND_STORE_RET16(x, e, d);
+ KMP_ASSERT(pv != NULL);
+ *pv = old == e ? d : old;
+ return old;
+}
+kmp_int32 __kmpc_atomic_val_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x,
+ kmp_int32 e, kmp_int32 d, kmp_int32 *pv) {
+ kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(x, e, d);
+ KMP_ASSERT(pv != NULL);
+ *pv = old == e ? d : old;
+ return old;
+}
+kmp_int64 __kmpc_atomic_val_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x,
+ kmp_int64 e, kmp_int64 d, kmp_int64 *pv) {
+ kmp_int64 old = KMP_COMPARE_AND_STORE_RET64(x, e, d);
+ KMP_ASSERT(pv != NULL);
+ *pv = old == e ? d : old;
+ return old;
+}
+
+// End OpenMP 5.1 compare + capture
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+
/*!
@}
*/
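
The new entry points implement the OpenMP 5.1 compare-and-swap forms; the two capture flavors differ only in whether *pv receives the old value on failure or the final value unconditionally. A portable sketch of the same semantics using std::atomic::compare_exchange_strong rather than the runtime's KMP_COMPARE_AND_STORE macros (the runtime entry points are mirrored, not reused):

#include <atomic>
#include <cassert>

// { r = x == e; if (r) { x = d; } else { v = x; } }  -- capture only on failure
static bool bool_cas_cpt(std::atomic<int> &x, int e, int d, int &v) {
  int expected = e;
  if (x.compare_exchange_strong(expected, d))
    return true;
  v = expected;          // observed value is captured only when the CAS failed
  return false;
}

// { if (x == e) { x = d; } v = x; }  -- capture the final value either way
static int val_cas_cpt(std::atomic<int> &x, int e, int d, int &v) {
  int expected = e;
  bool ok = x.compare_exchange_strong(expected, d);
  v = ok ? d : expected; // new value on success, old value on failure
  return expected;       // old value of x, as __kmpc_atomic_val_*_cas_cpt returns
}

int main() {
  std::atomic<int> x(5);
  int v = -1;
  assert(bool_cas_cpt(x, 5, 9, v) && x.load() == 9 && v == -1);
  assert(val_cas_cpt(x, 9, 11, v) == 9 && v == 11 && x.load() == 11);
  assert(!bool_cas_cpt(x, 5, 0, v) && v == 11);
  return 0;
}
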
diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.h b/contrib/libs/cxxsupp/openmp/kmp_atomic.h
index 6a0827aaf1..079b917285 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_atomic.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.h
@@ -578,6 +578,10 @@ void __kmpc_atomic_float8_max(ident_t *id_ref, int gtid, kmp_real64 *lhs,
kmp_real64 rhs);
void __kmpc_atomic_float8_min(ident_t *id_ref, int gtid, kmp_real64 *lhs,
kmp_real64 rhs);
+void __kmpc_atomic_float10_max(ident_t *id_ref, int gtid, long double *lhs,
+ long double rhs);
+void __kmpc_atomic_float10_min(ident_t *id_ref, int gtid, long double *lhs,
+ long double rhs);
#if KMP_HAVE_QUAD
void __kmpc_atomic_float16_max(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
QUAD_LEGACY rhs);
@@ -1254,6 +1258,12 @@ kmp_real64 __kmpc_atomic_float8_max_cpt(ident_t *id_ref, int gtid,
kmp_real64 __kmpc_atomic_float8_min_cpt(ident_t *id_ref, int gtid,
kmp_real64 *lhs, kmp_real64 rhs,
int flag);
+long double __kmpc_atomic_float10_max_cpt(ident_t *id_ref, int gtid,
+ long double *lhs, long double rhs,
+ int flag);
+long double __kmpc_atomic_float10_min_cpt(ident_t *id_ref, int gtid,
+ long double *lhs, long double rhs,
+ int flag);
#if KMP_HAVE_QUAD
QUAD_LEGACY __kmpc_atomic_float16_max_cpt(ident_t *id_ref, int gtid,
QUAD_LEGACY *lhs, QUAD_LEGACY rhs,
@@ -1756,6 +1766,78 @@ long double __kmpc_atomic_float10_div_cpt_rev_fp(ident_t *id_ref, int gtid,
// End of OpenMP 4.0 capture
+// OpenMP 5.1 compare and swap
+/*
+ __kmpc_atomic_bool_1_cas
+ __kmpc_atomic_bool_2_cas
+ __kmpc_atomic_bool_4_cas
+ __kmpc_atomic_bool_8_cas
+ __kmpc_atomic_val_1_cas
+ __kmpc_atomic_val_2_cas
+ __kmpc_atomic_val_4_cas
+ __kmpc_atomic_val_8_cas
+ __kmpc_atomic_bool_1_cas_cpt
+ __kmpc_atomic_bool_2_cas_cpt
+ __kmpc_atomic_bool_4_cas_cpt
+ __kmpc_atomic_bool_8_cas_cpt
+ __kmpc_atomic_val_1_cas_cpt
+ __kmpc_atomic_val_2_cas_cpt
+ __kmpc_atomic_val_4_cas_cpt
+ __kmpc_atomic_val_8_cas_cpt
+*/
+// In all interfaces of CAS (Compare And Swap):
+// r is the boolean result of comparison
+// x is memory location to operate on
+// e is expected (old) value
+// d is desired (new) value
+// pv is pointer to captured value v whose location may coincide with e
+
+// { r = x == e; if(r) { x = d; } }
+// functions return result of comparison
+bool __kmpc_atomic_bool_1_cas(ident_t *loc, int gtid, char *x, char e, char d);
+bool __kmpc_atomic_bool_2_cas(ident_t *loc, int gtid, short *x, short e,
+ short d);
+bool __kmpc_atomic_bool_4_cas(ident_t *loc, int gtid, kmp_int32 *x, kmp_int32 e,
+ kmp_int32 d);
+bool __kmpc_atomic_bool_8_cas(ident_t *loc, int gtid, kmp_int64 *x, kmp_int64 e,
+ kmp_int64 d);
+
+// { v = x; if (x == e) { x = d; } }
+// functions return old value
+char __kmpc_atomic_val_1_cas(ident_t *loc, int gtid, char *x, char e, char d);
+short __kmpc_atomic_val_2_cas(ident_t *loc, int gtid, short *x, short e,
+ short d);
+kmp_int32 __kmpc_atomic_val_4_cas(ident_t *loc, int gtid, kmp_int32 *x,
+ kmp_int32 e, kmp_int32 d);
+kmp_int64 __kmpc_atomic_val_8_cas(ident_t *loc, int gtid, kmp_int64 *x,
+ kmp_int64 e, kmp_int64 d);
+
+// { r = x == e; if(r) { x = d; } else { v = x; } }
+// v gets old value if comparison failed, untouched otherwise
+// functions return result of comparison
+bool __kmpc_atomic_bool_1_cas_cpt(ident_t *loc, int gtid, char *x, char e,
+ char d, char *pv);
+bool __kmpc_atomic_bool_2_cas_cpt(ident_t *loc, int gtid, short *x, short e,
+ short d, short *pv);
+bool __kmpc_atomic_bool_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x,
+ kmp_int32 e, kmp_int32 d, kmp_int32 *pv);
+bool __kmpc_atomic_bool_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x,
+ kmp_int64 e, kmp_int64 d, kmp_int64 *pv);
+
+// { if (x == e) { x = d; }; v = x; }
+// v gets old value if comparison failed, new value otherwise
+// functions return old value
+char __kmpc_atomic_val_1_cas_cpt(ident_t *loc, int gtid, char *x, char e,
+ char d, char *pv);
+short __kmpc_atomic_val_2_cas_cpt(ident_t *loc, int gtid, short *x, short e,
+ short d, short *pv);
+kmp_int32 __kmpc_atomic_val_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x,
+ kmp_int32 e, kmp_int32 d, kmp_int32 *pv);
+kmp_int64 __kmpc_atomic_val_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x,
+ kmp_int64 e, kmp_int64 d, kmp_int64 *pv);
+
+// End OpenMP 5.1 compare + capture
+
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
/* ------------------------------------------------------------------------ */
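
For reference, this is the kind of user-level construct the declarations above serve; the structured-block form matches the one documented in the comments. Whether a particular compiler lowers it to __kmpc_atomic_val_4_cas_cpt specifically depends on the compiler and its OpenMP 5.1 atomic-compare support, so treat the mapping as illustrative:

#include <cstdio>

int main() {
  int x = 0;               // shared location operated on atomically
  const int e = 0, d = 42;
#pragma omp parallel num_threads(4) shared(x)
  {
    int v = -1;            // per-thread capture variable
#pragma omp atomic compare capture
    {
      if (x == e) {
        x = d;
      }
      v = x;
    }
    // whichever thread wins the compare, every thread captures 42
    std::printf("captured %d\n", v);
  }
  return 0;
}
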
diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp
index 93112156a1..ee05bb3587 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp
@@ -10,12 +10,14 @@
//
//===----------------------------------------------------------------------===//
-#include "kmp.h"
#include "kmp_wait_release.h"
+#include "kmp_barrier.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"
+// for distributed barrier
+#include "kmp_affinity.h"
#if KMP_MIC
#include <immintrin.h>
@@ -38,6 +40,516 @@
void __kmp_print_structure(void); // Forward declaration
// ---------------------------- Barrier Algorithms ----------------------------
+// Distributed barrier
+
+// Compute how many threads to have polling each cache-line.
+// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
+void distributedBarrier::computeVarsForN(size_t n) {
+ int nsockets = 1;
+ if (__kmp_topology) {
+ int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
+ int core_level = __kmp_topology->get_level(KMP_HW_CORE);
+ int ncores_per_socket =
+ __kmp_topology->calculate_ratio(core_level, socket_level);
+ nsockets = __kmp_topology->get_count(socket_level);
+
+ if (nsockets <= 0)
+ nsockets = 1;
+ if (ncores_per_socket <= 0)
+ ncores_per_socket = 1;
+
+ threads_per_go = ncores_per_socket >> 1;
+ if (!fix_threads_per_go) {
+ // Minimize num_gos
+ if (threads_per_go > 4) {
+ if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
+ threads_per_go = threads_per_go >> 1;
+ }
+ if (threads_per_go > 4 && nsockets == 1)
+ threads_per_go = threads_per_go >> 1;
+ }
+ }
+ if (threads_per_go == 0)
+ threads_per_go = 1;
+ fix_threads_per_go = true;
+ num_gos = n / threads_per_go;
+ if (n % threads_per_go)
+ num_gos++;
+ if (nsockets == 1 || num_gos == 1)
+ num_groups = 1;
+ else {
+ num_groups = num_gos / nsockets;
+ if (num_gos % nsockets)
+ num_groups++;
+ }
+ if (num_groups <= 0)
+ num_groups = 1;
+ gos_per_group = num_gos / num_groups;
+ if (num_gos % num_groups)
+ gos_per_group++;
+ threads_per_group = threads_per_go * gos_per_group;
+ } else {
+ num_gos = n / threads_per_go;
+ if (n % threads_per_go)
+ num_gos++;
+ if (num_gos == 1)
+ num_groups = 1;
+ else {
+ num_groups = num_gos / 2;
+ if (num_gos % 2)
+ num_groups++;
+ }
+ gos_per_group = num_gos / num_groups;
+ if (num_gos % num_groups)
+ gos_per_group++;
+ threads_per_group = threads_per_go * gos_per_group;
+ }
+}
+
+void distributedBarrier::computeGo(size_t n) {
+ // Minimize num_gos
+ for (num_gos = 1;; num_gos++)
+ if (IDEAL_CONTENTION * num_gos >= n)
+ break;
+ threads_per_go = n / num_gos;
+ if (n % num_gos)
+ threads_per_go++;
+ while (num_gos > MAX_GOS) {
+ threads_per_go++;
+ num_gos = n / threads_per_go;
+ if (n % threads_per_go)
+ num_gos++;
+ }
+ computeVarsForN(n);
+}
+
+// This function resizes the barrier arrays when the new number of threads
+// exceeds max_threads, which is the current size of all the arrays
+void distributedBarrier::resize(size_t nthr) {
+ KMP_DEBUG_ASSERT(nthr > max_threads);
+
+ // expand to requested size * 2
+ max_threads = nthr * 2;
+
+ // allocate arrays to new max threads
+ for (int i = 0; i < MAX_ITERS; ++i) {
+ if (flags[i])
+ flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
+ max_threads * sizeof(flags_s));
+ else
+ flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
+ }
+
+ if (go)
+ go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
+ else
+ go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
+
+ if (iter)
+ iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
+ else
+ iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
+
+ if (sleep)
+ sleep =
+ (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
+ else
+ sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
+}
+
+// This function is to set all the go flags that threads might be waiting
+// on, and when blocktime is not infinite, it should be followed by a wake-up
+// call to each thread
+kmp_uint64 distributedBarrier::go_release() {
+ kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
+ for (size_t j = 0; j < num_gos; j++) {
+ go[j].go.store(next_go);
+ }
+ return next_go;
+}
+
+void distributedBarrier::go_reset() {
+ for (size_t j = 0; j < max_threads; ++j) {
+ for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
+ flags[i][j].stillNeed = 1;
+ }
+ go[j].go.store(0);
+ iter[j].iter = 0;
+ }
+}
+
+// This function inits/re-inits the distributed barrier for a particular number
+// of threads. If a resize of arrays is needed, it calls the resize function.
+void distributedBarrier::init(size_t nthr) {
+ size_t old_max = max_threads;
+ if (nthr > max_threads) { // need more space in arrays
+ resize(nthr);
+ }
+
+ for (size_t i = 0; i < max_threads; i++) {
+ for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
+ flags[j][i].stillNeed = 1;
+ }
+ go[i].go.store(0);
+ iter[i].iter = 0;
+ if (i >= old_max)
+ sleep[i].sleep = false;
+ }
+
+ // Recalculate num_gos, etc. based on new nthr
+ computeVarsForN(nthr);
+
+ num_threads = nthr;
+
+ if (team_icvs == NULL)
+ team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
+}
+
+// This function is used only when KMP_BLOCKTIME is not infinite.
+// static
+void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
+ size_t start, size_t stop, size_t inc,
+ size_t tid) {
+ KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
+ if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+ return;
+
+ kmp_info_t **other_threads = team->t.t_threads;
+ for (size_t thr = start; thr < stop; thr += inc) {
+ KMP_DEBUG_ASSERT(other_threads[thr]);
+ int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
+    // Wake up the worker regardless of whether it appears to be sleeping or not
+ __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
+ }
+}
+
+static void __kmp_dist_barrier_gather(
+ enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+ void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+ KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
+ kmp_team_t *team;
+ distributedBarrier *b;
+ kmp_info_t **other_threads;
+ kmp_uint64 my_current_iter, my_next_iter;
+ kmp_uint32 nproc;
+ bool group_leader;
+
+ team = this_thr->th.th_team;
+ nproc = this_thr->th.th_team_nproc;
+ other_threads = team->t.t_threads;
+ b = team->t.b;
+ my_current_iter = b->iter[tid].iter;
+ my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
+ group_leader = ((tid % b->threads_per_group) == 0);
+
+ KA_TRACE(20,
+ ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
+ gtid, team->t.t_id, tid, bt));
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+ // Barrier imbalance - save arrive time to the thread
+ if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
+ this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
+ __itt_get_timestamp();
+ }
+#endif
+
+ if (group_leader) {
+ // Start from the thread after the group leader
+ size_t group_start = tid + 1;
+ size_t group_end = tid + b->threads_per_group;
+ size_t threads_pending = 0;
+
+ if (group_end > nproc)
+ group_end = nproc;
+ do { // wait for threads in my group
+ threads_pending = 0;
+      // Check all the flags every time to avoid branch misprediction
+ for (size_t thr = group_start; thr < group_end; thr++) {
+ // Each thread uses a different cache line
+ threads_pending += b->flags[my_current_iter][thr].stillNeed;
+ }
+ // Execute tasks here
+ if (__kmp_tasking_mode != tskm_immediate_exec) {
+ kmp_task_team_t *task_team = this_thr->th.th_task_team;
+ if (task_team != NULL) {
+ if (TCR_SYNC_4(task_team->tt.tt_active)) {
+ if (KMP_TASKING_ENABLED(task_team)) {
+ int tasks_completed = FALSE;
+ __kmp_atomic_execute_tasks_64(
+ this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
+ &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
+ } else
+ this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
+ }
+ } else {
+ this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
+ } // if
+ }
+ if (TCR_4(__kmp_global.g.g_done)) {
+ if (__kmp_global.g.g_abort)
+ __kmp_abort_thread();
+ break;
+ } else if (__kmp_tasking_mode != tskm_immediate_exec &&
+ this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
+ this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
+ }
+ } while (threads_pending > 0);
+
+ if (reduce) { // Perform reduction if needed
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
+ // Group leader reduces all threads in group
+ for (size_t thr = group_start; thr < group_end; thr++) {
+ (*reduce)(this_thr->th.th_local.reduce_data,
+ other_threads[thr]->th.th_local.reduce_data);
+ }
+ OMPT_REDUCTION_END;
+ }
+
+ // Set flag for next iteration
+ b->flags[my_next_iter][tid].stillNeed = 1;
+ // Each thread uses a different cache line; resets stillNeed to 0 to
+ // indicate it has reached the barrier
+ b->flags[my_current_iter][tid].stillNeed = 0;
+
+ do { // wait for all group leaders
+ threads_pending = 0;
+ for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
+ threads_pending += b->flags[my_current_iter][thr].stillNeed;
+ }
+ // Execute tasks here
+ if (__kmp_tasking_mode != tskm_immediate_exec) {
+ kmp_task_team_t *task_team = this_thr->th.th_task_team;
+ if (task_team != NULL) {
+ if (TCR_SYNC_4(task_team->tt.tt_active)) {
+ if (KMP_TASKING_ENABLED(task_team)) {
+ int tasks_completed = FALSE;
+ __kmp_atomic_execute_tasks_64(
+ this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
+ &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
+ } else
+ this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
+ }
+ } else {
+ this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
+ } // if
+ }
+ if (TCR_4(__kmp_global.g.g_done)) {
+ if (__kmp_global.g.g_abort)
+ __kmp_abort_thread();
+ break;
+ } else if (__kmp_tasking_mode != tskm_immediate_exec &&
+ this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
+ this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
+ }
+ } while (threads_pending > 0);
+
+ if (reduce) { // Perform reduction if needed
+ if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
+ for (size_t thr = b->threads_per_group; thr < nproc;
+ thr += b->threads_per_group) {
+ (*reduce)(this_thr->th.th_local.reduce_data,
+ other_threads[thr]->th.th_local.reduce_data);
+ }
+ OMPT_REDUCTION_END;
+ }
+ }
+ } else {
+ // Set flag for next iteration
+ b->flags[my_next_iter][tid].stillNeed = 1;
+ // Each thread uses a different cache line; resets stillNeed to 0 to
+ // indicate it has reached the barrier
+ b->flags[my_current_iter][tid].stillNeed = 0;
+ }
+
+ KMP_MFENCE();
+
+ KA_TRACE(20,
+ ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
+ gtid, team->t.t_id, tid, bt));
+}
+
+static void __kmp_dist_barrier_release(
+ enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+ int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+ KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
+ kmp_team_t *team;
+ distributedBarrier *b;
+ kmp_bstate_t *thr_bar;
+ kmp_uint64 my_current_iter, next_go;
+ size_t my_go_index;
+ bool group_leader;
+
+ KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
+ gtid, tid, bt));
+
+ thr_bar = &this_thr->th.th_bar[bt].bb;
+
+ if (!KMP_MASTER_TID(tid)) {
+ // workers and non-master group leaders need to check their presence in team
+ do {
+ if (this_thr->th.th_used_in_team.load() != 1 &&
+ this_thr->th.th_used_in_team.load() != 3) {
+ // Thread is not in use in a team. Wait on location in tid's thread
+ // struct. The 0 value tells anyone looking that this thread is spinning
+ // or sleeping until this location becomes 3 again; 3 is the transition
+ // state to get to 1 which is waiting on go and being in the team
+ kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
+ if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
+ 0) ||
+ this_thr->th.th_used_in_team.load() == 0) {
+ my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
+ }
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+ if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
+ // In fork barrier where we could not get the object reliably
+ itt_sync_obj =
+ __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
+ // Cancel wait on previous parallel region...
+ __kmp_itt_task_starting(itt_sync_obj);
+
+ if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+ return;
+
+ itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+ if (itt_sync_obj != NULL)
+ // Call prepare as early as possible for "new" barrier
+ __kmp_itt_task_finished(itt_sync_obj);
+ } else
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+ if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+ return;
+ }
+ if (this_thr->th.th_used_in_team.load() != 1 &&
+ this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
+ continue;
+ if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+ return;
+
+ // At this point, the thread thinks it is in use in a team, or in
+ // transition to be used in a team, but it might have reached this barrier
+ // before it was marked unused by the team. Unused threads are awoken and
+ // shifted to wait on local thread struct elsewhere. It also might reach
+ // this point by being picked up for use by a different team. Either way,
+ // we need to update the tid.
+ tid = __kmp_tid_from_gtid(gtid);
+ team = this_thr->th.th_team;
+ KMP_DEBUG_ASSERT(tid >= 0);
+ KMP_DEBUG_ASSERT(team);
+ b = team->t.b;
+ my_current_iter = b->iter[tid].iter;
+ next_go = my_current_iter + distributedBarrier::MAX_ITERS;
+ my_go_index = tid / b->threads_per_go;
+ if (this_thr->th.th_used_in_team.load() == 3) {
+ KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
+ }
+ // Check if go flag is set
+ if (b->go[my_go_index].go.load() != next_go) {
+ // Wait on go flag on team
+ kmp_atomic_flag_64<false, true> my_flag(
+ &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
+ my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
+ KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
+ b->iter[tid].iter == 0);
+ KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
+ }
+
+ if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+ return;
+ // At this point, the thread's go location was set. This means the primary
+ // thread is safely in the barrier, and so this thread's data is
+ // up-to-date, but we should check again that this thread is really in
+ // use in the team, as it could have been woken up for the purpose of
+ // changing team size, or reaping threads at shutdown.
+ if (this_thr->th.th_used_in_team.load() == 1)
+ break;
+ } while (1);
+
+ if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+ return;
+
+ group_leader = ((tid % b->threads_per_group) == 0);
+ if (group_leader) {
+ // Tell all the threads in my group they can go!
+ for (size_t go_idx = my_go_index + 1;
+ go_idx < my_go_index + b->gos_per_group; go_idx++) {
+ b->go[go_idx].go.store(next_go);
+ }
+ // Fence added so that workers can see changes to go. sfence inadequate.
+ KMP_MFENCE();
+ }
+
+#if KMP_BARRIER_ICV_PUSH
+ if (propagate_icvs) { // copy ICVs to final dest
+ __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
+ tid, FALSE);
+ copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+ (kmp_internal_control_t *)team->t.b->team_icvs);
+ copy_icvs(&thr_bar->th_fixed_icvs,
+ &team->t.t_implicit_task_taskdata[tid].td_icvs);
+ }
+#endif
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
+ // This thread is now awake and participating in the barrier;
+ // wake up the other threads in the group
+ size_t nproc = this_thr->th.th_team_nproc;
+ size_t group_end = tid + b->threads_per_group;
+ if (nproc < group_end)
+ group_end = nproc;
+ __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
+ }
+ } else { // Primary thread
+ team = this_thr->th.th_team;
+ b = team->t.b;
+ my_current_iter = b->iter[tid].iter;
+ next_go = my_current_iter + distributedBarrier::MAX_ITERS;
+#if KMP_BARRIER_ICV_PUSH
+ if (propagate_icvs) {
+ // primary thread has ICVs in final destination; copy
+ copy_icvs(&thr_bar->th_fixed_icvs,
+ &team->t.t_implicit_task_taskdata[tid].td_icvs);
+ }
+#endif
+ // Tell all the group leaders they can go!
+ for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
+ b->go[go_idx].go.store(next_go);
+ }
+
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+ // Wake-up the group leaders
+ size_t nproc = this_thr->th.th_team_nproc;
+ __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
+ b->threads_per_group, tid);
+ }
+
+ // Tell all the threads in my group they can go!
+ for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
+ b->go[go_idx].go.store(next_go);
+ }
+
+ // Fence added so that workers can see changes to go. sfence inadequate.
+ KMP_MFENCE();
+
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+ // Wake-up the other threads in my group
+ size_t nproc = this_thr->th.th_team_nproc;
+ size_t group_end = tid + b->threads_per_group;
+ if (nproc < group_end)
+ group_end = nproc;
+ __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
+ }
+ }
+ // Update to next iteration
+ KMP_ASSERT(my_current_iter == b->iter[tid].iter);
+ b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
+
+ KA_TRACE(
+ 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
+ gtid, team->t.t_id, tid, bt));
+}
// Linear Barrier
template <bool cancellable = false>
@@ -1354,6 +1866,11 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
} else {
switch (__kmp_barrier_gather_pattern[bt]) {
+ case bp_dist_bar: {
+ __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
+ reduce USE_ITT_BUILD_ARG(itt_sync_obj));
+ break;
+ }
case bp_hyper_bar: {
// don't set branch bits to 0; use linear
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
@@ -1467,6 +1984,12 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
} else {
switch (__kmp_barrier_release_pattern[bt]) {
+ case bp_dist_bar: {
+ KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+ __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
+ FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+ break;
+ }
case bp_hyper_bar: {
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
@@ -1514,8 +2037,10 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
}
#endif
- KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
- TRUE);
+ KMP_DEBUG_ASSERT(
+ this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
+ this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
+ TRUE);
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
__kmp_task_team_setup(this_thr, team, 0);
@@ -1596,6 +2121,11 @@ void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
if (!team->t.t_serialized) {
if (KMP_MASTER_GTID(gtid)) {
switch (__kmp_barrier_release_pattern[bt]) {
+ case bp_dist_bar: {
+ __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
+ FALSE USE_ITT_BUILD_ARG(NULL));
+ break;
+ }
case bp_hyper_bar: {
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
@@ -1634,7 +2164,6 @@ void __kmp_join_barrier(int gtid) {
kmp_info_t *this_thr = __kmp_threads[gtid];
kmp_team_t *team;
kmp_uint nproc;
- kmp_info_t *master_thread;
int tid;
#ifdef KMP_DEBUG
int team_id;
@@ -1656,9 +2185,7 @@ void __kmp_join_barrier(int gtid) {
tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
team_id = team->t.t_id;
-#endif /* KMP_DEBUG */
- master_thread = this_thr->th.th_team_master;
-#ifdef KMP_DEBUG
+ kmp_info_t *master_thread = this_thr->th.th_team_master;
if (master_thread != team->t.t_threads[0]) {
__kmp_print_structure();
}
@@ -1705,8 +2232,8 @@ void __kmp_join_barrier(int gtid) {
if (__kmp_tasking_mode == tskm_extra_barrier) {
__kmp_tasking_barrier(team, this_thr, gtid);
- KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
- team_id, tid));
+ KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
+ gtid, team_id, tid));
}
#ifdef KMP_DEBUG
if (__kmp_tasking_mode != tskm_immediate_exec) {
@@ -1715,8 +2242,9 @@ void __kmp_join_barrier(int gtid) {
__kmp_gtid_from_thread(this_thr), team_id,
team->t.t_task_team[this_thr->th.th_task_state],
this_thr->th.th_task_team));
- KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
- team->t.t_task_team[this_thr->th.th_task_state]);
+ if (this_thr->th.th_task_team)
+ KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
+ team->t.t_task_team[this_thr->th.th_task_state]);
}
#endif /* KMP_DEBUG */
@@ -1742,6 +2270,11 @@ void __kmp_join_barrier(int gtid) {
#endif /* USE_ITT_BUILD */
switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
+ case bp_dist_bar: {
+ __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
+ NULL USE_ITT_BUILD_ARG(itt_sync_obj));
+ break;
+ }
case bp_hyper_bar: {
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
__kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
@@ -1787,8 +2320,7 @@ void __kmp_join_barrier(int gtid) {
team_thread->th.th_stats->setIdleFlag();
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
team_thread->th.th_sleep_loc != NULL)
- __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
- team_thread->th.th_sleep_loc);
+ __kmp_null_resume_wrapper(team_thread);
}
#endif
#if USE_ITT_BUILD
@@ -1806,8 +2338,6 @@ void __kmp_join_barrier(int gtid) {
kmp_uint64 cur_time = __itt_get_timestamp();
ident_t *loc = team->t.t_ident;
kmp_info_t **other_threads = team->t.t_threads;
- int nproc = this_thr->th.th_team_nproc;
- int i;
switch (__kmp_forkjoin_frames_mode) {
case 1:
__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
@@ -1824,7 +2354,7 @@ void __kmp_join_barrier(int gtid) {
// Set arrive time to zero to be able to check it in
// __kmp_invoke_task(); the same is done inside the loop below
this_thr->th.th_bar_arrive_time = 0;
- for (i = 1; i < nproc; ++i) {
+ for (kmp_uint i = 1; i < nproc; ++i) {
delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
other_threads[i]->th.th_bar_arrive_time = 0;
}
@@ -1933,6 +2463,11 @@ void __kmp_fork_barrier(int gtid, int tid) {
} // primary thread
switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
+ case bp_dist_bar: {
+ __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
+ TRUE USE_ITT_BUILD_ARG(NULL));
+ break;
+ }
case bp_hyper_bar: {
KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
__kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
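
The distributed barrier added above replaces a single shared counter with per-thread stillNeed flags (gathered by group leaders) and a small set of go lines that leaders advance by MAX_ITERS each round. The sketch below models only that two-sided handshake with std::atomic and std::thread; the topology-aware grouping, sleeping, ICV propagation, and task execution in __kmp_dist_barrier_gather/release are deliberately left out:

// build sketch: c++ -std=c++17 -pthread mini_dist_barrier.cpp
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct MiniDistBarrier {
  static constexpr unsigned long long MAX_ITERS = 3; // mirrors distributedBarrier::MAX_ITERS
  std::vector<std::atomic<unsigned>> still_need;     // one "not yet arrived" flag per thread
  std::atomic<unsigned long long> go{0};             // single go line shared by everyone

  explicit MiniDistBarrier(size_t n) : still_need(n) {
    for (auto &f : still_need) f.store(1);
  }

  void arrive(size_t tid, unsigned long long my_iter) {
    still_need[tid].store(0, std::memory_order_release); // signal arrival
    unsigned long long next_go = my_iter + MAX_ITERS;
    if (tid == 0) { // "group leader": gather everyone, then release them
      for (size_t t = 1; t < still_need.size(); ++t)
        while (still_need[t].load(std::memory_order_acquire) != 0) {
        }
      for (auto &f : still_need) f.store(1);        // re-arm for the next round
      go.store(next_go, std::memory_order_release); // publish the new go value
    } else { // worker: spin until the leader advances go
      while (go.load(std::memory_order_acquire) != next_go) {
      }
    }
  }
};

int main() {
  const size_t nthreads = 4;
  MiniDistBarrier bar(nthreads);
  std::vector<std::thread> pool;
  for (size_t tid = 0; tid < nthreads; ++tid)
    pool.emplace_back([&bar, tid] {
      for (unsigned long long iter = 0; iter < 5; ++iter)
        bar.arrive(tid, iter);
    });
  for (auto &t : pool) t.join();
  std::printf("all %zu threads passed 5 barriers\n", nthreads);
  return 0;
}
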
diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.h b/contrib/libs/cxxsupp/openmp/kmp_barrier.h
new file mode 100644
index 0000000000..ac28a13217
--- /dev/null
+++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.h
@@ -0,0 +1,141 @@
+/*
+ * kmp_barrier.h
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_BARRIER_H
+#define KMP_BARRIER_H
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+
+#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC
+#include <xmmintrin.h>
+#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
+#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
+#elif KMP_HAVE_ALIGNED_ALLOC
+#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size)
+#define KMP_ALIGNED_FREE(ptr) free(ptr)
+#elif KMP_HAVE_POSIX_MEMALIGN
+static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
+ void *ptr;
+ int n = posix_memalign(&ptr, alignment, size);
+ if (n != 0) {
+ if (ptr)
+ free(ptr);
+ return nullptr;
+ }
+ return ptr;
+}
+#define KMP_ALIGNED_FREE(ptr) free(ptr)
+#elif KMP_HAVE__ALIGNED_MALLOC
+#include <malloc.h>
+#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment)
+#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr)
+#else
+#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size)
+#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr)
+#endif
+
+// Use four cache lines: MLC tends to prefetch the next or previous cache line
+// creating a possible fake conflict between cores, so this is the only way to
+// guarantee that no such prefetch can happen.
+#ifndef KMP_FOURLINE_ALIGN_CACHE
+#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE)
+#endif
+
+#define KMP_OPTIMIZE_FOR_REDUCTIONS 0
+
+class distributedBarrier {
+ struct flags_s {
+ kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed;
+ };
+
+ struct go_s {
+ std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go;
+ };
+
+ struct iter_s {
+ kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter;
+ };
+
+ struct sleep_s {
+ std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep;
+ };
+
+ void init(size_t nthr);
+ void resize(size_t nthr);
+ void computeGo(size_t n);
+ void computeVarsForN(size_t n);
+
+public:
+ enum {
+ MAX_ITERS = 3,
+ MAX_GOS = 8,
+ IDEAL_GOS = 4,
+ IDEAL_CONTENTION = 16,
+ };
+
+ flags_s *flags[MAX_ITERS];
+ go_s *go;
+ iter_s *iter;
+ sleep_s *sleep;
+
+ size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier
+ size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure
+ // number of go signals each requiring one write per iteration
+ size_t KMP_ALIGN_CACHE num_gos;
+ // number of groups of gos
+ size_t KMP_ALIGN_CACHE num_groups;
+ // threads per go signal
+ size_t KMP_ALIGN_CACHE threads_per_go;
+ bool KMP_ALIGN_CACHE fix_threads_per_go;
+ // threads per group
+ size_t KMP_ALIGN_CACHE threads_per_group;
+ // number of go signals in a group
+ size_t KMP_ALIGN_CACHE gos_per_group;
+ void *team_icvs;
+
+ distributedBarrier() = delete;
+ ~distributedBarrier() = delete;
+
+ // Used instead of constructor to create aligned data
+ static distributedBarrier *allocate(int nThreads) {
+ distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE(
+ sizeof(distributedBarrier), 4 * CACHE_LINE);
+ if (!d) {
+ KMP_FATAL(MemoryAllocFailed);
+ }
+ d->num_threads = 0;
+ d->max_threads = 0;
+ for (int i = 0; i < MAX_ITERS; ++i)
+ d->flags[i] = NULL;
+ d->go = NULL;
+ d->iter = NULL;
+ d->sleep = NULL;
+ d->team_icvs = NULL;
+ d->fix_threads_per_go = false;
+ // calculate gos and groups ONCE on base size
+ d->computeGo(nThreads);
+ d->init(nThreads);
+ return d;
+ }
+
+ static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); }
+
+ void update_num_threads(size_t nthr) { init(nthr); }
+
+ bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); }
+ size_t get_num_threads() { return num_threads; }
+ kmp_uint64 go_release();
+ void go_reset();
+};
+
+#endif // KMP_BARRIER_H
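
The distributedBarrier class above deletes its constructor and destructor so that instances can only come from the static allocate()/deallocate() pair, which places the object on a 4 * CACHE_LINE boundary through KMP_ALIGNED_ALLOCATE. A caller-side sketch, using only the names declared in this header; the call order is an assumption, since the real protocol lives in kmp_barrier.cpp and kmp_runtime.cpp rather than in this hunk:

    // Hypothetical usage sketch, not code from the runtime.
    distributedBarrier *b = distributedBarrier::allocate(8); // team of 8 threads
    size_t n = b->get_num_threads();      // 8 after allocate()/init()
    kmp_uint64 next_go = b->go_release(); // flip the go flags for waiting threads
    b->go_reset();                        // assumed: prepare the next barrier episode
    if (b->need_resize(16))               // growing past max_threads?
      b->update_num_threads(16);          // re-runs init() for the new size
    distributedBarrier::deallocate(b);    // releases the aligned block
    (void)n; (void)next_go;
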
diff --git a/contrib/libs/cxxsupp/openmp/kmp_config.h b/contrib/libs/cxxsupp/openmp/kmp_config.h
index 81314ed20a..2f7a7f9320 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_config.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_config.h
@@ -80,6 +80,16 @@
#define KMP_HAVE_ATTRIBUTE_RTM LIBOMP_HAVE_ATTRIBUTE_RTM
#define LIBOMP_ARCH_AARCH64_A64FX 0
#define KMP_ARCH_AARCH64_A64FX LIBOMP_ARCH_AARCH64_A64FX
+#define LIBOMP_HAVE_XMMINTRIN_H 1
+#define KMP_HAVE_XMMINTRIN_H LIBOMP_HAVE_XMMINTRIN_H
+#define LIBOMP_HAVE__MM_MALLOC 1
+#define KMP_HAVE__MM_MALLOC LIBOMP_HAVE__MM_MALLOC
+#define LIBOMP_HAVE_ALIGNED_ALLOC 1
+#define KMP_HAVE_ALIGNED_ALLOC LIBOMP_HAVE_ALIGNED_ALLOC
+#define LIBOMP_HAVE_POSIX_MEMALIGN 1
+#define KMP_HAVE_POSIX_MEMALIGN LIBOMP_HAVE_POSIX_MEMALIGN
+#define LIBOMP_HAVE__ALIGNED_MALLOC 0
+#define KMP_HAVE__ALIGNED_MALLOC LIBOMP_HAVE__ALIGNED_MALLOC
// Configured cache line based on architecture
#if KMP_ARCH_PPC64
@@ -119,4 +129,9 @@
# define KMP_GOMP_COMPAT
#endif
+// use shared memory with dynamic library (except Android, where shm_*
+// functions don't exist).
+#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !__ANDROID__
+#define KMP_USE_SHM
+#endif
#endif // KMP_CONFIG_H
diff --git a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp
index 2a7c9a8cb2..e263558517 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp
@@ -288,15 +288,7 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
ompt_frame_t *ompt_frame;
if (ompt_enabled.enabled) {
kmp_info_t *master_th = __kmp_threads[gtid];
- kmp_team_t *parent_team = master_th->th.th_team;
- ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
- if (lwt)
- ompt_frame = &(lwt->ompt_task_info.frame);
- else {
- int tid = __kmp_tid_from_gtid(gtid);
- ompt_frame = &(
- parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
- }
+ ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
}
OMPT_STORE_RETURN_ADDRESS(gtid);
@@ -320,6 +312,12 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
);
va_end(ap);
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ ompt_frame->enter_frame = ompt_data_none;
+ }
+#endif
}
#if KMP_STATS_ENABLED
@@ -533,7 +531,8 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
kmp_task_team_t *task_team = this_thr->th.th_task_team;
// we need to wait for the proxy tasks before finishing the thread
- if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
+ if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
+ task_team->tt.tt_hidden_helper_task_encountered))
__kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
KMP_MB();
@@ -578,9 +577,6 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
__kmp_free(top);
}
- // if( serial_team -> t.t_serialized > 1 )
- serial_team->t.t_level--;
-
/* pop dispatch buffers stack */
KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
{
@@ -605,6 +601,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+ __kmp_pop_current_task_from_thread(this_thr);
#if OMPD_SUPPORT
if (ompd_state & OMPD_ENABLE_BP)
ompd_bp_parallel_end();
@@ -623,8 +620,6 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
this_thr->th.th_dispatch =
&this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
- __kmp_pop_current_task_from_thread(this_thr);
-
KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
this_thr->th.th_current_task->td_flags.executing = 1;
@@ -645,6 +640,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
}
}
+ serial_team->t.t_level--;
if (__kmp_env_consistency_check)
__kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
@@ -686,7 +682,7 @@ void __kmpc_flush(ident_t *loc) {
if (!__kmp_cpuinfo.initialized) {
__kmp_query_cpuid(&__kmp_cpuinfo);
}
- if (!__kmp_cpuinfo.sse2) {
+ if (!__kmp_cpuinfo.flags.sse2) {
// CPU cannot execute SSE2 instructions.
} else {
#if KMP_COMPILER_ICC
@@ -1359,7 +1355,7 @@ static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
-#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
+#define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif
@@ -4330,24 +4326,35 @@ void __kmpc_doacross_fini(ident_t *loc, int gtid) {
KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
-/* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
+/* OpenMP 5.1 Memory Management routines */
void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
- return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
+ return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
+}
+
+void *omp_aligned_alloc(size_t align, size_t size,
+ omp_allocator_handle_t allocator) {
+ return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
}
void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
- return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
+ return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
+}
+
+void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
+ omp_allocator_handle_t allocator) {
+ return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
}
void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
omp_allocator_handle_t free_allocator) {
- return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
+ return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
free_allocator);
}
void omp_free(void *ptr, omp_allocator_handle_t allocator) {
- __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
+ ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
}
+/* end of OpenMP 5.1 Memory Management routines */
int __kmpc_get_target_offload(void) {
if (!__kmp_init_serial) {
@@ -4395,6 +4402,38 @@ void __kmpc_error(ident_t *loc, int severity, const char *message) {
__kmp_str_free(&src_loc);
}
+// Mark begin of scope directive.
+void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
+// reserved is for extension of the scope directive and is not used.
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+ if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
+ kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
+ int tid = __kmp_tid_from_gtid(gtid);
+ ompt_callbacks.ompt_callback(ompt_callback_work)(
+ ompt_work_scope, ompt_scope_begin,
+ &(team->t.ompt_team_info.parallel_data),
+ &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
+ OMPT_GET_RETURN_ADDRESS(0));
+ }
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+}
+
+// Mark end of scope directive
+void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
+// reserved is for extension of the scope directive and is not used.
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+ if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
+ kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
+ int tid = __kmp_tid_from_gtid(gtid);
+ ompt_callbacks.ompt_callback(ompt_callback_work)(
+ ompt_work_scope, ompt_scope_end,
+ &(team->t.ompt_team_info.parallel_data),
+ &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
+ OMPT_GET_RETURN_ADDRESS(0));
+ }
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+}
+
#ifdef KMP_USE_VERSION_SYMBOLS
// For GOMP compatibility there are two versions of each omp_* API.
// One is the plain C symbol and one is the Fortran symbol with an appended
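
The memory-management hunk in kmp_csupport.cpp above routes the user-facing OpenMP 5.1 entry points (omp_alloc, omp_aligned_alloc, omp_calloc, omp_aligned_calloc, omp_realloc, omp_free) onto the internal __kmp_* allocator, passing the requested alignment through as a new argument. A small user-side sketch of the aligned variant, assuming a 5.1-capable <omp.h> and the predefined omp_default_mem_alloc handle:

    #include <omp.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // 64-byte aligned buffer of 1024 doubles from the default allocator.
      double *buf = (double *)omp_aligned_alloc(64, 1024 * sizeof(double),
                                                omp_default_mem_alloc);
      if (!buf)
        return 1;
      buf[0] = 42.0;
      std::printf("64-byte aligned: %d\n", (int)((std::uintptr_t)buf % 64 == 0));
      omp_free(buf, omp_default_mem_alloc);
      return 0;
    }
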
diff --git a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp
index 108384e1cc..f3407bf889 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp
@@ -72,8 +72,8 @@ void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
bool use_hier = false) {
// Pick up the nonmonotonic/monotonic bits from the scheduling type
- // TODO: make nonmonotonic when static_steal is fixed
- int monotonicity = SCHEDULE_MONOTONIC;
+ // Nonmonotonic as default for dynamic schedule when no modifier is specified
+ int monotonicity = SCHEDULE_NONMONOTONIC;
// Let default be monotonic for executables
// compiled with OpenMP* 4.5 or less compilers
@@ -561,6 +561,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
_control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
/* value used for comparison in solver for cross-over point */
+ KMP_ASSERT(tc > 0);
long double target = ((long double)chunk * 2 + 1) * nproc / tc;
/* crossover point--chunk indexes equal to or greater than
@@ -668,6 +669,8 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
case kmp_sch_static_chunked:
case kmp_sch_dynamic_chunked:
dynamic_init:
+ if (tc == 0)
+ break;
if (pr->u.p.parm1 <= 0)
pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
else if (pr->u.p.parm1 > tc)
@@ -1713,7 +1716,7 @@ int __kmp_dispatch_next_algorithm(int gtid,
status = 0; // nothing to do, don't try atomic op
break;
}
- KMP_DEBUG_ASSERT(init % chunk == 0);
+ KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
// compare with K*nproc*(chunk+1), K=2 by default
if ((T)remaining < pr->u.p.parm2) {
// use dynamic-style schedule
@@ -2652,9 +2655,11 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
kmp_uint32 spins;
kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
kmp_uint32 r;
+ kmp_uint64 time;
KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
KMP_INIT_YIELD(spins);
+ KMP_INIT_BACKOFF(time);
// main wait spin loop
while (!f(r = TCR_4(*spin), check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
@@ -2662,7 +2667,7 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
split. It causes problems with infinite recursion because of exit lock */
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
__kmp_abort_thread(); */
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
return r;
@@ -2677,15 +2682,17 @@ void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
kmp_uint32 check = checker;
kmp_uint32 spins;
kmp_uint32 (*f)(void *, kmp_uint32) = pred;
+ kmp_uint64 time;
KMP_FSYNC_SPIN_INIT(obj, spin);
KMP_INIT_YIELD(spins);
+ KMP_INIT_BACKOFF(time);
// main wait spin loop
while (!f(spin, check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
/* if we have waited a bit, or are noversubscribed, yield */
/* pause is in the following code */
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
}
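
The __kmp_get_monotonicity change in kmp_dispatch.cpp above makes nonmonotonic the default ordering for dynamic schedules when the user writes no modifier (the surrounding code keeps the monotonic default for binaries built with OpenMP 4.5 or older compilers). A user-level sketch of how the spellings relate; the loop itself is an arbitrary placeholder:

    void saxpy(int n, float a, const float *x, float *y) {
      // Under this runtime the plain form below is treated as nonmonotonic;
      // schedule(monotonic : dynamic, 64) forces the old ordering and
      // schedule(nonmonotonic : dynamic, 64) spells the new default explicitly.
      #pragma omp parallel for schedule(dynamic, 64)
      for (int i = 0; i < n; ++i)
        y[i] = a * x[i] + y[i];
    }
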
diff --git a/contrib/libs/cxxsupp/openmp/kmp_dispatch.h b/contrib/libs/cxxsupp/openmp/kmp_dispatch.h
index ae11361ca5..154db17461 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_dispatch.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_dispatch.h
@@ -292,10 +292,12 @@ static UT __kmp_wait(volatile UT *spinner, UT checker,
UT check = checker;
kmp_uint32 spins;
kmp_uint32 (*f)(UT, UT) = pred;
+ kmp_uint64 time;
UT r;
KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
KMP_INIT_YIELD(spins);
+ KMP_INIT_BACKOFF(time);
// main wait spin loop
while (!f(r = *spin, check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
@@ -305,7 +307,7 @@ static UT __kmp_wait(volatile UT *spinner, UT checker,
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
__kmp_abort_thread(); */
// If oversubscribed, or have waited a bit then yield.
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
return r;
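
The wait loops touched in kmp_dispatch.cpp and kmp_dispatch.h gain a kmp_uint64 time initialized by the new KMP_INIT_BACKOFF, and KMP_YIELD_OVERSUB_ELSE_SPIN now takes it so the runtime can use a timed pause instead of pure spinning. A simplified standalone sketch of that loop shape, not the runtime's macros (the real ones also check oversubscription and, where available, tpause/umwait):

    #include <atomic>
    #include <thread>

    static unsigned wait_eq(std::atomic<unsigned> &flag, unsigned expected) {
      unsigned spins = 0;
      unsigned v;
      while ((v = flag.load(std::memory_order_acquire)) != expected) {
        if (++spins >= 1024) {       // spun "a bit": stop burning the core
          std::this_thread::yield(); // rough analogue of the yield/backoff macro
          spins = 0;
        }
      }
      return v;
    }
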
diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h
index 30c967af3d..bf9ebf9b2e 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h
@@ -1446,6 +1446,120 @@ int FTN_STDCALL FTN_GET_TEAMS_THREAD_LIMIT(void) {
#endif
}
+/// TODO: Include the `omp.h` of the current build
+/* OpenMP 5.1 interop */
+typedef intptr_t omp_intptr_t;
+
+/* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined
+ * properties */
+typedef enum omp_interop_property {
+ omp_ipr_fr_id = -1,
+ omp_ipr_fr_name = -2,
+ omp_ipr_vendor = -3,
+ omp_ipr_vendor_name = -4,
+ omp_ipr_device_num = -5,
+ omp_ipr_platform = -6,
+ omp_ipr_device = -7,
+ omp_ipr_device_context = -8,
+ omp_ipr_targetsync = -9,
+ omp_ipr_first = -9
+} omp_interop_property_t;
+
+#define omp_interop_none 0
+
+typedef enum omp_interop_rc {
+ omp_irc_no_value = 1,
+ omp_irc_success = 0,
+ omp_irc_empty = -1,
+ omp_irc_out_of_range = -2,
+ omp_irc_type_int = -3,
+ omp_irc_type_ptr = -4,
+ omp_irc_type_str = -5,
+ omp_irc_other = -6
+} omp_interop_rc_t;
+
+typedef enum omp_interop_fr {
+ omp_ifr_cuda = 1,
+ omp_ifr_cuda_driver = 2,
+ omp_ifr_opencl = 3,
+ omp_ifr_sycl = 4,
+ omp_ifr_hip = 5,
+ omp_ifr_level_zero = 6,
+ omp_ifr_last = 7
+} omp_interop_fr_t;
+
+typedef void *omp_interop_t;
+
+// libomptarget, if loaded, provides this function
+int FTN_STDCALL FTN_GET_NUM_INTEROP_PROPERTIES(const omp_interop_t interop) {
+#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
+ return 0;
+#else
+ int (*fptr)(const omp_interop_t);
+ if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_num_interop_properties")))
+ return (*fptr)(interop);
+ return 0;
+#endif // KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
+}
+
+/// TODO Convert FTN_GET_INTEROP_XXX functions into a macro like interop.cpp
+// libomptarget, if loaded, provides this function
+intptr_t FTN_STDCALL FTN_GET_INTEROP_INT(const omp_interop_t interop,
+ omp_interop_property_t property_id,
+ int *err) {
+ intptr_t (*fptr)(const omp_interop_t, omp_interop_property_t, int *);
+ if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_int")))
+ return (*fptr)(interop, property_id, err);
+ return 0;
+}
+
+// libomptarget, if loaded, provides this function
+void *FTN_STDCALL FTN_GET_INTEROP_PTR(const omp_interop_t interop,
+ omp_interop_property_t property_id,
+ int *err) {
+ void *(*fptr)(const omp_interop_t, omp_interop_property_t, int *);
+ if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_ptr")))
+ return (*fptr)(interop, property_id, err);
+ return nullptr;
+}
+
+// libomptarget, if loaded, provides this function
+const char *FTN_STDCALL FTN_GET_INTEROP_STR(const omp_interop_t interop,
+ omp_interop_property_t property_id,
+ int *err) {
+ const char *(*fptr)(const omp_interop_t, omp_interop_property_t, int *);
+ if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_str")))
+ return (*fptr)(interop, property_id, err);
+ return nullptr;
+}
+
+// libomptarget, if loaded, provides this function
+const char *FTN_STDCALL FTN_GET_INTEROP_NAME(
+ const omp_interop_t interop, omp_interop_property_t property_id) {
+ const char *(*fptr)(const omp_interop_t, omp_interop_property_t);
+ if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_name")))
+ return (*fptr)(interop, property_id);
+ return nullptr;
+}
+
+// libomptarget, if loaded, provides this function
+const char *FTN_STDCALL FTN_GET_INTEROP_TYPE_DESC(
+ const omp_interop_t interop, omp_interop_property_t property_id) {
+ const char *(*fptr)(const omp_interop_t, omp_interop_property_t);
+ if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_type_desc")))
+ return (*fptr)(interop, property_id);
+ return nullptr;
+}
+
+// libomptarget, if loaded, provides this function
+const char *FTN_STDCALL FTN_GET_INTEROP_RC_DESC(
+ const omp_interop_t interop, omp_interop_property_t property_id) {
+ const char *(*fptr)(const omp_interop_t, omp_interop_property_t);
+ if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_rec_desc")))
+ return (*fptr)(interop, property_id);
+ return nullptr;
+}
+
// display environment variables when requested
void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) {
#ifndef KMP_STUB
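
Each interop entry point added to kmp_ftn_entry.h above is a thin trampoline: KMP_DLSYM_NEXT resolves the symbol from a later object in the lookup chain (libomptarget, when it is loaded) and forwards the call, otherwise a neutral default of 0 or nullptr is returned. A standalone sketch of that idiom using plain dlsym; the wrapper name is made up for illustration:

    #include <dlfcn.h> // dlsym, RTLD_NEXT (glibc exposes RTLD_NEXT under _GNU_SOURCE)

    extern "C" int interop_props_or_zero(const void *interop) {
      int (*fptr)(const void *) = nullptr;
      *(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_num_interop_properties");
      return fptr ? (*fptr)(interop) : 0; // 0 when libomptarget is not present
    }
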
diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h
index 5b9e396e3d..66e1e1ecd2 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h
@@ -140,6 +140,14 @@
#define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit
#define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit
+#define FTN_GET_NUM_INTEROP_PROPERTIES omp_get_num_interop_properties
+#define FTN_GET_INTEROP_INT omp_get_interop_int
+#define FTN_GET_INTEROP_PTR omp_get_interop_ptr
+#define FTN_GET_INTEROP_STR omp_get_interop_str
+#define FTN_GET_INTEROP_NAME omp_get_interop_name
+#define FTN_GET_INTEROP_TYPE_DESC omp_get_interop_type_desc
+#define FTN_GET_INTEROP_RC_DESC omp_get_interop_rc_desc
+
#endif /* KMP_FTN_PLAIN */
/* ------------------------------------------------------------------------ */
@@ -268,6 +276,14 @@
#define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit_
#define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit_
+#define FTN_GET_NUM_INTEROP_PROPERTIES omp_get_num_interop_properties_
+#define FTN_GET_INTEROP_INT omp_get_interop_int_
+#define FTN_GET_INTEROP_PTR omp_get_interop_ptr_
+#define FTN_GET_INTEROP_STR omp_get_interop_str_
+#define FTN_GET_INTEROP_NAME omp_get_interop_name_
+#define FTN_GET_INTEROP_TYPE_DESC omp_get_interop_type_desc_
+#define FTN_GET_INTEROP_RC_DESC omp_get_interop_rc_desc_
+
#endif /* KMP_FTN_APPEND */
/* ------------------------------------------------------------------------ */
@@ -394,6 +410,14 @@
#define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT
#define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT
+#define FTN_GET_NUM_INTEROP_PROPERTIES OMP_GET_NUM_INTEROP_PROPERTIES
+#define FTN_GET_INTEROP_INT OMP_GET_INTEROP_INT
+#define FTN_GET_INTEROP_PTR OMP_GET_INTEROP_PTR
+#define FTN_GET_INTEROP_STR OMP_GET_INTEROP_STR
+#define FTN_GET_INTEROP_NAME OMP_GET_INTEROP_NAME
+#define FTN_GET_INTEROP_TYPE_DESC OMP_GET_INTEROP_TYPE_DESC
+#define FTN_GET_INTEROP_RC_DESC OMP_GET_INTEROP_RC_DESC
+
#endif /* KMP_FTN_UPPER */
/* ------------------------------------------------------------------------ */
@@ -522,6 +546,14 @@
#define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT_
#define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT_
+#define FTN_GET_NUM_INTEROP_PROPERTIES OMP_GET_NUM_INTEROP_PROPERTIES_
+#define FTN_GET_INTEROP_INT OMP_GET_INTEROP_INT_
+#define FTN_GET_INTEROP_PTR OMP_GET_INTEROP_PTR_
+#define FTN_GET_INTEROP_STR OMP_GET_INTEROP_STR_
+#define FTN_GET_INTEROP_NAME OMP_GET_INTEROP_NAME_
+#define FTN_GET_INTEROP_TYPE_DESC OMP_GET_INTEROP_TYPE_DESC_
+#define FTN_GET_INTEROP_RC_DESC OMP_GET_INTEROP_RC_DESC_
+
#endif /* KMP_FTN_UAPPEND */
/* -------------------------- GOMP API NAMES ------------------------ */
@@ -712,5 +744,6 @@
#define KMP_API_NAME_GOMP_SECTIONS2_START GOMP_sections2_start
#define KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER \
GOMP_workshare_task_reduction_unregister
-
+#define KMP_API_NAME_GOMP_ALLOC GOMP_alloc
+#define KMP_API_NAME_GOMP_FREE GOMP_free
#endif /* KMP_FTN_OS_H */
diff --git a/contrib/libs/cxxsupp/openmp/kmp_global.cpp b/contrib/libs/cxxsupp/openmp/kmp_global.cpp
index b519fcf678..62bdac3c4b 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_global.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_global.cpp
@@ -110,8 +110,8 @@ char const *__kmp_barrier_type_name[bs_last_barrier] = {"plain", "forkjoin"
"reduction"
#endif // KMP_FAST_REDUCTION_BARRIER
};
-char const *__kmp_barrier_pattern_name[bp_last_bar] = {"linear", "tree",
- "hyper", "hierarchical"};
+char const *__kmp_barrier_pattern_name[bp_last_bar] = {
+ "linear", "tree", "hyper", "hierarchical", "dist"};
int __kmp_allThreadsSpecified = 0;
size_t __kmp_align_alloc = CACHE_LINE;
@@ -219,6 +219,13 @@ int __kmp_mwait_enabled = FALSE;
int __kmp_mwait_hints = 0;
#endif
+#if KMP_HAVE_UMWAIT
+int __kmp_waitpkg_enabled = 0;
+int __kmp_tpause_state = 0;
+int __kmp_tpause_hint = 1;
+int __kmp_tpause_enabled = 0;
+#endif
+
/* map OMP 3.0 schedule types with our internal schedule types */
enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext +
kmp_sched_upper_std - kmp_sched_lower - 2] = {
@@ -280,6 +287,7 @@ char *__kmp_cpuinfo_file = NULL;
#endif /* KMP_AFFINITY_SUPPORTED */
kmp_nested_proc_bind_t __kmp_nested_proc_bind = {NULL, 0, 0};
+kmp_proc_bind_t __kmp_teams_proc_bind = proc_bind_spread;
int __kmp_affinity_num_places = 0;
int __kmp_display_affinity = FALSE;
char *__kmp_affinity_format = NULL;
@@ -424,6 +432,7 @@ kmp_int32 __kmp_use_yield_exp_set = 0;
kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT;
kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
+kmp_uint64 __kmp_pause_init = 1; // for tpause
/* ------------------------------------------------------ */
/* STATE mostly syncronized with global lock */
diff --git a/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp b/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp
index 61a3199f1a..d77d4809a7 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp
@@ -23,18 +23,24 @@ enum {
KMP_GOMP_TASK_DEPENDS_FLAG = 8
};
+enum {
+ KMP_GOMP_DEPOBJ_IN = 1,
+ KMP_GOMP_DEPOBJ_OUT = 2,
+ KMP_GOMP_DEPOBJ_INOUT = 3,
+ KMP_GOMP_DEPOBJ_MTXINOUTSET = 4
+};
+
// This class helps convert gomp dependency info into
// kmp_depend_info_t structures
class kmp_gomp_depends_info_t {
void **depend;
kmp_int32 num_deps;
- size_t num_out, num_mutexinout, num_in;
+ size_t num_out, num_mutexinout, num_in, num_depobj;
size_t offset;
public:
kmp_gomp_depends_info_t(void **depend) : depend(depend) {
size_t ndeps = (kmp_intptr_t)depend[0];
- size_t num_doable;
// GOMP taskdep structure:
// if depend[0] != 0:
// depend = [ ndeps | nout | &out | ... | &out | &in | ... | &in ]
@@ -45,21 +51,17 @@ public:
if (ndeps) {
num_out = (kmp_intptr_t)depend[1];
num_in = ndeps - num_out;
- num_mutexinout = 0;
- num_doable = ndeps;
+ num_mutexinout = num_depobj = 0;
offset = 2;
} else {
ndeps = (kmp_intptr_t)depend[1];
num_out = (kmp_intptr_t)depend[2];
num_mutexinout = (kmp_intptr_t)depend[3];
num_in = (kmp_intptr_t)depend[4];
- num_doable = num_out + num_mutexinout + num_in;
+ num_depobj = ndeps - num_out - num_mutexinout - num_in;
+ KMP_ASSERT(num_depobj <= ndeps);
offset = 5;
}
- // TODO: Support gomp depobj
- if (ndeps != num_doable) {
- KMP_FATAL(GompFeatureNotSupported, "depobj");
- }
num_deps = static_cast<kmp_int32>(ndeps);
}
kmp_int32 get_num_deps() const { return num_deps; }
@@ -67,7 +69,6 @@ public:
kmp_depend_info_t retval;
memset(&retval, '\0', sizeof(retval));
KMP_ASSERT(index < (size_t)num_deps);
- retval.base_addr = (kmp_intptr_t)depend[offset + index];
retval.len = 0;
// Because inout and out are logically equivalent,
// use inout and in dependency flags. GOMP does not provide a
@@ -75,10 +76,37 @@ public:
if (index < num_out) {
retval.flags.in = 1;
retval.flags.out = 1;
+ retval.base_addr = (kmp_intptr_t)depend[offset + index];
} else if (index >= num_out && index < (num_out + num_mutexinout)) {
retval.flags.mtx = 1;
- } else {
+ retval.base_addr = (kmp_intptr_t)depend[offset + index];
+ } else if (index >= (num_out + num_mutexinout) &&
+ index < (num_out + num_mutexinout + num_in)) {
retval.flags.in = 1;
+ retval.base_addr = (kmp_intptr_t)depend[offset + index];
+ } else {
+      // depobj is a two-element array (each element is pointer-sized)
+ // depobj[0] = base_addr
+ // depobj[1] = type (in, out, inout, mutexinoutset, etc.)
+ kmp_intptr_t *depobj = (kmp_intptr_t *)depend[offset + index];
+ retval.base_addr = depobj[0];
+ switch (depobj[1]) {
+ case KMP_GOMP_DEPOBJ_IN:
+ retval.flags.in = 1;
+ break;
+ case KMP_GOMP_DEPOBJ_OUT:
+ retval.flags.out = 1;
+ break;
+ case KMP_GOMP_DEPOBJ_INOUT:
+ retval.flags.in = 1;
+ retval.flags.out = 1;
+ break;
+ case KMP_GOMP_DEPOBJ_MTXINOUTSET:
+ retval.flags.mtx = 1;
+ break;
+ default:
+ KMP_FATAL(GompFeatureNotSupported, "Unknown depobj type");
+ }
}
return retval;
}
@@ -1206,7 +1234,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data,
// The low-order bit is the "untied" flag
if (!(gomp_flags & KMP_GOMP_TASK_UNTIED_FLAG)) {
- input_flags->tiedness = 1;
+ input_flags->tiedness = TASK_TIED;
}
// The second low-order bit is the "final" flag
if (gomp_flags & KMP_GOMP_TASK_FINAL_FLAG) {
@@ -1494,6 +1522,13 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *),
KA_TRACE(20, ("GOMP_parallel_sections: T#%d\n", gtid));
#if OMPT_SUPPORT
+ ompt_frame_t *task_frame;
+ kmp_info_t *thr;
+ if (ompt_enabled.enabled) {
+ thr = __kmp_threads[gtid];
+ task_frame = &(thr->th.th_current_task->ompt_task_info.frame);
+ task_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+ }
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
@@ -1509,9 +1544,31 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *),
KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
}
+
+#if OMPT_SUPPORT
+ ompt_frame_t *child_frame;
+ if (ompt_enabled.enabled) {
+ child_frame = &(thr->th.th_current_task->ompt_task_info.frame);
+ child_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+ }
+#endif
+
task(data);
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ child_frame->exit_frame = ompt_data_none;
+ }
+#endif
+
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)();
KA_TRACE(20, ("GOMP_parallel_sections exit: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ task_frame->enter_frame = ompt_data_none;
+ }
+#endif
}
#define PARALLEL_LOOP(func, schedule, ompt_pre, ompt_post) \
@@ -1738,7 +1795,7 @@ void __GOMP_taskloop(void (*func)(void *), void *data,
KMP_ASSERT(arg_align > 0);
// The low-order bit is the "untied" flag
if (!(gomp_flags & 1)) {
- input_flags->tiedness = 1;
+ input_flags->tiedness = TASK_TIED;
}
// The second low-order bit is the "final" flag
if (gomp_flags & 2) {
@@ -2428,6 +2485,26 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER)(
}
}
+// allocator construct
+void *KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ALLOC)(size_t alignment, size_t size,
+ uintptr_t allocator) {
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_alloc: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+ OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+ return __kmp_alloc(gtid, alignment, size, (omp_allocator_handle_t)allocator);
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_FREE)(void *ptr, uintptr_t allocator) {
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_free: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+ OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+ return ___kmpc_free(gtid, ptr, (omp_allocator_handle_t)allocator);
+}
+
/* The following sections of code create aliases for the GOMP_* functions, then
create versioned symbols using the assembler directive .symver. This is only
pertinent for ELF .so library. The KMP_VERSION_SYMBOL macro is defined in
@@ -2616,6 +2693,10 @@ KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_START, 50, "GOMP_5.0");
KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS2_START, 50, "GOMP_5.0");
KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER, 50,
"GOMP_5.0");
+
+// GOMP_5.0.1 versioned symbols
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ALLOC, 501, "GOMP_5.0.1");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_FREE, 501, "GOMP_5.0.1");
#endif // KMP_USE_VERSION_SYMBOLS
#ifdef __cplusplus
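
For the depobj support added to kmp_gomp_depends_info_t above, the extended GOMP layout (depend[0] == 0) is [ 0 | ndeps | nout | nmutexinout | nin | entries... ], and any entry past the first nout + nmutexinout + nin addresses is a pointer to a two-element {address, type} array. A small, purely illustrative example of building such an array on the caller side, just to make the indexing concrete:

    #include <cstdint>
    #include <cstdio>

    int main() {
      intptr_t x = 0, y = 0;
      // One plain "out" dependence on &x plus one depobj-style inout on &y.
      intptr_t depobj[2] = {(intptr_t)&y, 3 /* KMP_GOMP_DEPOBJ_INOUT */};
      void *depend[] = {
          (void *)(intptr_t)0, // 0 => extended format
          (void *)(intptr_t)2, // ndeps
          (void *)(intptr_t)1, // nout
          (void *)(intptr_t)0, // nmutexinout
          (void *)(intptr_t)0, // nin
          (void *)&x,          // entry 0: plain "out" address
          (void *)depobj       // entry 1: decoded as a depobj {addr, type} pair
      };
      std::printf("ndeps=%ld depobj type=%ld\n", (long)(intptr_t)depend[1],
                  (long)((intptr_t *)depend[6])[1]);
      return 0;
    }
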
diff --git a/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc b/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc
index 8e3e90caae..776cca2b66 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc
+++ b/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc
@@ -223,6 +223,7 @@ __kmp_i18n_default_messages[] =
"%1$s value \"%2$u\" will be used.",
"%1$s value \"%2$s\" will be used.",
"%1$s value \"%2$s\" will be used.",
+ "Mixing other barrier patterns with dist is prohibited. Using dist for all barrier patterns.",
"%1$s maximum value \"%2$d\" will be used.",
"%1$s minimum value \"%2$d\" will be used.",
"Memory allocation failed.",
@@ -307,6 +308,8 @@ __kmp_i18n_default_messages[] =
"Allocator %1$s is not available, will use default allocator.",
"%1$s: %2$s (%3$d total cores)",
"%1$s: granularity setting: %2$s does not exist in topology. Using granularity=%3$s instead.",
+ "%1$s: hybrid core type detected: %2$d %3$s cores.",
+ "%1$s: %2$d with core efficiency %3$d.",
"%1$s must be bound to a work-sharing or work-queuing construct with an \"ordered\" clause",
"Detected end of %1$s without first executing a corresponding beginning.",
"Iteration range too large in %1$s.",
@@ -402,6 +405,15 @@ __kmp_i18n_default_messages[] =
"KMP_HW_SUBSET ignored: %1$s, %2$s: layers are equivalent, please only specify one.",
"KMP_HW_SUBSET ignored: %1$s layer should come after %2$s.",
"%1$s: topology layer \"%2$s\" is equivalent to \"%3$s\".",
+ "%1$s: granularity=%2$s is too coarse, setting granularity=group.",
+ "%1$s: \"%2$s\" value is deprecated. Please use \"%3$s\" instead.",
+ "num_teams value must be positive, it is %1$d, using %2$d instead.",
+ "KMP_HW_SUBSET ignored: %1$s, %2$s: attributes are ambiguous, please only specify one.",
+ "KMP_HW_SUBSET ignored: %1$s: attribute specified more than once.",
+ "KMP_HW_SUBSET ignored: %1$s: attribute value %2$s is invalid.",
+ "KMP_HW_SUBSET ignored: all hardware resources would be filtered, please reduce the filter.",
+ "KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecutre.",
+ "KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecutre.",
NULL
};
@@ -437,6 +449,7 @@ __kmp_i18n_default_hints[] =
"System error #193 is \"Bad format of EXE or DLL file\". Usually it means the file is found, but it is corrupted or a file for another architecture. Check whether \"%1$s\" is a file for %2$s architecture.",
"System-related limit on the number of threads.",
"Try setting new bounds (preferably less than or equal to %1$d) for num_teams clause.",
+ "Valid values are from %1$d to %2$d.",
NULL
};
@@ -453,8 +466,8 @@ __kmp_i18n_sections[] =
{ 5, __kmp_i18n_default_meta },
{ 79, __kmp_i18n_default_strings },
{ 6, __kmp_i18n_default_formats },
- { 286, __kmp_i18n_default_messages },
- { 28, __kmp_i18n_default_hints },
+ { 298, __kmp_i18n_default_messages },
+ { 29, __kmp_i18n_default_hints },
{ 0, NULL }
};
diff --git a/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc b/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc
index 7fec5e6223..a66f8117c2 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc
+++ b/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc
@@ -217,6 +217,7 @@ enum kmp_i18n_id {
kmp_i18n_msg_Using_uint_Value,
kmp_i18n_msg_Using_uint64_Value,
kmp_i18n_msg_Using_str_Value,
+ kmp_i18n_msg_BarrierPatternOverride,
kmp_i18n_msg_MaxValueUsing,
kmp_i18n_msg_MinValueUsing,
kmp_i18n_msg_MemoryAllocFailed,
@@ -301,6 +302,8 @@ enum kmp_i18n_id {
kmp_i18n_msg_OmpNoAllocator,
kmp_i18n_msg_TopologyGeneric,
kmp_i18n_msg_AffGranularityBad,
+ kmp_i18n_msg_TopologyHybrid,
+ kmp_i18n_msg_TopologyHybridCoreEff,
kmp_i18n_msg_CnsBoundToWorksharing,
kmp_i18n_msg_CnsDetectedEnd,
kmp_i18n_msg_CnsIterationRangeTooLarge,
@@ -396,6 +399,15 @@ enum kmp_i18n_id {
kmp_i18n_msg_AffHWSubsetEqvLayers,
kmp_i18n_msg_AffHWSubsetOutOfOrder,
kmp_i18n_msg_AffEqualTopologyTypes,
+ kmp_i18n_msg_AffGranTooCoarseProcGroup,
+ kmp_i18n_msg_StgDeprecatedValue,
+ kmp_i18n_msg_NumTeamsNotPositive,
+ kmp_i18n_msg_AffHWSubsetIncompat,
+ kmp_i18n_msg_AffHWSubsetAttrRepeat,
+ kmp_i18n_msg_AffHWSubsetAttrInvalid,
+ kmp_i18n_msg_AffHWSubsetAllFiltered,
+ kmp_i18n_msg_AffHWSubsetAttrsNonHybrid,
+ kmp_i18n_msg_AffHWSubsetIgnoringAttr,
kmp_i18n_msg_last,
// Set #5, hints.
@@ -428,6 +440,7 @@ enum kmp_i18n_id {
kmp_i18n_hnt_BadExeFormat,
kmp_i18n_hnt_SystemLimitOnThreads,
kmp_i18n_hnt_SetNewBound,
+ kmp_i18n_hnt_ValidValuesRange,
kmp_i18n_hnt_last,
kmp_i18n_xxx_lastest
diff --git a/contrib/libs/cxxsupp/openmp/kmp_itt.cpp b/contrib/libs/cxxsupp/openmp/kmp_itt.cpp
index a76c639625..f99b264da6 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_itt.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_itt.cpp
@@ -24,12 +24,9 @@
#error #include "ittnotify_config.h"
__itt_global __kmp_ittapi_clean_global;
extern __itt_global __kmp_itt__ittapi_global;
-kmp_int32 __kmp_barrier_domain_count;
-kmp_int32 __kmp_region_domain_count;
-__itt_domain *__kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS];
-__itt_domain *__kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS];
-__itt_domain *__kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS];
-kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS];
+
+kmp_itthash_t __kmp_itt_barrier_domains = {{0}, 0};
+kmp_itthash_t __kmp_itt_region_domains = {{0}, 0};
__itt_domain *metadata_domain = NULL;
__itt_string_handle *string_handle_imbl = NULL;
__itt_string_handle *string_handle_loop = NULL;
diff --git a/contrib/libs/cxxsupp/openmp/kmp_itt.h b/contrib/libs/cxxsupp/openmp/kmp_itt.h
index 75a24540d4..c640e83b71 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_itt.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_itt.h
@@ -278,15 +278,21 @@ __kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller);
} /* if */ \
} while (0)
-const int KMP_MAX_FRAME_DOMAINS =
- 512; // Maximum number of frame domains to use (maps to
+// Maximum number of frame domains to use (maps to
// different OpenMP regions in the user source code).
-extern kmp_int32 __kmp_barrier_domain_count;
-extern kmp_int32 __kmp_region_domain_count;
-extern __itt_domain *__kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS];
-extern __itt_domain *__kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS];
-extern __itt_domain *__kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS];
-extern kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS];
+const int KMP_MAX_FRAME_DOMAINS = 997;
+typedef struct kmp_itthash_entry {
+ ident_t *loc;
+ int team_size;
+ __itt_domain *d;
+ struct kmp_itthash_entry *next_in_bucket;
+} kmp_itthash_entry_t;
+typedef struct kmp_itthash {
+ kmp_itthash_entry_t *buckets[KMP_MAX_FRAME_DOMAINS];
+ int count; // just a heuristic to limit number of entries
+} kmp_itthash_t;
+extern kmp_itthash_t __kmp_itt_region_domains;
+extern kmp_itthash_t __kmp_itt_barrier_domains;
extern __itt_domain *metadata_domain;
extern __itt_string_handle *string_handle_imbl;
extern __itt_string_handle *string_handle_loop;
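
kmp_itt.h above replaces the fixed 512-entry domain arrays with kmp_itthash_t, a 997-bucket hash of (source location, team size) pairs chained through next_in_bucket. The hash function and insertion policy live in kmp_itt.cpp and are not part of this hunk, so the following lookup sketch is an assumption about shape only, with a made-up bucket computation:

    #include <cstddef>

    struct ident; // stand-in for ident_t from kmp.h

    struct hash_entry {          // mirrors kmp_itthash_entry_t
      const ident *loc;
      int team_size;
      void *domain;              // __itt_domain * in the real structure
      hash_entry *next_in_bucket;
    };

    static hash_entry *find(hash_entry *const buckets[997], const ident *loc,
                            int team_size) {
      std::size_t b = ((std::size_t)loc / sizeof(void *)) % 997; // assumed hash
      for (hash_entry *e = buckets[b]; e; e = e->next_in_bucket)
        if (e->loc == loc && e->team_size == team_size)
          return e;   // reuse the existing __itt_domain
      return nullptr; // caller would allocate a new entry and bump .count
    }
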
diff --git a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp
index 59726f2b9f..fff7305b57 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp
@@ -96,12 +96,19 @@ __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
}
kmp_uint32 spins;
+ kmp_uint64 time;
KMP_FSYNC_PREPARE(lck);
KMP_INIT_YIELD(spins);
+ KMP_INIT_BACKOFF(time);
kmp_backoff_t backoff = __kmp_spin_backoff_params;
do {
+#if !KMP_HAVE_UMWAIT
__kmp_spin_backoff(&backoff);
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+#else
+ if (!__kmp_tpause_enabled)
+ __kmp_spin_backoff(&backoff);
+#endif
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
} while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
!__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy));
KMP_FSYNC_ACQUIRED(lck);
@@ -1344,14 +1351,15 @@ static int __kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
}
int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
- kmp_info_t *this_thr;
volatile kmp_int32 *head_id_p = &lck->lk.head_id;
volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
KA_TRACE(1000,
("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
KMP_DEBUG_ASSERT(gtid >= 0);
- this_thr = __kmp_thread_from_gtid(gtid);
+#if KMP_DEBUG || DEBUG_QUEUING_LOCKS
+ kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid);
+#endif
KMP_DEBUG_ASSERT(this_thr != NULL);
#ifdef DEBUG_QUEUING_LOCKS
TRACE_LOCK(gtid + 1, "rel ent");
@@ -2226,10 +2234,12 @@ __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
// The current implementation of KMP_WAIT doesn't allow for mask
// and poll to be re-read every spin iteration.
kmp_uint32 spins;
+ kmp_uint64 time;
KMP_FSYNC_PREPARE(lck);
KMP_INIT_YIELD(spins);
+ KMP_INIT_BACKOFF(time);
while (polls[ticket & mask] < ticket) { // atomic load
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
// Re-read the mask and the poll pointer from the lock structure.
//
// Make certain that "mask" is read before "polls" !!!
@@ -2658,9 +2668,17 @@ void __kmp_spin_backoff(kmp_backoff_t *boff) {
kmp_uint32 i;
for (i = boff->step; i > 0; i--) {
kmp_uint64 goal = __kmp_tsc() + boff->min_tick;
- do {
- KMP_CPU_PAUSE();
- } while (before(__kmp_tsc(), goal));
+#if KMP_HAVE_UMWAIT
+ if (__kmp_umwait_enabled) {
+ __kmp_tpause(0, boff->min_tick);
+ } else {
+#endif
+ do {
+ KMP_CPU_PAUSE();
+ } while (before(__kmp_tsc(), goal));
+#if KMP_HAVE_UMWAIT
+ }
+#endif
}
boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1);
}
@@ -3103,7 +3121,7 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
kmp_int32 gtid,
kmp_indirect_locktag_t tag) {
kmp_indirect_lock_t *lck;
- kmp_lock_index_t idx;
+ kmp_lock_index_t idx, table_idx;
__kmp_acquire_lock(&__kmp_global_lock, gtid);
@@ -3116,26 +3134,41 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n",
lck));
} else {
- idx = __kmp_i_lock_table.next;
- // Check capacity and double the size if it is full
- if (idx == __kmp_i_lock_table.size) {
- // Double up the space for block pointers
- int row = __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK;
- kmp_indirect_lock_t **new_table = (kmp_indirect_lock_t **)__kmp_allocate(
- 2 * row * sizeof(kmp_indirect_lock_t *));
- KMP_MEMCPY(new_table, __kmp_i_lock_table.table,
- row * sizeof(kmp_indirect_lock_t *));
- kmp_indirect_lock_t **old_table = __kmp_i_lock_table.table;
- __kmp_i_lock_table.table = new_table;
- __kmp_free(old_table);
- // Allocate new objects in the new blocks
- for (int i = row; i < 2 * row; ++i)
- *(__kmp_i_lock_table.table + i) = (kmp_indirect_lock_t *)__kmp_allocate(
- KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
- __kmp_i_lock_table.size = 2 * idx;
+ kmp_uint32 row, col;
+ kmp_indirect_lock_table_t *lock_table = &__kmp_i_lock_table;
+ idx = 0;
+ // Find location in list of lock tables to put new lock
+ while (1) {
+ table_idx = lock_table->next; // index within this table
+ idx += lock_table->next; // global index within list of tables
+ if (table_idx < lock_table->nrow_ptrs * KMP_I_LOCK_CHUNK) {
+ row = table_idx / KMP_I_LOCK_CHUNK;
+ col = table_idx % KMP_I_LOCK_CHUNK;
+ // Allocate a new row of locks if necessary
+ if (!lock_table->table[row]) {
+ lock_table->table[row] = (kmp_indirect_lock_t *)__kmp_allocate(
+ sizeof(kmp_indirect_lock_t) * KMP_I_LOCK_CHUNK);
+ }
+ break;
+ }
+ // Allocate a new lock table if necessary with double the capacity
+ if (!lock_table->next_table) {
+ kmp_indirect_lock_table_t *next_table =
+ (kmp_indirect_lock_table_t *)__kmp_allocate(
+ sizeof(kmp_indirect_lock_table_t));
+ next_table->table = (kmp_indirect_lock_t **)__kmp_allocate(
+ sizeof(kmp_indirect_lock_t *) * 2 * lock_table->nrow_ptrs);
+ next_table->nrow_ptrs = 2 * lock_table->nrow_ptrs;
+ next_table->next = 0;
+ next_table->next_table = nullptr;
+ lock_table->next_table = next_table;
+ }
+ lock_table = lock_table->next_table;
+ KMP_ASSERT(lock_table);
}
- __kmp_i_lock_table.next++;
- lck = KMP_GET_I_LOCK(idx);
+ lock_table->next++;
+
+ lck = &lock_table->table[row][col];
// Allocate a new base lock object
lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]);
KA_TRACE(20,
@@ -3166,10 +3199,7 @@ __kmp_lookup_indirect_lock(void **user_lock, const char *func) {
}
if (OMP_LOCK_T_SIZE < sizeof(void *)) {
kmp_lock_index_t idx = KMP_EXTRACT_I_INDEX(user_lock);
- if (idx >= __kmp_i_lock_table.size) {
- KMP_FATAL(LockIsUninitialized, func);
- }
- lck = KMP_GET_I_LOCK(idx);
+ lck = __kmp_get_i_lock(idx);
} else {
lck = *((kmp_indirect_lock_t **)user_lock);
}
@@ -3179,7 +3209,7 @@ __kmp_lookup_indirect_lock(void **user_lock, const char *func) {
return lck;
} else {
if (OMP_LOCK_T_SIZE < sizeof(void *)) {
- return KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(user_lock));
+ return __kmp_get_i_lock(KMP_EXTRACT_I_INDEX(user_lock));
} else {
return *((kmp_indirect_lock_t **)user_lock);
}
@@ -3189,13 +3219,13 @@ __kmp_lookup_indirect_lock(void **user_lock, const char *func) {
static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock,
kmp_dyna_lockseq_t seq) {
#if KMP_USE_ADAPTIVE_LOCKS
- if (seq == lockseq_adaptive && !__kmp_cpuinfo.rtm) {
+ if (seq == lockseq_adaptive && !__kmp_cpuinfo.flags.rtm) {
KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive");
seq = lockseq_queuing;
}
#endif
#if KMP_USE_TSX
- if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.rtm) {
+ if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.flags.rtm) {
seq = lockseq_queuing;
}
#endif
@@ -3322,12 +3352,13 @@ void __kmp_init_dynamic_user_locks() {
return;
// Initialize lock index table
- __kmp_i_lock_table.size = KMP_I_LOCK_CHUNK;
- __kmp_i_lock_table.table =
- (kmp_indirect_lock_t **)__kmp_allocate(sizeof(kmp_indirect_lock_t *));
+ __kmp_i_lock_table.nrow_ptrs = KMP_I_LOCK_TABLE_INIT_NROW_PTRS;
+ __kmp_i_lock_table.table = (kmp_indirect_lock_t **)__kmp_allocate(
+ sizeof(kmp_indirect_lock_t *) * KMP_I_LOCK_TABLE_INIT_NROW_PTRS);
*(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate(
KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
__kmp_i_lock_table.next = 0;
+ __kmp_i_lock_table.next_table = nullptr;
// Indirect lock size
__kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t);
@@ -3392,7 +3423,6 @@ void __kmp_init_dynamic_user_locks() {
// Clean up the lock table.
void __kmp_cleanup_indirect_user_locks() {
- kmp_lock_index_t i;
int k;
// Clean up locks in the pools first (they were already destroyed before going
@@ -3410,22 +3440,29 @@ void __kmp_cleanup_indirect_user_locks() {
__kmp_indirect_lock_pool[k] = NULL;
}
// Clean up the remaining undestroyed locks.
- for (i = 0; i < __kmp_i_lock_table.next; i++) {
- kmp_indirect_lock_t *l = KMP_GET_I_LOCK(i);
- if (l->lock != NULL) {
- // Locks not destroyed explicitly need to be destroyed here.
- KMP_I_LOCK_FUNC(l, destroy)(l->lock);
- KA_TRACE(
- 20,
- ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p from table\n",
- l));
- __kmp_free(l->lock);
+ kmp_indirect_lock_table_t *ptr = &__kmp_i_lock_table;
+ while (ptr) {
+ for (kmp_uint32 row = 0; row < ptr->nrow_ptrs; ++row) {
+ if (!ptr->table[row])
+ continue;
+ for (kmp_uint32 col = 0; col < KMP_I_LOCK_CHUNK; ++col) {
+ kmp_indirect_lock_t *l = &ptr->table[row][col];
+ if (l->lock) {
+ // Locks not destroyed explicitly need to be destroyed here.
+ KMP_I_LOCK_FUNC(l, destroy)(l->lock);
+ KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p "
+ "from table\n",
+ l));
+ __kmp_free(l->lock);
+ }
+ }
+ __kmp_free(ptr->table[row]);
}
+ kmp_indirect_lock_table_t *next_table = ptr->next_table;
+ if (ptr != &__kmp_i_lock_table)
+ __kmp_free(ptr);
+ ptr = next_table;
}
- // Free the table
- for (i = 0; i < __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; i++)
- __kmp_free(__kmp_i_lock_table.table[i]);
- __kmp_free(__kmp_i_lock_table.table);
__kmp_init_user_locks = FALSE;
}
diff --git a/contrib/libs/cxxsupp/openmp/kmp_lock.h b/contrib/libs/cxxsupp/openmp/kmp_lock.h
index 4f6ad6414e..a19f4ca323 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_lock.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_lock.h
@@ -651,12 +651,15 @@ extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
if (lck->tas.lk.poll != 0 || \
!__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
kmp_uint32 spins; \
+ kmp_uint64 time; \
KMP_FSYNC_PREPARE(lck); \
KMP_INIT_YIELD(spins); \
+ KMP_INIT_BACKOFF(time); \
do { \
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
- } while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq( \
- &lck->tas.lk.poll, 0, gtid + 1)); \
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \
+ } while ( \
+ lck->tas.lk.poll != 0 || \
+ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
} \
KMP_FSYNC_ACQUIRED(lck); \
} else { \
@@ -758,10 +761,12 @@ extern int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
if ((lck->tas.lk.poll != 0) || \
!__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
kmp_uint32 spins; \
+ kmp_uint64 time; \
KMP_FSYNC_PREPARE(lck); \
KMP_INIT_YIELD(spins); \
+ KMP_INIT_BACKOFF(time); \
do { \
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \
} while ( \
(lck->tas.lk.poll != 0) || \
!__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
@@ -1217,22 +1222,41 @@ extern kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])(
? __kmp_indirect_get_flags[(lck)->type]((lck)->lock) \
: NULL)
-#define KMP_I_LOCK_CHUNK \
- 1024 // number of kmp_indirect_lock_t objects to be allocated together
+// number of kmp_indirect_lock_t objects to be allocated together
+#define KMP_I_LOCK_CHUNK 1024
+// Keep at a power of 2 since it is used in multiplication & division
+KMP_BUILD_ASSERT(KMP_I_LOCK_CHUNK % 2 == 0);
+// number of row entries in the initial lock table
+#define KMP_I_LOCK_TABLE_INIT_NROW_PTRS 8
// Lock table for indirect locks.
typedef struct kmp_indirect_lock_table {
kmp_indirect_lock_t **table; // blocks of indirect locks allocated
- kmp_lock_index_t size; // size of the indirect lock table
+  kmp_uint32 nrow_ptrs; // number of *table pointer entries in table
kmp_lock_index_t next; // index to the next lock to be allocated
+ struct kmp_indirect_lock_table *next_table;
} kmp_indirect_lock_table_t;
extern kmp_indirect_lock_table_t __kmp_i_lock_table;
// Returns the indirect lock associated with the given index.
-#define KMP_GET_I_LOCK(index) \
- (*(__kmp_i_lock_table.table + (index) / KMP_I_LOCK_CHUNK) + \
- (index) % KMP_I_LOCK_CHUNK)
+// Returns nullptr if no lock at given index
+static inline kmp_indirect_lock_t *__kmp_get_i_lock(kmp_lock_index_t idx) {
+ kmp_indirect_lock_table_t *lock_table = &__kmp_i_lock_table;
+ while (lock_table) {
+ kmp_lock_index_t max_locks = lock_table->nrow_ptrs * KMP_I_LOCK_CHUNK;
+ if (idx < max_locks) {
+ kmp_lock_index_t row = idx / KMP_I_LOCK_CHUNK;
+ kmp_lock_index_t col = idx % KMP_I_LOCK_CHUNK;
+ if (!lock_table->table[row] || idx >= lock_table->next)
+ break;
+ return &lock_table->table[row][col];
+ }
+ idx -= max_locks;
+ lock_table = lock_table->next_table;
+ }
+ return nullptr;
+}
// Number of locks in a lock block, which is fixed to "1" now.
// TODO: No lock block implementation now. If we do support, we need to manage
@@ -1241,8 +1265,9 @@ extern int __kmp_num_locks_in_block;
// Fast lock table lookup without consistency checking
#define KMP_LOOKUP_I_LOCK(l) \
- ((OMP_LOCK_T_SIZE < sizeof(void *)) ? KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(l)) \
- : *((kmp_indirect_lock_t **)(l)))
+ ((OMP_LOCK_T_SIZE < sizeof(void *)) \
+ ? __kmp_get_i_lock(KMP_EXTRACT_I_INDEX(l)) \
+ : *((kmp_indirect_lock_t **)(l)))
// Used once in kmp_error.cpp
extern kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p, kmp_uint32);
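
With the chained kmp_indirect_lock_table_t above, a global lock index is resolved by walking the table list, subtracting each table's capacity of nrow_ptrs * KMP_I_LOCK_CHUNK until the index fits, then splitting the remainder into a row and a column; __kmp_allocate_indirect_lock doubles nrow_ptrs for every new table it appends. A tiny worked example of that arithmetic, assuming the initial 8-row table:

    #include <cstdio>

    int main() {
      const unsigned chunk = 1024; // KMP_I_LOCK_CHUNK
      unsigned nrow_ptrs = 8;      // KMP_I_LOCK_TABLE_INIT_NROW_PTRS
      unsigned idx = 9000;         // some global lock index
      unsigned table = 0;
      while (idx >= nrow_ptrs * chunk) { // skip tables that are too small
        idx -= nrow_ptrs * chunk;        // first table holds 8 * 1024 = 8192 locks
        nrow_ptrs *= 2;                  // each next_table doubles the row count
        ++table;
      }
      // Prints: table=1 row=0 col=808  (9000 - 8192 = 808)
      std::printf("table=%u row=%u col=%u\n", table, idx / chunk, idx % chunk);
      return 0;
    }
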
diff --git a/contrib/libs/cxxsupp/openmp/kmp_os.h b/contrib/libs/cxxsupp/openmp/kmp_os.h
index 4437cf2518..d71e9aecb3 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_os.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_os.h
@@ -1025,6 +1025,30 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
#define KMP_MB() /* nothing to do */
#endif
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#if KMP_COMPILER_ICC
+#define KMP_MFENCE_() _mm_mfence()
+#define KMP_SFENCE_() _mm_sfence()
+#elif KMP_COMPILER_MSVC
+#define KMP_MFENCE_() MemoryBarrier()
+#define KMP_SFENCE_() MemoryBarrier()
+#else
+#define KMP_MFENCE_() __sync_synchronize()
+#define KMP_SFENCE_() __sync_synchronize()
+#endif
+#define KMP_MFENCE() \
+ if (UNLIKELY(!__kmp_cpuinfo.initialized)) { \
+ __kmp_query_cpuid(&__kmp_cpuinfo); \
+ } \
+ if (__kmp_cpuinfo.flags.sse2) { \
+ KMP_MFENCE_(); \
+ }
+#define KMP_SFENCE() KMP_SFENCE_()
+#else
+#define KMP_MFENCE() KMP_MB()
+#define KMP_SFENCE() KMP_MB()
+#endif
+
#ifndef KMP_IMB
#define KMP_IMB() /* nothing to do */
#endif
diff --git a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
index fe931bb157..34f8a01743 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
@@ -107,6 +107,10 @@ static int __kmp_unregister_root_other_thread(int gtid);
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
+void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
+ int new_nthreads);
+void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
+
/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
@@ -910,7 +914,8 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
assured that there are enough threads available, because we checked on that
earlier within critical section forkjoin */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
- kmp_info_t *master_th, int master_gtid) {
+ kmp_info_t *master_th, int master_gtid,
+ int fork_teams_workers) {
int i;
int use_hot_team;
@@ -999,7 +1004,12 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
}
#if KMP_AFFINITY_SUPPORTED
- __kmp_partition_places(team);
+ // Do not partition the places list for teams construct workers who
+ // haven't actually been forked to do real work yet. This partitioning
+ // will take place in the parallel region nested within the teams construct.
+ if (!fork_teams_workers) {
+ __kmp_partition_places(team);
+ }
#endif
}
@@ -1204,7 +1214,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
this_thr->th.th_team = serial_team;
serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
- KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
+ KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
this_thr->th.th_current_task));
KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
this_thr->th.th_current_task->td_flags.executing = 0;
@@ -1563,15 +1573,24 @@ int __kmp_fork_call(ident_t *loc, int gtid,
/* Change number of threads in the team if requested */
if (master_set_numthreads) { // The parallel has num_threads clause
- if (master_set_numthreads < master_th->th.th_teams_size.nth) {
+ if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
// AC: only can reduce number of threads dynamically, can't increase
kmp_info_t **other_threads = parent_team->t.t_threads;
+ // NOTE: if using distributed barrier, we need to run this code block
+ // even when the team size appears not to have changed from the max.
+ int old_proc = master_th->th.th_teams_size.nth;
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
+ bp_dist_bar) {
+ __kmp_resize_dist_barrier(parent_team, old_proc,
+ master_set_numthreads);
+ __kmp_add_threads_to_team(parent_team, master_set_numthreads);
+ }
parent_team->t.t_nproc = master_set_numthreads;
for (i = 0; i < master_set_numthreads; ++i) {
other_threads[i]->th.th_team_nproc = master_set_numthreads;
}
- // Keep extra threads hot in the team for possible next parallels
}
+ // Keep extra threads hot in the team for possible next parallels
master_th->th.th_set_nproc = 0;
}
@@ -1584,6 +1603,41 @@ int __kmp_fork_call(ident_t *loc, int gtid,
}
#endif
+ // Figure out the proc_bind policy for the nested parallel within teams
+ kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
+ // proc_bind_default means don't update
+ kmp_proc_bind_t proc_bind_icv = proc_bind_default;
+ if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
+ proc_bind = proc_bind_false;
+ } else {
+ // No proc_bind clause specified; use current proc-bind-var
+ if (proc_bind == proc_bind_default) {
+ proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
+ }
+ /* else: The proc_bind policy was specified explicitly on parallel
+ clause.
+ This overrides proc-bind-var for this parallel region, but does not
+ change proc-bind-var. */
+ // Figure the value of proc-bind-var for the child threads.
+ if ((level + 1 < __kmp_nested_proc_bind.used) &&
+ (__kmp_nested_proc_bind.bind_types[level + 1] !=
+ master_th->th.th_current_task->td_icvs.proc_bind)) {
+ proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+ }
+ }
+ KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
+ // Need to change the bind-var ICV to correct value for each implicit task
+ if (proc_bind_icv != proc_bind_default &&
+ master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
+ kmp_info_t **other_threads = parent_team->t.t_threads;
+ for (i = 0; i < master_th->th.th_team_nproc; ++i) {
+ other_threads[i]->th.th_current_task->td_icvs.proc_bind =
+ proc_bind_icv;
+ }
+ }
+ // Reset for next parallel region
+ master_th->th.th_set_proc_bind = proc_bind_default;
+
#if USE_ITT_BUILD && USE_ITT_NOTIFY
if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
KMP_ITT_DEBUG) &&
@@ -1600,6 +1654,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
}
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+#if KMP_AFFINITY_SUPPORTED
+ __kmp_partition_places(parent_team);
+#endif
KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
"master_th=%p, gtid=%d\n",
@@ -1635,6 +1692,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
}
#endif
+ // Need this to happen before we determine the number of threads, not while
+ // we are allocating the team
+ //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
int enter_teams = 0;
if (parent_team->t.t_active_level >=
master_th->th.th_current_task->td_icvs.max_active_levels) {
@@ -1642,13 +1702,10 @@ int __kmp_fork_call(ident_t *loc, int gtid,
} else {
enter_teams = ((ap == NULL && active_level == 0) ||
(ap && teams_level > 0 && teams_level == level));
- nthreads =
- master_set_numthreads
- ? master_set_numthreads
- : get__nproc_2(
- parent_team,
- master_tid); // TODO: get nproc directly from current task
-
+ nthreads = master_set_numthreads
+ ? master_set_numthreads
+ // TODO: get nproc directly from current task
+ : get__nproc_2(parent_team, master_tid);
// Check if we need to take forkjoin lock? (no need for serialized
// parallel out of teams construct). This code moved here from
// __kmp_reserve_threads() to speedup nested serialized parallels.
@@ -1940,16 +1997,21 @@ int __kmp_fork_call(ident_t *loc, int gtid,
// Figure out the proc_bind_policy for the new team.
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
- kmp_proc_bind_t proc_bind_icv =
- proc_bind_default; // proc_bind_default means don't update
+ // proc_bind_default means don't update
+ kmp_proc_bind_t proc_bind_icv = proc_bind_default;
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
} else {
+ // No proc_bind clause specified; use current proc-bind-var for this
+ // parallel region
if (proc_bind == proc_bind_default) {
- // No proc_bind clause specified; use current proc-bind-var for this
- // parallel region
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
}
+ // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
+ if (master_th->th.th_teams_microtask &&
+ microtask == (microtask_t)__kmp_teams_master) {
+ proc_bind = __kmp_teams_proc_bind;
+ }
/* else: The proc_bind policy was specified explicitly on parallel clause.
This overrides proc-bind-var for this parallel region, but does not
change proc-bind-var. */
@@ -1957,7 +2019,11 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if ((level + 1 < __kmp_nested_proc_bind.used) &&
(__kmp_nested_proc_bind.bind_types[level + 1] !=
master_th->th.th_current_task->td_icvs.proc_bind)) {
- proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+ // Do not modify the proc bind icv for the two teams construct forks
+ // They just let the proc bind icv pass through
+ if (!master_th->th.th_teams_microtask ||
+ !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
+ proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
}
}
@@ -1983,6 +2049,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
#endif
proc_bind, &new_icvs,
argc USE_NESTED_HOT_ARG(master_th));
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
+ copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
} else {
/* allocate a new parallel team */
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
@@ -1993,6 +2061,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
proc_bind,
&master_th->th.th_current_task->td_icvs,
argc USE_NESTED_HOT_ARG(master_th));
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
+ copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
+ &master_th->th.th_current_task->td_icvs);
}
KF_TRACE(
10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
@@ -2124,7 +2195,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
root->r.r_active = TRUE;
- __kmp_fork_team_threads(root, team, master_th, gtid);
+ __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
__kmp_setup_icv_copy(team, nthreads,
&master_th->th.th_current_task->td_icvs, loc);
@@ -2359,6 +2430,12 @@ void __kmp_join_call(ident_t *loc, int gtid
parent_team->t.t_stack_id = NULL;
}
#endif
+
+ if (team->t.t_nproc > 1 &&
+ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ team->t.b->update_num_threads(team->t.t_nproc);
+ __kmp_add_threads_to_team(team, team->t.t_nproc);
+ }
}
KMP_MB();
@@ -2387,6 +2464,14 @@ void __kmp_join_call(ident_t *loc, int gtid
} // active_level == 1
#endif /* USE_ITT_BUILD */
+#if KMP_AFFINITY_SUPPORTED
+ if (!exit_teams) {
+ // Restore master thread's partition.
+ master_th->th.th_first_place = team->t.t_first_place;
+ master_th->th.th_last_place = team->t.t_last_place;
+ }
+#endif // KMP_AFFINITY_SUPPORTED
+
if (master_th->th.th_teams_microtask && !exit_teams &&
team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
team->t.t_level == master_th->th.th_teams_level + 1) {
@@ -2494,11 +2579,6 @@ void __kmp_join_call(ident_t *loc, int gtid
master_th, team));
__kmp_pop_current_task_from_thread(master_th);
-#if KMP_AFFINITY_SUPPORTED
- // Restore master thread's partition.
- master_th->th.th_first_place = team->t.t_first_place;
- master_th->th.th_last_place = team->t.t_last_place;
-#endif // KMP_AFFINITY_SUPPORTED
master_th->th.th_def_allocator = team->t.t_def_allocator;
#if OMPD_SUPPORT
@@ -2646,6 +2726,9 @@ void __kmp_set_num_threads(int new_nth, int gtid) {
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
+ }
// Release the extra threads we don't need any more.
for (f = new_nth; f < hot_team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
@@ -2665,6 +2748,11 @@ void __kmp_set_num_threads(int new_nth, int gtid) {
}
#endif
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ hot_team->t.b->update_num_threads(new_nth);
+ __kmp_add_threads_to_team(hot_team, new_nth);
+ }
+
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
// Update the t_nproc field in the threads that are still active.
@@ -4018,7 +4106,8 @@ void __kmp_unregister_root_current_thread(int gtid) {
kmp_task_team_t *task_team = thread->th.th_task_team;
// we need to wait for the proxy tasks before finishing the thread
- if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
+ if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
+ task_team->tt.tt_hidden_helper_task_encountered)) {
#if OMPT_SUPPORT
// the runtime is shutting down so we won't report any events
thread->th.ompt_thread_info.state = ompt_state_undefined;
@@ -4112,7 +4201,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
this_thr->th.th_team_nproc = team->t.t_nproc;
this_thr->th.th_team_master = master;
this_thr->th.th_team_serialized = team->t.t_serialized;
- TCW_PTR(this_thr->th.th_sleep_loc, NULL);
KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
@@ -4281,6 +4369,12 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
new_thr->th.th_task_state_top = 0;
new_thr->th.th_task_state_stack_sz = 4;
+ if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Make sure pool thread has transitioned to waiting on own thread struct
+ KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
+ // Thread activated in __kmp_allocate_team when increasing team size
+ }
+
#ifdef KMP_ADJUST_BLOCKTIME
/* Adjust blocktime back to zero if necessary */
/* Middle initialization might not have occurred yet */
@@ -4448,6 +4542,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
balign[b].bb.use_oncore_barrier = 0;
}
+ TCW_PTR(new_thr->th.th_sleep_loc, NULL);
+ new_thr->th.th_sleep_loc_type = flag_unset;
+
new_thr->th.th_spin_here = FALSE;
new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
@@ -4976,6 +5073,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
kmp_team_t *team;
int use_hot_team = !root->r.r_active;
int level = 0;
+ int do_place_partition = 1;
KA_TRACE(20, ("__kmp_allocate_team: called\n"));
KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
@@ -4997,6 +5095,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
++level; // not increment if #teams==1, or for outer fork of the teams;
// increment otherwise
}
+ // Do not perform the place partition if inner fork of the teams
+ // Wait until nested parallel region encountered inside teams construct
+ if ((master->th.th_teams_size.nteams == 1 &&
+ master->th.th_teams_level >= team->t.t_level) ||
+ (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
+ do_place_partition = 0;
}
hot_teams = master->th.th_hot_teams;
if (level < __kmp_hot_teams_max_level && hot_teams &&
@@ -5027,6 +5131,17 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
#endif
+ if (team->t.t_nproc != new_nproc &&
+ __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Distributed barrier may need a resize
+ int old_nthr = team->t.t_nproc;
+ __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
+ }
+
+ // If not doing the place partition, then reset the team's proc bind
+ // to indicate that partitioning of all threads still needs to take place
+ if (do_place_partition == 0)
+ team->t.t_proc_bind = proc_bind_default;
// Has the number of threads changed?
/* Let's assume the most common case is that the number of threads is
unchanged, and put that case first. */
@@ -5056,16 +5171,20 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
if ((team->t.t_size_changed == 0) &&
(team->t.t_proc_bind == new_proc_bind)) {
if (new_proc_bind == proc_bind_spread) {
- __kmp_partition_places(
- team, 1); // add flag to update only master for spread
+ if (do_place_partition) {
+ // add flag to update only master for spread
+ __kmp_partition_places(team, 1);
+ }
}
KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
"proc_bind = %d, partition = [%d,%d]\n",
team->t.t_id, new_proc_bind, team->t.t_first_place,
team->t.t_last_place));
} else {
- KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
- __kmp_partition_places(team);
+ if (do_place_partition) {
+ KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+ __kmp_partition_places(team);
+ }
}
#else
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
@@ -5076,6 +5195,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
new_nproc));
team->t.t_size_changed = 1;
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Barrier size already reduced earlier in this function
+ // Activate team threads via th_used_in_team
+ __kmp_add_threads_to_team(team, new_nproc);
+ }
#if KMP_NESTED_HOT_TEAMS
if (__kmp_hot_teams_mode == 0) {
// AC: saved number of threads should correspond to team's value in this
@@ -5137,10 +5261,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
#endif
- KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+ if (do_place_partition) {
+ KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
- __kmp_partition_places(team);
+ __kmp_partition_places(team);
#endif
+ }
} else { // team->t.t_nproc < new_nproc
#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
kmp_affin_mask_t *old_mask;
@@ -5152,7 +5278,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
KA_TRACE(20,
("__kmp_allocate_team: increasing hot team thread count to %d\n",
new_nproc));
-
+ int old_nproc = team->t.t_nproc; // save old value and use to update only
team->t.t_size_changed = 1;
#if KMP_NESTED_HOT_TEAMS
@@ -5179,10 +5305,9 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
team->t.t_nproc = new_nproc; // just get reserved threads involved
} else {
- // we may have some threads in reserve, but not enough
- team->t.t_nproc =
- hot_teams[level]
- .hot_team_nth; // get reserved threads involved if any
+ // We may have some threads in reserve, but not enough;
+ // get reserved threads involved if any.
+ team->t.t_nproc = hot_teams[level].hot_team_nth;
hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
#endif // KMP_NESTED_HOT_TEAMS
if (team->t.t_max_nproc < new_nproc) {
@@ -5237,8 +5362,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
#if KMP_NESTED_HOT_TEAMS
} // end of check of t_nproc vs. new_nproc vs. hot_team_nth
#endif // KMP_NESTED_HOT_TEAMS
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Barrier size already increased earlier in this function
+ // Activate team threads via th_used_in_team
+ __kmp_add_threads_to_team(team, new_nproc);
+ }
    /* make sure everyone is synchronized */
- int old_nproc = team->t.t_nproc; // save old value and use to update only
// new threads below
__kmp_initialize_team(team, new_nproc, new_icvs,
root->r.r_uber_thread->th.th_ident);
@@ -5273,10 +5402,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
#endif
- KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+ if (do_place_partition) {
+ KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
- __kmp_partition_places(team);
+ __kmp_partition_places(team);
#endif
+ }
} // Check changes in number of threads
kmp_info_t *master = team->t.t_threads[0];
@@ -5342,6 +5473,13 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
/* take this team from the team pool */
__kmp_team_pool = team->t.t_next_pool;
+ if (max_nproc > 1 &&
+ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ if (!team->t.b) { // Allocate barrier structure
+ team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
+ }
+ }
+
/* setup the team for fresh use */
__kmp_initialize_team(team, new_nproc, new_icvs, NULL);
@@ -5397,6 +5535,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
/* and set it up */
team->t.t_max_nproc = max_nproc;
+ if (max_nproc > 1 &&
+ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Allocate barrier structure
+ team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
+ }
+
/* NOTE well, for some reason allocating one big buffer and dividing it up
seems to really hurt performance a lot on the P4, so, let's not use this */
__kmp_allocate_team_arrays(team, max_nproc);
@@ -5469,7 +5613,6 @@ void __kmp_free_team(kmp_root_t *root,
int use_hot_team = team == root->r.r_hot_team;
#if KMP_NESTED_HOT_TEAMS
int level;
- kmp_hot_team_ptr_t *hot_teams;
if (master) {
level = team->t.t_active_level - 1;
if (master->th.th_teams_microtask) { // in teams construct?
@@ -5483,7 +5626,9 @@ void __kmp_free_team(kmp_root_t *root,
// team_of_workers before the parallel
} // team->t.t_level will be increased inside parallel
}
- hot_teams = master->th.th_hot_teams;
+#if KMP_DEBUG
+ kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
+#endif
if (level < __kmp_hot_teams_max_level) {
KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
use_hot_team = 1;
@@ -5553,10 +5698,43 @@ void __kmp_free_team(kmp_root_t *root,
/* free the worker threads */
for (f = 1; f < team->t.t_nproc; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
+ if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
+ 1, 2);
+ }
__kmp_free_thread(team->t.t_threads[f]);
+ }
+
+ if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ if (team->t.b) {
+ // wake up thread at old location
+ team->t.b->go_release();
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+ for (f = 1; f < team->t.t_nproc; ++f) {
+ if (team->t.b->sleep[f].sleep) {
+ __kmp_atomic_resume_64(
+ team->t.t_threads[f]->th.th_info.ds.ds_gtid,
+ (kmp_atomic_flag_64<> *)NULL);
+ }
+ }
+ }
+ // Wait for threads to be removed from team
+ for (int f = 1; f < team->t.t_nproc; ++f) {
+ while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
+ KMP_CPU_PAUSE();
+ }
+ }
+ }
+
+ for (f = 1; f < team->t.t_nproc; ++f) {
team->t.t_threads[f] = NULL;
}
+ if (team->t.t_max_nproc > 1 &&
+ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ distributedBarrier::deallocate(team->t.b);
+ team->t.b = NULL;
+ }
/* put the team back in the team pool */
/* TODO limit size of team pool, call reap_team if pool too large */
team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
@@ -5955,11 +6133,18 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
KA_TRACE(
20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
gtid));
- /* Need release fence here to prevent seg faults for tree forkjoin barrier
- * (GEH) */
- kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
- thread);
- __kmp_release_64(&flag);
+ if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ while (
+ !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
+ KMP_CPU_PAUSE();
+ __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
+ } else {
+ /* Need release fence here to prevent seg faults for tree forkjoin
+ barrier (GEH) */
+ kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
+ thread);
+ __kmp_release_64(&flag);
+ }
}
// Terminate OS thread.
@@ -6054,6 +6239,31 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
} // __kmp_reap_thread
+static void __kmp_itthash_clean(kmp_info_t *th) {
+#if USE_ITT_NOTIFY
+ if (__kmp_itt_region_domains.count > 0) {
+ for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
+ kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
+ while (bucket) {
+ kmp_itthash_entry_t *next = bucket->next_in_bucket;
+ __kmp_thread_free(th, bucket);
+ bucket = next;
+ }
+ }
+ }
+ if (__kmp_itt_barrier_domains.count > 0) {
+ for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
+ kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
+ while (bucket) {
+ kmp_itthash_entry_t *next = bucket->next_in_bucket;
+ __kmp_thread_free(th, bucket);
+ bucket = next;
+ }
+ }
+ }
+#endif
+}
+
static void __kmp_internal_end(void) {
int i;
@@ -6240,6 +6450,7 @@ void __kmp_internal_end_library(int gtid_req) {
gtid));
return;
} else {
+ __kmp_itthash_clean(__kmp_threads[gtid]);
KA_TRACE(
10,
("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
@@ -6486,7 +6697,7 @@ void __kmp_register_library_startup(void) {
char *value = NULL; // Actual value of the environment variable.
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
char *shm_name = __kmp_str_format("/%s", name);
int shm_preexist = 0;
char *data1;
@@ -6591,7 +6802,7 @@ void __kmp_register_library_startup(void) {
} break;
case 2: { // Neighbor is dead.
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
// close shared memory.
shm_unlink(shm_name); // this removes file in /dev/shm
#else
@@ -6605,7 +6816,7 @@ void __kmp_register_library_startup(void) {
}
}
KMP_INTERNAL_FREE((void *)value);
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
KMP_INTERNAL_FREE((void *)shm_name);
#endif
} // while
@@ -6618,7 +6829,7 @@ void __kmp_unregister_library(void) {
char *name = __kmp_reg_status_name();
char *value = NULL;
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
char *shm_name = __kmp_str_format("/%s", name);
int fd1 = shm_open(shm_name, O_RDONLY, 0666);
if (fd1 == -1) {
@@ -6639,14 +6850,14 @@ void __kmp_unregister_library(void) {
KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
// Ok, this is our variable. Delete it.
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
shm_unlink(shm_name); // this removes file in /dev/shm
#else
__kmp_env_unset(name);
#endif
}
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
KMP_INTERNAL_FREE(shm_name);
#endif
@@ -6684,7 +6895,9 @@ static void __kmp_check_mic_type() {
static void __kmp_user_level_mwait_init() {
struct kmp_cpuid buf;
__kmp_x86_cpuid(7, 0, &buf);
- __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
+ __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
+ __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
+ __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
__kmp_umwait_enabled));
}
@@ -6844,8 +7057,8 @@ static void __kmp_do_serial_initialize(void) {
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
-#define kmp_reduction_barrier_gather_pat bp_hyper_bar
-#define kmp_reduction_barrier_release_pat bp_hyper_bar
+#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
+#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
#endif // KMP_FAST_REDUCTION_BARRIER
for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
__kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
@@ -7500,6 +7713,11 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
num_threads = 1;
}
} else {
+ if (num_threads < 0) {
+ __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
+ __kmp_msg_null);
+ num_threads = 1;
+ }
// This thread will be the primary thread of the league primary threads
// Store new thread limit; old limit is saved in th_cg_roots list
thr->th.th_current_task->td_icvs.thread_limit = num_threads;
@@ -7531,9 +7749,13 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
int num_threads) {
kmp_info_t *thr = __kmp_threads[gtid];
- KMP_DEBUG_ASSERT(num_teams >= 0);
- KMP_DEBUG_ASSERT(num_threads >= 0);
-
+ if (num_teams < 0) {
+ // OpenMP specification requires requested values to be positive,
+ // but people can send us any value, so we'd better check
+ __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
+ __kmp_msg_null);
+ num_teams = 1;
+ }
if (num_teams == 0) {
if (__kmp_nteams > 0) {
num_teams = __kmp_nteams;
@@ -7590,7 +7812,7 @@ void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
} else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
num_teams = num_teams_ub;
} else { // num_teams_lb <= num_teams <= num_teams_ub
- if (num_threads == 0) {
+ if (num_threads <= 0) {
if (num_teams_ub > __kmp_teams_max_nth) {
num_teams = num_teams_lb;
} else {
@@ -8702,6 +8924,96 @@ void __kmp_omp_display_env(int verbose) {
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
+// The team size is changing, so distributed barrier must be modified
+void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
+ int new_nthreads) {
+ KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
+ bp_dist_bar);
+ kmp_info_t **other_threads = team->t.t_threads;
+
+ // We want all the workers to stop waiting on the barrier while we adjust the
+ // size of the team.
+ for (int f = 1; f < old_nthreads; ++f) {
+ KMP_DEBUG_ASSERT(other_threads[f] != NULL);
+ // Ignore threads that are already inactive or not present in the team
+ if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
+ // teams construct causes thread_limit to get passed in, and some of
+ // those could be inactive; just ignore them
+ continue;
+ }
+ // If thread is transitioning still to in_use state, wait for it
+ if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
+ while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
+ KMP_CPU_PAUSE();
+ }
+ // The thread should be in_use now
+ KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
+ // Transition to unused state
+ team->t.t_threads[f]->th.th_used_in_team.store(2);
+ KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
+ }
+ // Release all the workers
+ kmp_uint64 new_value; // new value for go
+ new_value = team->t.b->go_release();
+
+ KMP_MFENCE();
+
+ // Workers should see transition status 2 and move to 0; but may need to be
+ // woken up first
+ size_t my_go_index;
+ int count = old_nthreads - 1;
+ while (count > 0) {
+ count = old_nthreads - 1;
+ for (int f = 1; f < old_nthreads; ++f) {
+ my_go_index = f / team->t.b->threads_per_go;
+ if (other_threads[f]->th.th_used_in_team.load() != 0) {
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
+ kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
+ void *, other_threads[f]->th.th_sleep_loc);
+ __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
+ }
+ } else {
+ KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
+ count--;
+ }
+ }
+ }
+ // Now update the barrier size
+ team->t.b->update_num_threads(new_nthreads);
+ team->t.b->go_reset();
+}
+
+void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
+ // Add the threads back to the team
+ KMP_DEBUG_ASSERT(team);
+ // Threads were paused and pointed at th_used_in_team temporarily during a
+ // resize of the team. We're going to set th_used_in_team to 3 to indicate to
+ // the thread that it should transition itself back into the team. Then, if
+ // blocktime isn't infinite, the thread could be sleeping, so we send a resume
+ // to wake it up.
+ for (int f = 1; f < new_nthreads; ++f) {
+ KMP_DEBUG_ASSERT(team->t.t_threads[f]);
+ KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
+ 3);
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
+ __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
+ (kmp_flag_32<false, false> *)NULL);
+ }
+ }
+ // The threads should be transitioning to the team; when they are done, they
+ // should have set th_used_in_team to 1. This loop forces master to wait until
+ // all threads have moved into the team and are waiting in the barrier.
+ int count = new_nthreads - 1;
+ while (count > 0) {
+ count = new_nthreads - 1;
+ for (int f = 1; f < new_nthreads; ++f) {
+ if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
+ count--;
+ }
+ }
+ }
+}
+
// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
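An aside on the kmp_runtime.cpp hunks above (not part of the upstream diff): the distributed-barrier resize path coordinates workers through th_used_in_team, where 1 means the thread is in the team, 2 that it is being released, 3 that it is transitioning back in, and 0 that it sits idle in the pool. The standalone C++ sketch below models only the primary thread's side of __kmp_add_threads_to_team -- CAS 0 -> 3, wake the worker, then spin until it reports 1. All names are illustrative, a condition variable stands in for __kmp_resume_32, and nothing here is runtime code; build with -pthread.

#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <thread>

struct Worker {
  std::atomic<int> used_in_team{0}; // 0 idle, 1 in team, 2 leaving, 3 joining
  std::mutex m;
  std::condition_variable cv;
};

static void worker_loop(Worker &w) {
  std::unique_lock<std::mutex> lk(w.m);
  w.cv.wait(lk, [&] { return w.used_in_team.load() == 3; });
  w.used_in_team.store(1); // the worker moves itself into the team
}

int main() {
  Worker pool[4];
  std::thread threads[4];
  for (int i = 0; i < 4; ++i)
    threads[i] = std::thread(worker_loop, std::ref(pool[i]));
  for (Worker &w : pool) {
    {
      std::lock_guard<std::mutex> lk(w.m); // publish the transition safely
      int expected = 0;
      w.used_in_team.compare_exchange_strong(expected, 3); // like CAS 0 -> 3
    }
    w.cv.notify_one(); // stand-in for the __kmp_resume_32 wakeup
  }
  for (Worker &w : pool) // primary spins until every worker reports state 1
    while (w.used_in_team.load() != 1)
      std::this_thread::yield();
  for (std::thread &t : threads)
    t.join();
  std::puts("all workers joined the team");
  return 0;
}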
diff --git a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp
index 0b0973c766..112502fdce 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp
@@ -164,7 +164,12 @@ int __kmp_convert_to_milliseconds(char const *data) {
return (INT_MAX);
value = (double)0.0;
mult = '\0';
+#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT
+ // On Windows, each %c parameter needs additional size parameter for sscanf_s
+ nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, 1, &extra, 1);
+#else
nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, &extra);
+#endif
if (nvalues < 1)
return (-1);
if (nvalues == 1)
@@ -297,8 +302,8 @@ void __kmp_check_stksize(size_t *val) {
// if system stack size is too big then limit the size for worker threads
if (*val > KMP_DEFAULT_STKSIZE * 16) // just a heuristics...
*val = KMP_DEFAULT_STKSIZE * 16;
- if (*val < KMP_MIN_STKSIZE)
- *val = KMP_MIN_STKSIZE;
+ if (*val < __kmp_sys_min_stksize)
+ *val = __kmp_sys_min_stksize;
if (*val > KMP_MAX_STKSIZE)
*val = KMP_MAX_STKSIZE; // dead code currently, but may work in future
#if KMP_OS_DARWIN
@@ -426,6 +431,7 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value,
int *out_range, char *out_routine,
char *out_file, int *out_lb,
int *out_ub) {
+ const char *par_range_value;
size_t len = KMP_STRLEN(value) + 1;
par_range_to_print = (char *)KMP_INTERNAL_MALLOC(len + 1);
KMP_STRNCPY_S(par_range_to_print, len + 1, value, len + 1);
@@ -434,11 +440,14 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value,
__kmp_par_range_ub = INT_MAX;
for (;;) {
unsigned int len;
- if (*value == '\0') {
+ if (!value || *value == '\0') {
break;
}
if (!__kmp_strcasecmp_with_sentinel("routine", value, '=')) {
- value = strchr(value, '=') + 1;
+ par_range_value = strchr(value, '=') + 1;
+ if (!par_range_value)
+ goto par_range_error;
+ value = par_range_value;
len = __kmp_readstr_with_sentinel(out_routine, value,
KMP_PAR_RANGE_ROUTINE_LEN - 1, ',');
if (len == 0) {
@@ -451,7 +460,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value,
continue;
}
if (!__kmp_strcasecmp_with_sentinel("filename", value, '=')) {
- value = strchr(value, '=') + 1;
+ par_range_value = strchr(value, '=') + 1;
+ if (!par_range_value)
+ goto par_range_error;
+ value = par_range_value;
len = __kmp_readstr_with_sentinel(out_file, value,
KMP_PAR_RANGE_FILENAME_LEN - 1, ',');
if (len == 0) {
@@ -465,7 +477,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value,
}
if ((!__kmp_strcasecmp_with_sentinel("range", value, '=')) ||
(!__kmp_strcasecmp_with_sentinel("incl_range", value, '='))) {
- value = strchr(value, '=') + 1;
+ par_range_value = strchr(value, '=') + 1;
+ if (!par_range_value)
+ goto par_range_error;
+ value = par_range_value;
if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) {
goto par_range_error;
}
@@ -477,7 +492,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value,
continue;
}
if (!__kmp_strcasecmp_with_sentinel("excl_range", value, '=')) {
- value = strchr(value, '=') + 1;
+ par_range_value = strchr(value, '=') + 1;
+ if (!par_range_value)
+ goto par_range_error;
+ value = par_range_value;
if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) {
goto par_range_error;
}
@@ -1684,6 +1702,8 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value,
const char *var;
/* ---------- Barrier method control ------------ */
+ static int dist_req = 0, non_dist_req = 0;
+ static bool warn = 1;
for (int i = bs_plain_barrier; i < bs_last_barrier; i++) {
var = __kmp_barrier_pattern_env_name[i];
@@ -1695,6 +1715,11 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value,
for (j = bp_linear_bar; j < bp_last_bar; j++) {
if (__kmp_match_with_sentinel(__kmp_barrier_pattern_name[j], value, 1,
',')) {
+ if (j == bp_dist_bar) {
+ dist_req++;
+ } else {
+ non_dist_req++;
+ }
__kmp_barrier_gather_pattern[i] = (kmp_bar_pat_e)j;
break;
}
@@ -1709,6 +1734,11 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value,
if (comma != NULL) {
for (j = bp_linear_bar; j < bp_last_bar; j++) {
if (__kmp_str_match(__kmp_barrier_pattern_name[j], 1, comma + 1)) {
+ if (j == bp_dist_bar) {
+ dist_req++;
+ } else {
+ non_dist_req++;
+ }
__kmp_barrier_release_pattern[i] = (kmp_bar_pat_e)j;
break;
}
@@ -1723,6 +1753,20 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value,
}
}
}
+ if (dist_req != 0) {
+ // set all barriers to dist
+ if ((non_dist_req != 0) && warn) {
+ KMP_INFORM(BarrierPatternOverride, name,
+ __kmp_barrier_pattern_name[bp_dist_bar]);
+ warn = 0;
+ }
+ for (int i = bs_plain_barrier; i < bs_last_barrier; i++) {
+ if (__kmp_barrier_release_pattern[i] != bp_dist_bar)
+ __kmp_barrier_release_pattern[i] = bp_dist_bar;
+ if (__kmp_barrier_gather_pattern[i] != bp_dist_bar)
+ __kmp_barrier_gather_pattern[i] = bp_dist_bar;
+ }
+ }
} // __kmp_stg_parse_barrier_pattern
static void __kmp_stg_print_barrier_pattern(kmp_str_buf_t *buffer,
@@ -1739,7 +1783,7 @@ static void __kmp_stg_print_barrier_pattern(kmp_str_buf_t *buffer,
__kmp_str_buf_print(buffer, " %s='",
__kmp_barrier_pattern_env_name[i]);
}
- KMP_DEBUG_ASSERT(j < bs_last_barrier && k < bs_last_barrier);
+ KMP_DEBUG_ASSERT(j < bp_last_bar && k < bp_last_bar);
__kmp_str_buf_print(buffer, "%s,%s'\n", __kmp_barrier_pattern_name[j],
__kmp_barrier_pattern_name[k]);
}
@@ -3092,6 +3136,7 @@ static void __kmp_stg_parse_topology_method(char const *name, char const *value,
}
#if KMP_GROUP_AFFINITY
else if (__kmp_str_match("group", 1, value)) {
+ KMP_WARNING(StgDeprecatedValue, name, value, "all");
__kmp_affinity_top_method = affinity_top_method_group;
}
#endif /* KMP_GROUP_AFFINITY */
@@ -3155,6 +3200,47 @@ static void __kmp_stg_print_topology_method(kmp_str_buf_t *buffer,
}
} // __kmp_stg_print_topology_method
+// KMP_TEAMS_PROC_BIND
+struct kmp_proc_bind_info_t {
+ const char *name;
+ kmp_proc_bind_t proc_bind;
+};
+static kmp_proc_bind_info_t proc_bind_table[] = {
+ {"spread", proc_bind_spread},
+ {"true", proc_bind_spread},
+ {"close", proc_bind_close},
+ // teams-bind = false means "replicate the primary thread's affinity"
+ {"false", proc_bind_primary},
+ {"primary", proc_bind_primary}};
+static void __kmp_stg_parse_teams_proc_bind(char const *name, char const *value,
+ void *data) {
+ int valid;
+ const char *end;
+ valid = 0;
+ for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]);
+ ++i) {
+ if (__kmp_match_str(proc_bind_table[i].name, value, &end)) {
+ __kmp_teams_proc_bind = proc_bind_table[i].proc_bind;
+ valid = 1;
+ break;
+ }
+ }
+ if (!valid) {
+ KMP_WARNING(StgInvalidValue, name, value);
+ }
+}
+static void __kmp_stg_print_teams_proc_bind(kmp_str_buf_t *buffer,
+ char const *name, void *data) {
+ const char *value = KMP_I18N_STR(NotDefined);
+ for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]);
+ ++i) {
+ if (__kmp_teams_proc_bind == proc_bind_table[i].proc_bind) {
+ value = proc_bind_table[i].name;
+ break;
+ }
+ }
+ __kmp_stg_print_str(buffer, name, value);
+}
#endif /* KMP_AFFINITY_SUPPORTED */
// OMP_PROC_BIND / bind-var is functional on all 4.0 builds, including OS X*
@@ -4415,7 +4501,7 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value,
}
#if KMP_USE_ADAPTIVE_LOCKS
else if (__kmp_str_match("adaptive", 1, value)) {
- if (__kmp_cpuinfo.rtm) { // ??? Is cpuinfo available here?
+ if (__kmp_cpuinfo.flags.rtm) { // ??? Is cpuinfo available here?
__kmp_user_lock_kind = lk_adaptive;
KMP_STORE_LOCK_SEQ(adaptive);
} else {
@@ -4427,7 +4513,7 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value,
#endif // KMP_USE_ADAPTIVE_LOCKS
#if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX
else if (__kmp_str_match("rtm_queuing", 1, value)) {
- if (__kmp_cpuinfo.rtm) {
+ if (__kmp_cpuinfo.flags.rtm) {
__kmp_user_lock_kind = lk_rtm_queuing;
KMP_STORE_LOCK_SEQ(rtm_queuing);
} else {
@@ -4436,7 +4522,7 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value,
KMP_STORE_LOCK_SEQ(queuing);
}
} else if (__kmp_str_match("rtm_spin", 1, value)) {
- if (__kmp_cpuinfo.rtm) {
+ if (__kmp_cpuinfo.flags.rtm) {
__kmp_user_lock_kind = lk_rtm_spin;
KMP_STORE_LOCK_SEQ(rtm_spin);
} else {
@@ -4875,28 +4961,85 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
// Check each component
for (int i = 0; i < level; ++i) {
- int offset = 0;
- int num = atoi(components[i]); // each component should start with a number
- if (num <= 0) {
- goto err; // only positive integers are valid for count
- }
- if ((pos = strchr(components[i], '@'))) {
- offset = atoi(pos + 1); // save offset
- *pos = '\0'; // cut the offset from the component
- }
- pos = components[i] + strspn(components[i], digits);
- if (pos == components[i]) {
- goto err;
- }
- // detect the component type
- kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos);
- if (type == KMP_HW_UNKNOWN) {
- goto err;
- }
- if (__kmp_hw_subset->specified(type)) {
- goto err;
+ int core_level = 0;
+ char *core_components[MAX_T_LEVEL];
+ // Split possible core components by '&' delimiter
+ pos = components[i];
+ core_components[core_level++] = pos;
+ while ((pos = strchr(pos, '&'))) {
+ if (core_level >= MAX_T_LEVEL)
+ goto err; // too many different core types
+ *pos = '\0'; // modify input and avoid more copying
+ core_components[core_level++] = ++pos; // expect something after '&'
+ }
+
+ for (int j = 0; j < core_level; ++j) {
+ char *offset_ptr;
+ char *attr_ptr;
+ int offset = 0;
+ kmp_hw_attr_t attr;
+ int num;
+ // components may begin with an optional count of the number of resources
+ if (isdigit(*core_components[j])) {
+ num = atoi(core_components[j]);
+ if (num <= 0) {
+ goto err; // only positive integers are valid for count
+ }
+ pos = core_components[j] + strspn(core_components[j], digits);
+ } else if (*core_components[j] == '*') {
+ num = kmp_hw_subset_t::USE_ALL;
+ pos = core_components[j] + 1;
+ } else {
+ num = kmp_hw_subset_t::USE_ALL;
+ pos = core_components[j];
+ }
+
+ offset_ptr = strchr(core_components[j], '@');
+ attr_ptr = strchr(core_components[j], ':');
+
+ if (offset_ptr) {
+ offset = atoi(offset_ptr + 1); // save offset
+ *offset_ptr = '\0'; // cut the offset from the component
+ }
+ if (attr_ptr) {
+ attr.clear();
+ // save the attribute
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ if (__kmp_str_match("intel_core", -1, attr_ptr + 1)) {
+ attr.set_core_type(KMP_HW_CORE_TYPE_CORE);
+ } else if (__kmp_str_match("intel_atom", -1, attr_ptr + 1)) {
+ attr.set_core_type(KMP_HW_CORE_TYPE_ATOM);
+ }
+#endif
+ if (__kmp_str_match("eff", 3, attr_ptr + 1)) {
+ const char *number = attr_ptr + 1;
+ // skip the eff[iciency] token
+ while (isalpha(*number))
+ number++;
+ if (!isdigit(*number)) {
+ goto err;
+ }
+ int efficiency = atoi(number);
+ attr.set_core_eff(efficiency);
+ } else {
+ goto err;
+ }
+ *attr_ptr = '\0'; // cut the attribute from the component
+ }
+ // detect the component type
+ kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos);
+ if (type == KMP_HW_UNKNOWN) {
+ goto err;
+ }
+ // Only the core type can have attributes
+ if (attr && type != KMP_HW_CORE)
+ goto err;
+      // Must allow core to be specified more than once
+ if (type != KMP_HW_CORE && __kmp_hw_subset->specified(type)) {
+ goto err;
+ }
+ __kmp_hw_subset->push_back(num, type, offset, attr);
}
- __kmp_hw_subset->push_back(num, type, offset);
}
return;
err:
@@ -4908,6 +5051,21 @@ err:
return;
}
+static inline const char *
+__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) {
+ switch (type) {
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ return "unknown";
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return "intel_atom";
+ case KMP_HW_CORE_TYPE_CORE:
+ return "intel_core";
+#endif
+ }
+ return "unknown";
+}
+
static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name,
void *data) {
kmp_str_buf_t buf;
@@ -4923,10 +5081,20 @@ static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name,
depth = __kmp_hw_subset->get_depth();
for (int i = 0; i < depth; ++i) {
const auto &item = __kmp_hw_subset->at(i);
- __kmp_str_buf_print(&buf, "%s%d%s", (i > 0 ? "," : ""), item.num,
- __kmp_hw_get_keyword(item.type));
- if (item.offset)
- __kmp_str_buf_print(&buf, "@%d", item.offset);
+ if (i > 0)
+ __kmp_str_buf_print(&buf, "%c", ',');
+ for (int j = 0; j < item.num_attrs; ++j) {
+ __kmp_str_buf_print(&buf, "%s%d%s", (j > 0 ? "&" : ""), item.num[j],
+ __kmp_hw_get_keyword(item.type));
+ if (item.attr[j].is_core_type_valid())
+ __kmp_str_buf_print(
+ &buf, ":%s",
+ __kmp_hw_get_core_type_keyword(item.attr[j].get_core_type()));
+ if (item.attr[j].is_core_eff_valid())
+ __kmp_str_buf_print(&buf, ":eff%d", item.attr[j].get_core_eff());
+ if (item.offset[j])
+ __kmp_str_buf_print(&buf, "@%d", item.offset[j]);
+ }
}
__kmp_str_buf_print(buffer, "%s'\n", buf.str);
__kmp_str_buf_free(&buf);
@@ -5003,6 +5171,27 @@ static void __kmp_stg_print_mwait_hints(kmp_str_buf_t *buffer, char const *name,
#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
+#if KMP_HAVE_UMWAIT
+// -----------------------------------------------------------------------------
+// KMP_TPAUSE
+// 0 = don't use TPAUSE, 1 = use C0.1 state, 2 = use C0.2 state
+
+static void __kmp_stg_parse_tpause(char const *name, char const *value,
+ void *data) {
+ __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_tpause_state);
+ if (__kmp_tpause_state != 0) {
+ // The actual hint passed to tpause is: 0 for C0.2 and 1 for C0.1
+ if (__kmp_tpause_state == 2) // use C0.2
+ __kmp_tpause_hint = 0; // default was set to 1 for C0.1
+ }
+} // __kmp_stg_parse_tpause
+
+static void __kmp_stg_print_tpause(kmp_str_buf_t *buffer, char const *name,
+ void *data) {
+ __kmp_stg_print_int(buffer, name, __kmp_tpause_state);
+} // __kmp_stg_print_tpause
+#endif // KMP_HAVE_UMWAIT
+
// -----------------------------------------------------------------------------
// OMP_DISPLAY_ENV
@@ -5260,6 +5449,8 @@ static kmp_setting_t __kmp_stg_table[] = {
#endif /* KMP_GOMP_COMPAT */
{"OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind,
NULL, 0, 0},
+ {"KMP_TEAMS_PROC_BIND", __kmp_stg_parse_teams_proc_bind,
+ __kmp_stg_print_teams_proc_bind, NULL, 0, 0},
{"OMP_PLACES", __kmp_stg_parse_places, __kmp_stg_print_places, NULL, 0, 0},
{"KMP_TOPOLOGY_METHOD", __kmp_stg_parse_topology_method,
__kmp_stg_print_topology_method, NULL, 0, 0},
@@ -5366,6 +5557,10 @@ static kmp_setting_t __kmp_stg_table[] = {
{"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints,
__kmp_stg_print_mwait_hints, NULL, 0, 0},
#endif
+
+#if KMP_HAVE_UMWAIT
+ {"KMP_TPAUSE", __kmp_stg_parse_tpause, __kmp_stg_print_tpause, NULL, 0, 0},
+#endif
{"", NULL, NULL, NULL, 0, 0}}; // settings
static int const __kmp_stg_count =
@@ -5942,65 +6137,27 @@ void __kmp_env_initialize(char const *string) {
// Handle the Win 64 group affinity stuff if there are multiple
// processor groups, or if the user requested it, and OMP 4.0
// affinity is not in effect.
- if (((__kmp_num_proc_groups > 1) &&
- (__kmp_affinity_type == affinity_default) &&
- (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default)) ||
- (__kmp_affinity_top_method == affinity_top_method_group)) {
+ if (__kmp_num_proc_groups > 1 &&
+ __kmp_affinity_type == affinity_default &&
+ __kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
+ // Do not respect the initial processor affinity mask if it is assigned
+ // exactly one Windows Processor Group since this is interpreted as the
+ // default OS assignment. Not respecting the mask allows the runtime to
+ // use all the logical processors in all groups.
if (__kmp_affinity_respect_mask == affinity_respect_mask_default &&
exactly_one_group) {
__kmp_affinity_respect_mask = FALSE;
}
+ // Use compact affinity with anticipation of pinning to at least the
+ // group granularity since threads can only be bound to one group.
if (__kmp_affinity_type == affinity_default) {
__kmp_affinity_type = affinity_compact;
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
}
- if (__kmp_affinity_top_method == affinity_top_method_default) {
- if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
- __kmp_affinity_top_method = affinity_top_method_group;
- __kmp_affinity_gran = KMP_HW_PROC_GROUP;
- } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) {
- __kmp_affinity_top_method = affinity_top_method_group;
- } else {
- __kmp_affinity_top_method = affinity_top_method_all;
- }
- } else if (__kmp_affinity_top_method == affinity_top_method_group) {
- if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
- __kmp_affinity_gran = KMP_HW_PROC_GROUP;
- } else if ((__kmp_affinity_gran != KMP_HW_PROC_GROUP) &&
- (__kmp_affinity_gran != KMP_HW_THREAD)) {
- const char *str = __kmp_hw_get_keyword(__kmp_affinity_gran);
- KMP_WARNING(AffGranTopGroup, var, str);
- __kmp_affinity_gran = KMP_HW_THREAD;
- }
- } else {
- if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
- __kmp_affinity_gran = KMP_HW_CORE;
- } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) {
- const char *str = NULL;
- switch (__kmp_affinity_type) {
- case affinity_physical:
- str = "physical";
- break;
- case affinity_logical:
- str = "logical";
- break;
- case affinity_compact:
- str = "compact";
- break;
- case affinity_scatter:
- str = "scatter";
- break;
- case affinity_explicit:
- str = "explicit";
- break;
- // No MIC on windows, so no affinity_balanced case
- default:
- KMP_DEBUG_ASSERT(0);
- }
- KMP_WARNING(AffGranGroupType, var, str);
- __kmp_affinity_gran = KMP_HW_CORE;
- }
- }
+ if (__kmp_affinity_top_method == affinity_top_method_default)
+ __kmp_affinity_top_method = affinity_top_method_all;
+ if (__kmp_affinity_gran == KMP_HW_UNKNOWN)
+ __kmp_affinity_gran = KMP_HW_PROC_GROUP;
} else
#endif /* KMP_GROUP_AFFINITY */
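The settings hunks above add several new knobs: KMP_TEAMS_PROC_BIND (spread, close, primary, false, true), KMP_TPAUSE (0 = off, 1 = C0.1, 2 = C0.2 on waitpkg-capable CPUs), core attributes in KMP_HW_SUBSET, and promotion of all barriers to the distributed pattern once any of them requests it. The sketch below only illustrates selecting these from a test program before the runtime initializes; the accepted values come from the parsing code above, while the KMP_FORKJOIN_BARRIER_PATTERN name, the "dist" keyword, and the KMP_HW_SUBSET value are assumptions based on the surrounding identifiers. setenv is POSIX; exporting the variables in the shell is equivalent.

#include <cstdio>
#include <cstdlib>
#include <omp.h>

int main() {
  // Must run before the first OpenMP construct triggers runtime initialization.
  setenv("KMP_TEAMS_PROC_BIND", "spread", 1); // spread|close|primary|false|true
  setenv("KMP_TPAUSE", "2", 1);               // prefer TPAUSE in the C0.2 state
  // Assumed env-var name and keyword for the gather,release barrier patterns.
  setenv("KMP_FORKJOIN_BARRIER_PATTERN", "dist,dist", 1);
  // Core-attribute syntax as parsed above; its effect depends on the machine.
  setenv("KMP_HW_SUBSET", "4c:eff0,2t", 1);
  #pragma omp parallel
  {
    #pragma omp single
    std::printf("running with %d threads\n", omp_get_num_threads());
  }
  return 0;
}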
diff --git a/contrib/libs/cxxsupp/openmp/kmp_stats.h b/contrib/libs/cxxsupp/openmp/kmp_stats.h
index 78bbb9068a..0e3ea3b9cf 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_stats.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_stats.h
@@ -246,6 +246,8 @@ enum stats_state_e {
// KMP_tree_release -- time in __kmp_tree_barrier_release
// KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
// KMP_hyper_release -- time in __kmp_hyper_barrier_release
+// KMP_dist_gather -- time in __kmp_dist_barrier_gather
+// KMP_dist_release -- time in __kmp_dist_barrier_release
// clang-format off
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
macro(KMP_fork_call, 0, arg) \
@@ -255,6 +257,8 @@ enum stats_state_e {
macro(KMP_hier_release, 0, arg) \
macro(KMP_hyper_gather, 0, arg) \
macro(KMP_hyper_release, 0, arg) \
+ macro(KMP_dist_gather, 0, arg) \
+ macro(KMP_dist_release, 0, arg) \
macro(KMP_linear_gather, 0, arg) \
macro(KMP_linear_release, 0, arg) \
macro(KMP_tree_gather, 0, arg) \
diff --git a/contrib/libs/cxxsupp/openmp/kmp_str.cpp b/contrib/libs/cxxsupp/openmp/kmp_str.cpp
index ffce2b88ab..e64f989fbc 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_str.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_str.cpp
@@ -515,6 +515,31 @@ int __kmp_str_match(char const *target, int len, char const *data) {
return ((len > 0) ? i >= len : (!target[i] && (len || !data[i])));
} // __kmp_str_match
+// Returns true if data contains target (case-insensitive), false otherwise.
+// len must be the length of target.
+bool __kmp_str_contains(char const *target, int len, char const *data) {
+ int i = 0, j = 0, start = 0;
+ if (target == NULL || data == NULL) {
+ return FALSE;
+ }
+ while (target[i]) {
+ if (!data[j])
+ return FALSE;
+ if (TOLOWER(target[i]) != TOLOWER(data[j])) {
+ j = start + 1;
+ start = j;
+ i = 0;
+ } else {
+ if (i == 0)
+ start = j;
+ j++;
+ i++;
+ }
+ }
+
+ return i == len;
+} // __kmp_str_contains
+
int __kmp_str_match_false(char const *data) {
int result =
__kmp_str_match("false", 1, data) || __kmp_str_match("off", 2, data) ||
diff --git a/contrib/libs/cxxsupp/openmp/kmp_str.h b/contrib/libs/cxxsupp/openmp/kmp_str.h
index ff6179908e..855b5df55d 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_str.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_str.h
@@ -106,6 +106,7 @@ int __kmp_str_eqf(char const *lhs, char const *rhs);
char *__kmp_str_format(char const *format, ...);
void __kmp_str_free(char **str);
int __kmp_str_match(char const *target, int len, char const *data);
+bool __kmp_str_contains(char const *target, int len, char const *data);
int __kmp_str_match_false(char const *data);
int __kmp_str_match_true(char const *data);
void __kmp_str_replace(char *str, char search_for, char replace_with);
diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp
index dd3e7688d3..501830eaa7 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp
@@ -86,6 +86,7 @@ static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread,
h->buckets = (kmp_dephash_entry **)(h + 1);
h->generation = gen;
h->nconflicts = 0;
+ h->last_all = current_dephash->last_all;
// make sure buckets are properly initialized
for (size_t i = 0; i < new_size; i++) {
@@ -142,6 +143,7 @@ static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread,
h->nelements = 0;
h->nconflicts = 0;
h->buckets = (kmp_dephash_entry **)(h + 1);
+ h->last_all = NULL;
for (size_t i = 0; i < h_size; i++)
h->buckets[i] = 0;
@@ -174,7 +176,10 @@ static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread,
thread, sizeof(kmp_dephash_entry_t));
#endif
entry->addr = addr;
- entry->last_out = NULL;
+ if (!h->last_all) // no predecessor task with omp_all_memory dependence
+ entry->last_out = NULL;
+ else // else link the omp_all_memory depnode to the new entry
+ entry->last_out = __kmp_node_ref(h->last_all);
entry->last_set = NULL;
entry->prev_set = NULL;
entry->last_flag = 0;
@@ -290,6 +295,63 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
return npredecessors;
}
+static inline kmp_int32
+__kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h,
+ bool dep_barrier, kmp_task_t *task) {
+ KA_TRACE(30, ("__kmp_process_dep_all: T#%d processing dep_all, "
+ "dep_barrier = %d\n",
+ gtid, dep_barrier));
+ kmp_info_t *thread = __kmp_threads[gtid];
+ kmp_int32 npredecessors = 0;
+
+ // process previous omp_all_memory node if any
+ npredecessors +=
+ __kmp_depnode_link_successor(gtid, thread, task, node, h->last_all);
+ __kmp_node_deref(thread, h->last_all);
+ if (!dep_barrier) {
+ h->last_all = __kmp_node_ref(node);
+ } else {
+ // if this is a sync point in the serial sequence, then the previous
+ // outputs are guaranteed to be completed after the execution of this
+ // task so the previous output nodes can be cleared.
+ h->last_all = NULL;
+ }
+
+ // process all regular dependences
+ for (size_t i = 0; i < h->size; i++) {
+ kmp_dephash_entry_t *info = h->buckets[i];
+ if (!info) // skip empty slots in dephash
+ continue;
+ for (; info; info = info->next_in_bucket) {
+ // for each entry the omp_all_memory works as OUT dependence
+ kmp_depnode_t *last_out = info->last_out;
+ kmp_depnode_list_t *last_set = info->last_set;
+ kmp_depnode_list_t *prev_set = info->prev_set;
+ if (last_set) {
+ npredecessors +=
+ __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
+ __kmp_depnode_list_free(thread, last_set);
+ __kmp_depnode_list_free(thread, prev_set);
+ info->last_set = NULL;
+ info->prev_set = NULL;
+ info->last_flag = 0; // no sets in this dephash entry
+ } else {
+ npredecessors +=
+ __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
+ }
+ __kmp_node_deref(thread, last_out);
+ if (!dep_barrier) {
+ info->last_out = __kmp_node_ref(node);
+ } else {
+ info->last_out = NULL;
+ }
+ }
+ }
+ KA_TRACE(30, ("__kmp_process_dep_all: T#%d found %d predecessors\n", gtid,
+ npredecessors));
+ return npredecessors;
+}
+
template <bool filter>
static inline kmp_int32
__kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
@@ -417,7 +479,7 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
kmp_depend_info_t *dep_list,
kmp_int32 ndeps_noalias,
kmp_depend_info_t *noalias_dep_list) {
- int i, n_mtxs = 0;
+ int i, n_mtxs = 0, dep_all = 0;
#if KMP_DEBUG
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
#endif
@@ -429,7 +491,8 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
// Filter deps in dep_list
// TODO: Different algorithm for large dep_list ( > 10 ? )
for (i = 0; i < ndeps; i++) {
- if (dep_list[i].base_addr != 0) {
+ if (dep_list[i].base_addr != 0 &&
+ dep_list[i].base_addr != (kmp_intptr_t)KMP_SIZE_T_MAX) {
KMP_DEBUG_ASSERT(
dep_list[i].flag == KMP_DEP_IN || dep_list[i].flag == KMP_DEP_OUT ||
dep_list[i].flag == KMP_DEP_INOUT ||
@@ -451,6 +514,13 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
dep_list[i].flag = KMP_DEP_OUT; // downgrade mutexinoutset to inout
}
}
+ } else if (dep_list[i].flag == KMP_DEP_ALL ||
+ dep_list[i].base_addr == (kmp_intptr_t)KMP_SIZE_T_MAX) {
+ // omp_all_memory dependence can be marked by compiler by either
+ // (addr=0 && flag=0x80) (flag KMP_DEP_ALL), or (addr=-1).
+ // omp_all_memory overrides all other dependences if any
+ dep_all = 1;
+ break;
}
}
@@ -464,10 +534,14 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
// the end
int npredecessors;
- npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, ndeps,
- dep_list, task);
- npredecessors += __kmp_process_deps<false>(
- gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task);
+ if (!dep_all) { // regular dependences
+ npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier,
+ ndeps, dep_list, task);
+ npredecessors += __kmp_process_deps<false>(
+ gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task);
+ } else { // omp_all_memory dependence
+ npredecessors = __kmp_process_dep_all(gtid, node, *hash, dep_barrier, task);
+ }
node->dn.task = task;
KMP_MB();
@@ -755,8 +829,10 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
bool ignore = current_task->td_flags.team_serial ||
current_task->td_flags.tasking_ser ||
current_task->td_flags.final;
- ignore = ignore && thread->th.th_task_team != NULL &&
- thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE;
+ ignore =
+ ignore && thread->th.th_task_team != NULL &&
+ thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE &&
+ thread->th.th_task_team->tt.tt_hidden_helper_task_encountered == FALSE;
ignore = ignore || current_task->td_dephash == NULL;
if (ignore) {
diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h
index 73abf07018..99f182bbd0 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h
@@ -73,6 +73,8 @@ static inline void __kmp_dephash_free_entries(kmp_info_t *thread,
h->buckets[i] = 0;
}
}
+ __kmp_node_deref(thread, h->last_all);
+ h->last_all = NULL;
}
static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) {
@@ -144,9 +146,10 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
// encountering thread's queue; otherwise, it can be pushed to its own
// queue.
if (!next_taskdata->td_flags.hidden_helper) {
- __kmpc_give_task(
- successor->dn.task,
- __kmp_tid_from_gtid(next_taskdata->encountering_gtid));
+ kmp_int32 encountering_gtid =
+ next_taskdata->td_alloc_thread->th.th_info.ds.ds_gtid;
+ kmp_int32 encountering_tid = __kmp_tid_from_gtid(encountering_gtid);
+ __kmpc_give_task(successor->dn.task, encountering_tid);
} else {
__kmp_omp_task(gtid, successor->dn.task, false);
}
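The kmp_taskdeps changes above (the last_all field and __kmp_process_dep_all) implement the OpenMP 5.1 omp_all_memory reserved locator for task dependences. A minimal, hedged usage sketch follows, assuming a compiler new enough to accept omp_all_memory in a depend clause:

#include <cstdio>
#include <omp.h>

int main() {
  int a = 0, b = 0;
  #pragma omp parallel
  #pragma omp single
  {
    #pragma omp task depend(out : a)
    a = 1;
    #pragma omp task depend(out : b)
    b = 2;
    // Ordered after every earlier dependent task, and later dependent tasks
    // wait for it -- the ordering __kmp_process_dep_all wires up above.
    #pragma omp task depend(inout : omp_all_memory)
    std::printf("a=%d b=%d\n", a, b);
    #pragma omp task depend(in : a)
    std::printf("a is still %d\n", a);
  }
  return 0;
}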
diff --git a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp
index 55e9c30763..e445438524 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp
@@ -324,10 +324,16 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
- // We don't need to map to shadow gtid if it is already hidden helper thread
- if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
- gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
- thread = __kmp_threads[gtid];
+ // If we encounter a hidden helper task, and the current thread is not a
+ // hidden helper thread, we have to give the task to any hidden helper thread
+ // starting from its shadow one.
+ if (UNLIKELY(taskdata->td_flags.hidden_helper &&
+ !KMP_HIDDEN_HELPER_THREAD(gtid))) {
+ kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
+ __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
+ // Signal the hidden helper threads.
+ __kmp_hidden_helper_worker_thread_signal();
+ return TASK_SUCCESSFULLY_PUSHED;
}
kmp_task_team_t *task_team = thread->th.th_task_team;
@@ -434,16 +440,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
gtid, taskdata, thread_data->td.td_deque_ntasks,
thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
- auto hidden_helper = taskdata->td_flags.hidden_helper;
-
__kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
- // Signal one worker thread to execute the task
- if (UNLIKELY(hidden_helper)) {
- // Wake hidden helper threads up if they're sleeping
- __kmp_hidden_helper_worker_thread_signal();
- }
-
return TASK_SUCCESSFULLY_PUSHED;
}
@@ -809,6 +807,24 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
gtid, taskdata, children));
}
+// Only need to keep track of child task counts if any of the following:
+// 1. team parallel and tasking not serialized;
+// 2. it is a proxy or detachable or hidden helper task
+// 3. the children counter of its parent task is greater than 0.
+// The reason for the 3rd one is a serialized team that encountered a detached
+// or hidden helper task T. In that case, the execution of T is still deferred,
+// and a regular task may also depend on T; if we don't track the children,
+// task synchronization will be broken.
+static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
+ kmp_tasking_flags_t flags = taskdata->td_flags;
+ bool ret = !(flags.team_serial || flags.tasking_ser);
+ ret = ret || flags.proxy == TASK_PROXY ||
+ flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
+ ret = ret ||
+ KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
+ return ret;
+}
+
// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
@@ -825,8 +841,9 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
kmp_info_t *thread = __kmp_threads[gtid];
kmp_task_team_t *task_team =
thread->th.th_task_team; // might be NULL for serial teams...
+#if KMP_DEBUG
kmp_int32 children = 0;
-
+#endif
KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
"task %p\n",
gtid, taskdata, resumed_task));
@@ -934,16 +951,15 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
if (ompt)
__ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif
-
- // Only need to keep track of count if team parallel and tasking not
- // serialized, or task is detachable and event has already been fulfilled
- if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
- taskdata->td_flags.detachable == TASK_DETACHABLE ||
- taskdata->td_flags.hidden_helper) {
+ // TODO: What would be the balance between the conditions in the function
+ // and an atomic operation?
+ if (__kmp_track_children_task(taskdata)) {
__kmp_release_deps(gtid, taskdata);
// Predecrement simulated by "- 1" calculation
- children =
- KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
+#if KMP_DEBUG
+ children = -1 +
+#endif
+ KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
KMP_DEBUG_ASSERT(children >= 0);
if (taskdata->td_taskgroup)
KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
@@ -1189,7 +1205,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task;
kmp_taskdata_t *taskdata;
kmp_info_t *thread = __kmp_threads[gtid];
- kmp_info_t *encountering_thread = thread;
kmp_team_t *team = thread->th.th_team;
kmp_taskdata_t *parent_task = thread->th.th_current_task;
size_t shareds_offset;
@@ -1201,15 +1216,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
if (__kmp_enable_hidden_helper) {
if (!TCR_4(__kmp_init_hidden_helper))
__kmp_hidden_helper_initialize();
-
- // For a hidden helper task encountered by a regular thread, we will push
- // the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper
- // thread.
- if (!KMP_HIDDEN_HELPER_THREAD(gtid)) {
- thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
- // We don't change the parent-child relation for hidden helper task as
- // we need that to do per-task-region synchronization.
- }
} else {
// If the hidden helper task is not enabled, reset the flag to FALSE.
flags->hidden_helper = FALSE;
@@ -1232,8 +1238,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
// Untied task encountered causes the TSC algorithm to check entire deque of
// the victim thread. If no untied task encountered, then checking the head
// of the deque should be enough.
- KMP_CHECK_UPDATE(
- encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
+ KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
}
// Detachable tasks are not proxy tasks yet but could be in the future. Doing
@@ -1247,32 +1252,30 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
}
/* are we running in a sequential parallel or tskm_immediate_exec... we need
tasking support enabled */
- if ((encountering_thread->th.th_task_team) == NULL) {
+ if ((thread->th.th_task_team) == NULL) {
/* This should only happen if the team is serialized
setup a task team and propagate it to the thread */
KMP_DEBUG_ASSERT(team->t.t_serialized);
KA_TRACE(30,
("T#%d creating task team in __kmp_task_alloc for proxy task\n",
gtid));
- __kmp_task_team_setup(
- encountering_thread, team,
- 1); // 1 indicates setup the current team regardless of nthreads
- encountering_thread->th.th_task_team =
- team->t.t_task_team[encountering_thread->th.th_task_state];
+  // 1 indicates to set up the current team regardless of nthreads
+ __kmp_task_team_setup(thread, team, 1);
+ thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
}
- kmp_task_team_t *task_team = encountering_thread->th.th_task_team;
+ kmp_task_team_t *task_team = thread->th.th_task_team;
/* tasking must be enabled now as the task might not be pushed */
if (!KMP_TASKING_ENABLED(task_team)) {
KA_TRACE(
30,
("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
- __kmp_enable_tasking(task_team, encountering_thread);
- kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid;
+ __kmp_enable_tasking(task_team, thread);
+ kmp_int32 tid = thread->th.th_info.ds.ds_tid;
kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
// No lock needed since only owner can allocate
if (thread_data->td.td_deque == NULL) {
- __kmp_alloc_task_deque(encountering_thread, thread_data);
+ __kmp_alloc_task_deque(thread, thread_data);
}
}
@@ -1297,11 +1300,11 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
- taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(
- encountering_thread, shareds_offset + sizeof_shareds);
+ taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
+ sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
- taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
- encountering_thread, shareds_offset + sizeof_shareds);
+ taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
+ sizeof_shareds);
#endif /* USE_FAST_MEMORY */
task = KMP_TASKDATA_TO_TASK(taskdata);
@@ -1328,7 +1331,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
taskdata->td_task_id = KMP_GEN_TASK_ID();
taskdata->td_team = thread->th.th_team;
- taskdata->td_alloc_thread = encountering_thread;
+ taskdata->td_alloc_thread = thread;
taskdata->td_parent = parent_task;
taskdata->td_level = parent_task->td_level + 1; // increment nesting level
KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
@@ -1342,10 +1345,16 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
taskdata->td_flags = *flags;
- taskdata->encountering_gtid = gtid;
taskdata->td_task_team = thread->th.th_task_team;
taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
taskdata->td_flags.tasktype = TASK_EXPLICIT;
+  // If it is a hidden helper task, we need to set the team and task team
+  // accordingly.
+ if (flags->hidden_helper) {
+ kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
+ taskdata->td_team = shadow_thread->th.th_team;
+ taskdata->td_task_team = shadow_thread->th.th_task_team;
+ }
// GEH - TODO: fix this to copy parent task's value of tasking_ser flag
taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
@@ -1382,11 +1391,9 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
if (UNLIKELY(ompt_enabled.enabled))
__ompt_task_init(taskdata, gtid);
#endif
- // Only need to keep track of child task counts if team parallel and tasking
- // not serialized or if it is a proxy or detachable or hidden helper task
- if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
- flags->hidden_helper ||
- !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
+  // TODO: Weigh the cost of the checks in __kmp_track_children_task against
+  // performing the atomic operation unconditionally.
+ if (__kmp_track_children_task(taskdata)) {
KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
if (parent_task->td_taskgroup)
KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
@@ -1438,11 +1445,12 @@ kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
size_t sizeof_shareds,
kmp_routine_entry_t task_entry,
kmp_int64 device_id) {
- if (__kmp_enable_hidden_helper) {
- auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
+ auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
+  // A target task is defined as untied by the specification.
+ input_flags.tiedness = TASK_UNTIED;
+
+ if (__kmp_enable_hidden_helper)
input_flags.hidden_helper = TRUE;
- input_flags.tiedness = TASK_UNTIED;
- }
return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
sizeof_shareds, task_entry);
@@ -1613,13 +1621,15 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif
+ if (task->routine != NULL) {
#ifdef KMP_GOMP_COMPAT
- if (taskdata->td_flags.native) {
- ((void (*)(void *))(*(task->routine)))(task->shareds);
- } else
+ if (taskdata->td_flags.native) {
+ ((void (*)(void *))(*(task->routine)))(task->shareds);
+ } else
#endif /* KMP_GOMP_COMPAT */
- {
- (*(task->routine))(gtid, task);
+ {
+ (*(task->routine))(gtid, task);
+ }
}
KMP_POP_PARTITIONED_TIMER();
@@ -2833,15 +2843,14 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
// We need to un-mark this victim as a finished victim. This must be done
// before releasing the lock, or else other threads (starting with the
// primary thread victim) might be prematurely released from the barrier!!!
- kmp_int32 count;
-
- count = KMP_ATOMIC_INC(unfinished_threads);
-
+#if KMP_DEBUG
+ kmp_int32 count =
+#endif
+ KMP_ATOMIC_INC(unfinished_threads);
KA_TRACE(
20,
("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
gtid, count + 1, task_team));
-
*thread_finished = FALSE;
}
TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
@@ -2948,8 +2957,7 @@ static inline int __kmp_execute_tasks_template(
(TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
NULL)) {
asleep = 1;
- __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
- other_thread->th.th_sleep_loc);
+ __kmp_null_resume_wrapper(other_thread);
// A sleeping thread should not have any tasks on it's queue.
// There is a slight possibility that it resumes, steals a task
// from another thread, which spawns more tasks, all in the time
@@ -3034,9 +3042,10 @@ static inline int __kmp_execute_tasks_template(
// done. This decrement might be to the spin location, and result in the
// termination condition being satisfied.
if (!*thread_finished) {
- kmp_int32 count;
-
- count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
+#if KMP_DEBUG
+ kmp_int32 count = -1 +
+#endif
+ KMP_ATOMIC_DEC(unfinished_threads);
KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
"unfinished_threads to %d task_team=%p\n",
gtid, count, task_team));
@@ -3065,6 +3074,18 @@ static inline int __kmp_execute_tasks_template(
return FALSE;
}
+  // Check the flag again in case it is already done, to avoid getting trapped
+  // in an infinite loop when an if0 task depends on a hidden helper task
+  // outside any parallel region. Detached tasks are not affected here because
+  // the only thread executing this function has to execute the proxy task, so
+  // it takes another code path that has the same check.
+ if (flag == NULL || (!final_spin && flag->done_check())) {
+ KA_TRACE(15,
+ ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
+ gtid));
+ return TRUE;
+ }
+
// We could be getting tasks from target constructs; if this is the only
// thread, keep trying to execute tasks from own queue
if (nthreads == 1 &&
@@ -3098,6 +3119,16 @@ int __kmp_execute_tasks_64(
thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
+template <bool C, bool S>
+int __kmp_atomic_execute_tasks_64(
+ kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
+ int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+ kmp_int32 is_constrained) {
+ return __kmp_execute_tasks_template(
+ thread, gtid, flag, final_spin,
+ thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+}
+
int __kmp_execute_tasks_oncore(
kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
@@ -3124,6 +3155,14 @@ template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
int *USE_ITT_BUILD_ARG(void *),
kmp_int32);
+template int __kmp_atomic_execute_tasks_64<false, true>(
+ kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
+ int *USE_ITT_BUILD_ARG(void *), kmp_int32);
+
+template int __kmp_atomic_execute_tasks_64<true, false>(
+ kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
+ int *USE_ITT_BUILD_ARG(void *), kmp_int32);
+
// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
@@ -3162,7 +3201,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
// tasks and execute them. In extra barrier mode, tasks do not sleep
// at the separate tasking barrier, so this isn't a problem.
for (i = 0; i < nthreads; i++) {
- volatile void *sleep_loc;
+ void *sleep_loc;
kmp_info_t *thread = threads_data[i].td.td_thr;
if (i == this_thr->th.th_info.ds.ds_tid) {
@@ -3179,7 +3218,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
__kmp_gtid_from_thread(this_thr),
__kmp_gtid_from_thread(thread)));
- __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
+ __kmp_null_resume_wrapper(thread);
} else {
KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
__kmp_gtid_from_thread(this_thr),
@@ -3451,6 +3490,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
TCW_4(task_team->tt.tt_found_tasks, FALSE);
TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+ TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
@@ -3512,9 +3552,11 @@ void __kmp_reap_task_teams(void) {
void __kmp_wait_to_unref_task_teams(void) {
kmp_info_t *thread;
kmp_uint32 spins;
+ kmp_uint64 time;
int done;
KMP_INIT_YIELD(spins);
+ KMP_INIT_BACKOFF(time);
for (;;) {
done = TRUE;
@@ -3547,7 +3589,7 @@ void __kmp_wait_to_unref_task_teams(void) {
__kmp_gtid_from_thread(thread)));
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
- volatile void *sleep_loc;
+ void *sleep_loc;
// If the thread is sleeping, awaken it.
if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
NULL) {
@@ -3555,7 +3597,7 @@ void __kmp_wait_to_unref_task_teams(void) {
10,
("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
__kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
- __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
+ __kmp_null_resume_wrapper(thread);
}
}
}
@@ -3564,7 +3606,7 @@ void __kmp_wait_to_unref_task_teams(void) {
}
// If oversubscribed or have waited a bit, yield.
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
}
}
@@ -3613,6 +3655,7 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
TCW_4(task_team->tt.tt_found_tasks, FALSE);
TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+ TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
team->t.t_nproc);
TCW_4(task_team->tt.tt_active, TRUE);
@@ -3705,8 +3748,10 @@ void __kmp_task_team_wait(
"setting active to false, setting local and team's pointer to NULL\n",
__kmp_gtid_from_thread(this_thr), task_team));
KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
- task_team->tt.tt_found_proxy_tasks == TRUE);
+ task_team->tt.tt_found_proxy_tasks == TRUE ||
+ task_team->tt.tt_hidden_helper_task_encountered == TRUE);
TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+ TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
TCW_SYNC_4(task_team->tt.tt_active, FALSE);
KMP_MB();
@@ -3869,11 +3914,12 @@ static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
}
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
+#if KMP_DEBUG
kmp_int32 children = 0;
-
// Predecrement simulated by "- 1" calculation
- children =
- KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
+ children = -1 +
+#endif
+ KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
KMP_DEBUG_ASSERT(children >= 0);
// Remove the imaginary children
@@ -3936,7 +3982,7 @@ void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
// This should be similar to start_k = __kmp_get_random( thread ) % nthreads
// but we cannot use __kmp_get_random here
- kmp_int32 start_k = start;
+ kmp_int32 start_k = start % nthreads;
kmp_int32 pass = 1;
kmp_int32 k = start_k;
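
The final kmp_tasking.cpp hunk reduces the caller-supplied start index modulo nthreads before the round-robin walk over the deques. A self-contained sketch of that idiom with hypothetical names, assuming a non-negative start and a non-empty container:

#include <vector>

// Visit every slot once, starting from an arbitrary non-negative index that
// may be out of range; return the first index accepted by pred, or -1.
template <class Pred>
int round_robin_find(const std::vector<int> &slots, int start, Pred pred) {
  const int n = static_cast<int>(slots.size()); // assumed > 0
  int k = start % n; // keep the starting index inside [0, n) before scanning
  for (int pass = 0; pass < n; ++pass) {
    if (pred(slots[k]))
      return k;
    k = (k + 1) % n; // wrap around
  }
  return -1; // nothing matched
}
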
diff --git a/contrib/libs/cxxsupp/openmp/kmp_utility.cpp b/contrib/libs/cxxsupp/openmp/kmp_utility.cpp
index 6531536f5d..9465f720e0 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_utility.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_utility.cpp
@@ -135,7 +135,7 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) {
p->initialized = 1;
- p->sse2 = 1; // Assume SSE2 by default.
+ p->flags.sse2 = 1; // Assume SSE2 by default.
__kmp_x86_cpuid(0, 0, &buf);
@@ -175,7 +175,7 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) {
data[i] = (t & 0xff);
}
- p->sse2 = (buf.edx >> 26) & 1;
+ p->flags.sse2 = (buf.edx >> 26) & 1;
#ifdef KMP_DEBUG
@@ -253,15 +253,21 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) {
i, buf.eax, buf.ebx, buf.ecx, buf.edx));
}
#endif
-#if KMP_USE_ADAPTIVE_LOCKS
- p->rtm = 0;
+ p->flags.rtm = 0;
+ p->flags.hybrid = 0;
if (max_arg > 7) {
/* RTM bit CPUID.07:EBX, bit 11 */
+    /* HYBRID bit CPUID.07:EDX, bit 15 */
__kmp_x86_cpuid(7, 0, &buf);
- p->rtm = (buf.ebx >> 11) & 1;
- KA_TRACE(trace_level, (" RTM"));
+ p->flags.rtm = (buf.ebx >> 11) & 1;
+ p->flags.hybrid = (buf.edx >> 15) & 1;
+ if (p->flags.rtm) {
+ KA_TRACE(trace_level, (" RTM"));
+ }
+ if (p->flags.hybrid) {
+ KA_TRACE(trace_level, (" HYBRID"));
+ }
}
-#endif
}
{ // Parse CPU brand string for frequency, saving the string for later.
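
The kmp_utility.cpp change reads the RTM and hybrid-CPU bits from CPUID leaf 7 unconditionally instead of only under KMP_USE_ADAPTIVE_LOCKS. A minimal x86-only sketch of probing those two bits with the GCC/Clang <cpuid.h> helper; the bit positions are the ones stated in the hunk above:

#include <cpuid.h>
#include <cstdio>

int main() {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  // Leaf 7, subleaf 0: structured extended feature flags.
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
    int rtm = (ebx >> 11) & 1;    // RTM: CPUID.07H:EBX[11]
    int hybrid = (edx >> 15) & 1; // hybrid part: CPUID.07H:EDX[15]
    std::printf("RTM=%d HYBRID=%d\n", rtm, hybrid);
  }
  return 0;
}
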
diff --git a/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp b/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp
index cabb5722f4..d41ddf231e 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp
@@ -33,6 +33,10 @@ template <bool C, bool S>
void __kmp_mwait_64(int th_gtid, kmp_flag_64<C, S> *flag) {
__kmp_mwait_template(th_gtid, flag);
}
+template <bool C, bool S>
+void __kmp_atomic_mwait_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag) {
+ __kmp_mwait_template(th_gtid, flag);
+}
void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) {
__kmp_mwait_template(th_gtid, flag);
}
@@ -40,4 +44,8 @@ void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) {
template void __kmp_mwait_32<false, false>(int, kmp_flag_32<false, false> *);
template void __kmp_mwait_64<false, true>(int, kmp_flag_64<false, true> *);
template void __kmp_mwait_64<true, false>(int, kmp_flag_64<true, false> *);
+template void
+__kmp_atomic_mwait_64<false, true>(int, kmp_atomic_flag_64<false, true> *);
+template void
+__kmp_atomic_mwait_64<true, false>(int, kmp_atomic_flag_64<true, false> *);
#endif
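
kmp_wait_release.cpp defines the mwait templates once and explicitly instantiates only the specializations the runtime uses. A generic sketch of that explicit-instantiation technique with hypothetical names, not the runtime's own declarations:

// wait.h -- declaration only; callers see no template body.
template <bool C, bool S> void wait_on_flag(int gtid);
extern template void wait_on_flag<false, true>(int);
extern template void wait_on_flag<true, false>(int);

// wait.cpp -- one definition plus the explicit instantiations others link to.
template <bool C, bool S> void wait_on_flag(int gtid) {
  (void)gtid; // ... spin, sleep, or hand off depending on C / S
}
template void wait_on_flag<false, true>(int);
template void wait_on_flag<true, false>(int);
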
diff --git a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h
index d528ce9f18..b32cb15de1 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h
@@ -33,96 +33,288 @@ higher level operations such as barriers and fork/join.
@{
*/
-/*!
- * The flag_type describes the storage used for the flag.
- */
-enum flag_type {
- flag32, /**< 32 bit flags */
- flag64, /**< 64 bit flags */
- flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */
-};
-
struct flag_properties {
unsigned int type : 16;
unsigned int reserved : 16;
};
-/*!
- * Base class for wait/release volatile flag
- */
-template <typename P> class kmp_flag_native {
- volatile P *loc;
- flag_properties t;
+template <enum flag_type FlagType> struct flag_traits {};
+
+template <> struct flag_traits<flag32> {
+ typedef kmp_uint32 flag_t;
+ static const flag_type t = flag32;
+ static inline flag_t tcr(flag_t f) { return TCR_4(f); }
+ static inline flag_t test_then_add4(volatile flag_t *f) {
+ return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f));
+ }
+ static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
+ return KMP_TEST_THEN_OR32(f, v);
+ }
+ static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
+ return KMP_TEST_THEN_AND32(f, v);
+ }
+};
+
+template <> struct flag_traits<atomic_flag64> {
+ typedef kmp_uint64 flag_t;
+ static const flag_type t = atomic_flag64;
+ static inline flag_t tcr(flag_t f) { return TCR_8(f); }
+ static inline flag_t test_then_add4(volatile flag_t *f) {
+ return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f));
+ }
+ static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
+ return KMP_TEST_THEN_OR64(f, v);
+ }
+ static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
+ return KMP_TEST_THEN_AND64(f, v);
+ }
+};
+
+template <> struct flag_traits<flag64> {
+ typedef kmp_uint64 flag_t;
+ static const flag_type t = flag64;
+ static inline flag_t tcr(flag_t f) { return TCR_8(f); }
+ static inline flag_t test_then_add4(volatile flag_t *f) {
+ return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f));
+ }
+ static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
+ return KMP_TEST_THEN_OR64(f, v);
+ }
+ static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
+ return KMP_TEST_THEN_AND64(f, v);
+ }
+};
+
+template <> struct flag_traits<flag_oncore> {
+ typedef kmp_uint64 flag_t;
+ static const flag_type t = flag_oncore;
+ static inline flag_t tcr(flag_t f) { return TCR_8(f); }
+ static inline flag_t test_then_add4(volatile flag_t *f) {
+ return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f));
+ }
+ static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
+ return KMP_TEST_THEN_OR64(f, v);
+ }
+ static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
+ return KMP_TEST_THEN_AND64(f, v);
+ }
+};
+
+/*! Base class for all flags */
+template <flag_type FlagType> class kmp_flag {
+protected:
+ flag_properties t; /**< "Type" of the flag in loc */
+ kmp_info_t *waiting_threads[1]; /**< Threads sleeping on this thread. */
+ kmp_uint32 num_waiting_threads; /**< #threads sleeping on this thread. */
+ std::atomic<bool> *sleepLoc;
public:
- typedef P flag_t;
- kmp_flag_native(volatile P *p, flag_type ft)
- : loc(p), t({(short unsigned int)ft, 0U}) {}
- volatile P *get() { return loc; }
- void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); }
- void set(volatile P *new_loc) { loc = new_loc; }
+ typedef flag_traits<FlagType> traits_type;
+ kmp_flag() : t({FlagType, 0U}), num_waiting_threads(0), sleepLoc(nullptr) {}
+ kmp_flag(int nwaiters)
+ : t({FlagType, 0U}), num_waiting_threads(nwaiters), sleepLoc(nullptr) {}
+ kmp_flag(std::atomic<bool> *sloc)
+ : t({FlagType, 0U}), num_waiting_threads(0), sleepLoc(sloc) {}
+ /*! @result the flag_type */
flag_type get_type() { return (flag_type)(t.type); }
- P load() { return *loc; }
- void store(P val) { *loc = val; }
+
+  /*! @param i in index into waiting_threads
+ * @result the thread that is waiting at index i */
+ kmp_info_t *get_waiter(kmp_uint32 i) {
+ KMP_DEBUG_ASSERT(i < num_waiting_threads);
+ return waiting_threads[i];
+ }
+ /*! @result num_waiting_threads */
+ kmp_uint32 get_num_waiters() { return num_waiting_threads; }
+ /*! @param thr in the thread which is now waiting
+ * Insert a waiting thread at index 0. */
+ void set_waiter(kmp_info_t *thr) {
+ waiting_threads[0] = thr;
+ num_waiting_threads = 1;
+ }
+ enum barrier_type get_bt() { return bs_last_barrier; }
};
-/*!
- * Base class for wait/release atomic flag
- */
-template <typename P> class kmp_flag {
- std::atomic<P>
- *loc; /**< Pointer to the flag storage that is modified by another thread
- */
- flag_properties t; /**< "Type" of the flag in loc */
+/*! Base class for wait/release volatile flag */
+template <typename PtrType, flag_type FlagType, bool Sleepable>
+class kmp_flag_native : public kmp_flag<FlagType> {
+protected:
+ volatile PtrType *loc;
+ PtrType checker; /**< When flag==checker, it has been released. */
+ typedef flag_traits<FlagType> traits_type;
+
public:
- typedef P flag_t;
- kmp_flag(std::atomic<P> *p, flag_type ft)
- : loc(p), t({(short unsigned int)ft, 0U}) {}
- /*!
- * @result the pointer to the actual flag
- */
- std::atomic<P> *get() { return loc; }
- /*!
- * @result void* pointer to the actual flag
- */
+ typedef PtrType flag_t;
+ kmp_flag_native(volatile PtrType *p) : kmp_flag<FlagType>(), loc(p) {}
+ kmp_flag_native(volatile PtrType *p, kmp_info_t *thr)
+ : kmp_flag<FlagType>(1), loc(p) {
+ this->waiting_threads[0] = thr;
+ }
+ kmp_flag_native(volatile PtrType *p, PtrType c)
+ : kmp_flag<FlagType>(), loc(p), checker(c) {}
+ kmp_flag_native(volatile PtrType *p, PtrType c, std::atomic<bool> *sloc)
+ : kmp_flag<FlagType>(sloc), loc(p), checker(c) {}
+ virtual ~kmp_flag_native() {}
+ void *operator new(size_t size) { return __kmp_allocate(size); }
+ void operator delete(void *p) { __kmp_free(p); }
+ volatile PtrType *get() { return loc; }
+ void *get_void_p() { return RCAST(void *, CCAST(PtrType *, loc)); }
+ void set(volatile PtrType *new_loc) { loc = new_loc; }
+ PtrType load() { return *loc; }
+ void store(PtrType val) { *loc = val; }
+ /*! @result true if the flag object has been released. */
+ virtual bool done_check() {
+ if (Sleepable && !(this->sleepLoc))
+ return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) ==
+ checker;
+ else
+ return traits_type::tcr(*(this->get())) == checker;
+ }
+ /*! @param old_loc in old value of flag
+ * @result true if the flag's old value indicates it was released. */
+ virtual bool done_check_val(PtrType old_loc) { return old_loc == checker; }
+ /*! @result true if the flag object is not yet released.
+ * Used in __kmp_wait_template like:
+ * @code
+ * while (flag.notdone_check()) { pause(); }
+ * @endcode */
+ virtual bool notdone_check() {
+ return traits_type::tcr(*(this->get())) != checker;
+ }
+ /*! @result Actual flag value before release was applied.
+ * Trigger all waiting threads to run by modifying flag to release state. */
+ void internal_release() {
+ (void)traits_type::test_then_add4((volatile PtrType *)this->get());
+ }
+ /*! @result Actual flag value before sleep bit(s) set.
+ * Notes that there is at least one thread sleeping on the flag by setting
+ * sleep bit(s). */
+ PtrType set_sleeping() {
+ if (this->sleepLoc) {
+ this->sleepLoc->store(true);
+ return *(this->get());
+ }
+ return traits_type::test_then_or((volatile PtrType *)this->get(),
+ KMP_BARRIER_SLEEP_STATE);
+ }
+ /*! @result Actual flag value before sleep bit(s) cleared.
+ * Notes that there are no longer threads sleeping on the flag by clearing
+ * sleep bit(s). */
+ void unset_sleeping() {
+ if (this->sleepLoc) {
+ this->sleepLoc->store(false);
+ return;
+ }
+ traits_type::test_then_and((volatile PtrType *)this->get(),
+ ~KMP_BARRIER_SLEEP_STATE);
+ }
+ /*! @param old_loc in old value of flag
+ * Test if there are threads sleeping on the flag's old value in old_loc. */
+ bool is_sleeping_val(PtrType old_loc) {
+ if (this->sleepLoc)
+ return this->sleepLoc->load();
+ return old_loc & KMP_BARRIER_SLEEP_STATE;
+ }
+ /*! Test whether there are threads sleeping on the flag. */
+ bool is_sleeping() {
+ if (this->sleepLoc)
+ return this->sleepLoc->load();
+ return is_sleeping_val(*(this->get()));
+ }
+ bool is_any_sleeping() {
+ if (this->sleepLoc)
+ return this->sleepLoc->load();
+ return is_sleeping_val(*(this->get()));
+ }
+ kmp_uint8 *get_stolen() { return NULL; }
+};
+
+/*! Base class for wait/release atomic flag */
+template <typename PtrType, flag_type FlagType, bool Sleepable>
+class kmp_flag_atomic : public kmp_flag<FlagType> {
+protected:
+ std::atomic<PtrType> *loc; /**< Pointer to flag location to wait on */
+ PtrType checker; /**< Flag == checker means it has been released. */
+public:
+ typedef flag_traits<FlagType> traits_type;
+ typedef PtrType flag_t;
+ kmp_flag_atomic(std::atomic<PtrType> *p) : kmp_flag<FlagType>(), loc(p) {}
+ kmp_flag_atomic(std::atomic<PtrType> *p, kmp_info_t *thr)
+ : kmp_flag<FlagType>(1), loc(p) {
+ this->waiting_threads[0] = thr;
+ }
+ kmp_flag_atomic(std::atomic<PtrType> *p, PtrType c)
+ : kmp_flag<FlagType>(), loc(p), checker(c) {}
+ kmp_flag_atomic(std::atomic<PtrType> *p, PtrType c, std::atomic<bool> *sloc)
+ : kmp_flag<FlagType>(sloc), loc(p), checker(c) {}
+ /*! @result the pointer to the actual flag */
+ std::atomic<PtrType> *get() { return loc; }
+ /*! @result void* pointer to the actual flag */
void *get_void_p() { return RCAST(void *, loc); }
- /*!
- * @param new_loc in set loc to point at new_loc
- */
- void set(std::atomic<P> *new_loc) { loc = new_loc; }
- /*!
- * @result the flag_type
- */
- flag_type get_type() { return (flag_type)(t.type); }
- /*!
- * @result flag value
- */
- P load() { return loc->load(std::memory_order_acquire); }
- /*!
- * @param val the new flag value to be stored
- */
- void store(P val) { loc->store(val, std::memory_order_release); }
- // Derived classes must provide the following:
- /*
- kmp_info_t * get_waiter(kmp_uint32 i);
- kmp_uint32 get_num_waiters();
- bool done_check();
- bool done_check_val(P old_loc);
- bool notdone_check();
- P internal_release();
- void suspend(int th_gtid);
- void mwait(int th_gtid);
- void resume(int th_gtid);
- P set_sleeping();
- P unset_sleeping();
- bool is_sleeping();
- bool is_any_sleeping();
- bool is_sleeping_val(P old_loc);
- int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
- int *thread_finished
- USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32
- is_constrained);
- */
+ /*! @param new_loc in set loc to point at new_loc */
+ void set(std::atomic<PtrType> *new_loc) { loc = new_loc; }
+ /*! @result flag value */
+ PtrType load() { return loc->load(std::memory_order_acquire); }
+ /*! @param val the new flag value to be stored */
+ void store(PtrType val) { loc->store(val, std::memory_order_release); }
+ /*! @result true if the flag object has been released. */
+ bool done_check() {
+ if (Sleepable && !(this->sleepLoc))
+ return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker;
+ else
+ return this->load() == checker;
+ }
+ /*! @param old_loc in old value of flag
+ * @result true if the flag's old value indicates it was released. */
+ bool done_check_val(PtrType old_loc) { return old_loc == checker; }
+ /*! @result true if the flag object is not yet released.
+ * Used in __kmp_wait_template like:
+ * @code
+ * while (flag.notdone_check()) { pause(); }
+ * @endcode */
+ bool notdone_check() { return this->load() != checker; }
+ /*! @result Actual flag value before release was applied.
+ * Trigger all waiting threads to run by modifying flag to release state. */
+ void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); }
+ /*! @result Actual flag value before sleep bit(s) set.
+ * Notes that there is at least one thread sleeping on the flag by setting
+ * sleep bit(s). */
+ PtrType set_sleeping() {
+ if (this->sleepLoc) {
+ this->sleepLoc->store(true);
+ return *(this->get());
+ }
+ return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE);
+ }
+ /*! @result Actual flag value before sleep bit(s) cleared.
+ * Notes that there are no longer threads sleeping on the flag by clearing
+ * sleep bit(s). */
+ void unset_sleeping() {
+ if (this->sleepLoc) {
+ this->sleepLoc->store(false);
+ return;
+ }
+ KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE);
+ }
+ /*! @param old_loc in old value of flag
+ * Test whether there are threads sleeping on flag's old value in old_loc. */
+ bool is_sleeping_val(PtrType old_loc) {
+ if (this->sleepLoc)
+ return this->sleepLoc->load();
+ return old_loc & KMP_BARRIER_SLEEP_STATE;
+ }
+ /*! Test whether there are threads sleeping on the flag. */
+ bool is_sleeping() {
+ if (this->sleepLoc)
+ return this->sleepLoc->load();
+ return is_sleeping_val(this->load());
+ }
+ bool is_any_sleeping() {
+ if (this->sleepLoc)
+ return this->sleepLoc->load();
+ return is_sleeping_val(this->load());
+ }
+ kmp_uint8 *get_stolen() { return NULL; }
};
#if OMPT_SUPPORT
@@ -185,6 +377,7 @@ __kmp_wait_template(kmp_info_t *this_thr,
#else
kmp_uint32 hibernate;
#endif
+ kmp_uint64 time;
KMP_FSYNC_SPIN_INIT(spin, NULL);
if (flag->done_check()) {
@@ -264,8 +457,9 @@ final_spin=FALSE)
ompt_entry_state = this_thr->th.ompt_thread_info.state;
if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
- ompt_lw_taskteam_t *team =
- this_thr->th.th_team->t.ompt_serialized_team_info;
+ ompt_lw_taskteam_t *team = NULL;
+ if (this_thr->th.th_team)
+ team = this_thr->th.th_team->t.ompt_serialized_team_info;
if (team) {
tId = &(team->ompt_task_info.task_data);
} else {
@@ -283,6 +477,7 @@ final_spin=FALSE)
#endif
KMP_INIT_YIELD(spins); // Setup for waiting
+ KMP_INIT_BACKOFF(time);
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
__kmp_pause_status == kmp_soft_paused) {
@@ -340,11 +535,11 @@ final_spin=FALSE)
disabled (KMP_TASKING=0). */
if (task_team != NULL) {
if (TCR_SYNC_4(task_team->tt.tt_active)) {
- if (KMP_TASKING_ENABLED(task_team))
+ if (KMP_TASKING_ENABLED(task_team)) {
flag->execute_tasks(
this_thr, th_gtid, final_spin,
&tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
- else
+ } else
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
} else {
KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid));
@@ -370,7 +565,7 @@ final_spin=FALSE)
// If we are oversubscribed, or have waited a bit (and
// KMP_LIBRARY=throughput), then yield
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
#if KMP_STATS_ENABLED
// Check if thread has been signalled to idle state
@@ -557,6 +752,7 @@ static inline void __kmp_mwait_template(int th_gtid, C *flag) {
else {
// if flag changes here, wake-up happens immediately
TCW_PTR(th->th.th_sleep_loc, (void *)flag);
+ th->th.th_sleep_loc_type = flag->get_type();
__kmp_unlock_suspend_mx(th);
KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid));
#if KMP_HAVE_UMWAIT
@@ -574,6 +770,7 @@ static inline void __kmp_mwait_template(int th_gtid, C *flag) {
if (flag->is_sleeping())
flag->unset_sleeping();
TCW_PTR(th->th.th_sleep_loc, NULL);
+ th->th.th_sleep_loc_type = flag_unset;
}
// Mark thread as active again
th->th.th_active = TRUE;
@@ -624,251 +821,15 @@ template <class C> static inline void __kmp_release_template(C *flag) {
}
}
-template <typename FlagType> struct flag_traits {};
-
-template <> struct flag_traits<kmp_uint32> {
- typedef kmp_uint32 flag_t;
- static const flag_type t = flag32;
- static inline flag_t tcr(flag_t f) { return TCR_4(f); }
- static inline flag_t test_then_add4(volatile flag_t *f) {
- return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f));
- }
- static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
- return KMP_TEST_THEN_OR32(f, v);
- }
- static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
- return KMP_TEST_THEN_AND32(f, v);
- }
-};
-
-template <> struct flag_traits<kmp_uint64> {
- typedef kmp_uint64 flag_t;
- static const flag_type t = flag64;
- static inline flag_t tcr(flag_t f) { return TCR_8(f); }
- static inline flag_t test_then_add4(volatile flag_t *f) {
- return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f));
- }
- static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
- return KMP_TEST_THEN_OR64(f, v);
- }
- static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
- return KMP_TEST_THEN_AND64(f, v);
- }
-};
-
-// Basic flag that does not use C11 Atomics
-template <typename FlagType, bool Sleepable>
-class kmp_basic_flag_native : public kmp_flag_native<FlagType> {
- typedef flag_traits<FlagType> traits_type;
- FlagType checker; /**< Value to compare flag to to check if flag has been
- released. */
- kmp_info_t
- *waiting_threads[1]; /**< Array of threads sleeping on this thread. */
- kmp_uint32
- num_waiting_threads; /**< Number of threads sleeping on this thread. */
-public:
- kmp_basic_flag_native(volatile FlagType *p)
- : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
- kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr)
- : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(1) {
- waiting_threads[0] = thr;
- }
- kmp_basic_flag_native(volatile FlagType *p, FlagType c)
- : kmp_flag_native<FlagType>(p, traits_type::t), checker(c),
- num_waiting_threads(0) {}
- /*!
- * param i in index into waiting_threads
- * @result the thread that is waiting at index i
- */
- kmp_info_t *get_waiter(kmp_uint32 i) {
- KMP_DEBUG_ASSERT(i < num_waiting_threads);
- return waiting_threads[i];
- }
- /*!
- * @result num_waiting_threads
- */
- kmp_uint32 get_num_waiters() { return num_waiting_threads; }
- /*!
- * @param thr in the thread which is now waiting
- *
- * Insert a waiting thread at index 0.
- */
- void set_waiter(kmp_info_t *thr) {
- waiting_threads[0] = thr;
- num_waiting_threads = 1;
- }
- /*!
- * @result true if the flag object has been released.
- */
- bool done_check() {
- if (Sleepable)
- return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) ==
- checker;
- else
- return traits_type::tcr(*(this->get())) == checker;
- }
- /*!
- * @param old_loc in old value of flag
- * @result true if the flag's old value indicates it was released.
- */
- bool done_check_val(FlagType old_loc) { return old_loc == checker; }
- /*!
- * @result true if the flag object is not yet released.
- * Used in __kmp_wait_template like:
- * @code
- * while (flag.notdone_check()) { pause(); }
- * @endcode
- */
- bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; }
- /*!
- * @result Actual flag value before release was applied.
- * Trigger all waiting threads to run by modifying flag to release state.
- */
- void internal_release() {
- (void)traits_type::test_then_add4((volatile FlagType *)this->get());
- }
- /*!
- * @result Actual flag value before sleep bit(s) set.
- * Notes that there is at least one thread sleeping on the flag by setting
- * sleep bit(s).
- */
- FlagType set_sleeping() {
- return traits_type::test_then_or((volatile FlagType *)this->get(),
- KMP_BARRIER_SLEEP_STATE);
- }
- /*!
- * @result Actual flag value before sleep bit(s) cleared.
- * Notes that there are no longer threads sleeping on the flag by clearing
- * sleep bit(s).
- */
- FlagType unset_sleeping() {
- return traits_type::test_then_and((volatile FlagType *)this->get(),
- ~KMP_BARRIER_SLEEP_STATE);
- }
- /*!
- * @param old_loc in old value of flag
- * Test whether there are threads sleeping on the flag's old value in old_loc.
- */
- bool is_sleeping_val(FlagType old_loc) {
- return old_loc & KMP_BARRIER_SLEEP_STATE;
- }
- /*!
- * Test whether there are threads sleeping on the flag.
- */
- bool is_sleeping() { return is_sleeping_val(*(this->get())); }
- bool is_any_sleeping() { return is_sleeping_val(*(this->get())); }
- kmp_uint8 *get_stolen() { return NULL; }
- enum barrier_type get_bt() { return bs_last_barrier; }
-};
-
-template <typename FlagType, bool Sleepable>
-class kmp_basic_flag : public kmp_flag<FlagType> {
- typedef flag_traits<FlagType> traits_type;
- FlagType checker; /**< Value to compare flag to to check if flag has been
- released. */
- kmp_info_t
- *waiting_threads[1]; /**< Array of threads sleeping on this thread. */
- kmp_uint32
- num_waiting_threads; /**< Number of threads sleeping on this thread. */
-public:
- kmp_basic_flag(std::atomic<FlagType> *p)
- : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
- kmp_basic_flag(std::atomic<FlagType> *p, kmp_info_t *thr)
- : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) {
- waiting_threads[0] = thr;
- }
- kmp_basic_flag(std::atomic<FlagType> *p, FlagType c)
- : kmp_flag<FlagType>(p, traits_type::t), checker(c),
- num_waiting_threads(0) {}
- /*!
- * param i in index into waiting_threads
- * @result the thread that is waiting at index i
- */
- kmp_info_t *get_waiter(kmp_uint32 i) {
- KMP_DEBUG_ASSERT(i < num_waiting_threads);
- return waiting_threads[i];
- }
- /*!
- * @result num_waiting_threads
- */
- kmp_uint32 get_num_waiters() { return num_waiting_threads; }
- /*!
- * @param thr in the thread which is now waiting
- *
- * Insert a waiting thread at index 0.
- */
- void set_waiter(kmp_info_t *thr) {
- waiting_threads[0] = thr;
- num_waiting_threads = 1;
- }
- /*!
- * @result true if the flag object has been released.
- */
- bool done_check() {
- if (Sleepable)
- return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker;
- else
- return this->load() == checker;
- }
- /*!
- * @param old_loc in old value of flag
- * @result true if the flag's old value indicates it was released.
- */
- bool done_check_val(FlagType old_loc) { return old_loc == checker; }
- /*!
- * @result true if the flag object is not yet released.
- * Used in __kmp_wait_template like:
- * @code
- * while (flag.notdone_check()) { pause(); }
- * @endcode
- */
- bool notdone_check() { return this->load() != checker; }
- /*!
- * @result Actual flag value before release was applied.
- * Trigger all waiting threads to run by modifying flag to release state.
- */
- void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); }
- /*!
- * @result Actual flag value before sleep bit(s) set.
- * Notes that there is at least one thread sleeping on the flag by setting
- * sleep bit(s).
- */
- FlagType set_sleeping() {
- return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE);
- }
- /*!
- * @result Actual flag value before sleep bit(s) cleared.
- * Notes that there are no longer threads sleeping on the flag by clearing
- * sleep bit(s).
- */
- FlagType unset_sleeping() {
- return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE);
- }
- /*!
- * @param old_loc in old value of flag
- * Test whether there are threads sleeping on the flag's old value in old_loc.
- */
- bool is_sleeping_val(FlagType old_loc) {
- return old_loc & KMP_BARRIER_SLEEP_STATE;
- }
- /*!
- * Test whether there are threads sleeping on the flag.
- */
- bool is_sleeping() { return is_sleeping_val(this->load()); }
- bool is_any_sleeping() { return is_sleeping_val(this->load()); }
- kmp_uint8 *get_stolen() { return NULL; }
- enum barrier_type get_bt() { return bs_last_barrier; }
-};
-
template <bool Cancellable, bool Sleepable>
-class kmp_flag_32 : public kmp_basic_flag<kmp_uint32, Sleepable> {
+class kmp_flag_32 : public kmp_flag_atomic<kmp_uint32, flag32, Sleepable> {
public:
kmp_flag_32(std::atomic<kmp_uint32> *p)
- : kmp_basic_flag<kmp_uint32, Sleepable>(p) {}
+ : kmp_flag_atomic<kmp_uint32, flag32, Sleepable>(p) {}
kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_info_t *thr)
- : kmp_basic_flag<kmp_uint32, Sleepable>(p, thr) {}
+ : kmp_flag_atomic<kmp_uint32, flag32, Sleepable>(p, thr) {}
kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_uint32 c)
- : kmp_basic_flag<kmp_uint32, Sleepable>(p, c) {}
+ : kmp_flag_atomic<kmp_uint32, flag32, Sleepable>(p, c) {}
void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); }
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); }
@@ -895,14 +856,16 @@ public:
};
template <bool Cancellable, bool Sleepable>
-class kmp_flag_64 : public kmp_basic_flag_native<kmp_uint64, Sleepable> {
+class kmp_flag_64 : public kmp_flag_native<kmp_uint64, flag64, Sleepable> {
public:
kmp_flag_64(volatile kmp_uint64 *p)
- : kmp_basic_flag_native<kmp_uint64, Sleepable>(p) {}
+ : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p) {}
kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr)
- : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, thr) {}
+ : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p, thr) {}
kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c)
- : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, c) {}
+ : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p, c) {}
+ kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c, std::atomic<bool> *loc)
+ : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p, c, loc) {}
void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); }
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); }
@@ -928,20 +891,52 @@ public:
flag_type get_ptr_type() { return flag64; }
};
+template <bool Cancellable, bool Sleepable>
+class kmp_atomic_flag_64
+ : public kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable> {
+public:
+ kmp_atomic_flag_64(std::atomic<kmp_uint64> *p)
+ : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p) {}
+ kmp_atomic_flag_64(std::atomic<kmp_uint64> *p, kmp_info_t *thr)
+ : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p, thr) {}
+ kmp_atomic_flag_64(std::atomic<kmp_uint64> *p, kmp_uint64 c)
+ : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p, c) {}
+ kmp_atomic_flag_64(std::atomic<kmp_uint64> *p, kmp_uint64 c,
+ std::atomic<bool> *loc)
+ : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p, c, loc) {}
+ void suspend(int th_gtid) { __kmp_atomic_suspend_64(th_gtid, this); }
+ void mwait(int th_gtid) { __kmp_atomic_mwait_64(th_gtid, this); }
+ void resume(int th_gtid) { __kmp_atomic_resume_64(th_gtid, this); }
+ int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
+ int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+ kmp_int32 is_constrained) {
+ return __kmp_atomic_execute_tasks_64(
+ this_thr, gtid, this, final_spin,
+ thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+ }
+ bool wait(kmp_info_t *this_thr,
+ int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+ if (final_spin)
+ return __kmp_wait_template<kmp_atomic_flag_64, TRUE, Cancellable,
+ Sleepable>(
+ this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
+ else
+ return __kmp_wait_template<kmp_atomic_flag_64, FALSE, Cancellable,
+ Sleepable>(
+ this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
+ }
+ void release() { __kmp_release_template(this); }
+ flag_type get_ptr_type() { return atomic_flag64; }
+};
+
// Hierarchical 64-bit on-core barrier instantiation
-class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> {
- kmp_uint64 checker;
- kmp_info_t *waiting_threads[1];
- kmp_uint32 num_waiting_threads;
- kmp_uint32
- offset; /**< Portion of flag that is of interest for an operation. */
+class kmp_flag_oncore : public kmp_flag_native<kmp_uint64, flag_oncore, false> {
+ kmp_uint32 offset; /**< Portion of flag of interest for an operation. */
bool flag_switch; /**< Indicates a switch in flag location. */
enum barrier_type bt; /**< Barrier type. */
- kmp_info_t *this_thr; /**< Thread that may be redirected to different flag
- location. */
+ kmp_info_t *this_thr; /**< Thread to redirect to different flag location. */
#if USE_ITT_BUILD
- void *
- itt_sync_obj; /**< ITT object that must be passed to new flag location. */
+ void *itt_sync_obj; /**< ITT object to pass to new flag location. */
#endif
unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) {
return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[offset];
@@ -949,31 +944,26 @@ class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> {
public:
kmp_flag_oncore(volatile kmp_uint64 *p)
- : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
- flag_switch(false) {}
+ : kmp_flag_native<kmp_uint64, flag_oncore, false>(p), flag_switch(false) {
+ }
kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx)
- : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
- offset(idx), flag_switch(false) {}
+ : kmp_flag_native<kmp_uint64, flag_oncore, false>(p), offset(idx),
+ flag_switch(false),
+ bt(bs_last_barrier) USE_ITT_BUILD_ARG(itt_sync_obj(nullptr)) {}
kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx,
enum barrier_type bar_t,
kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt))
- : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(c),
- num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t),
+ : kmp_flag_native<kmp_uint64, flag_oncore, false>(p, c), offset(idx),
+ flag_switch(false), bt(bar_t),
this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {}
- kmp_info_t *get_waiter(kmp_uint32 i) {
- KMP_DEBUG_ASSERT(i < num_waiting_threads);
- return waiting_threads[i];
- }
- kmp_uint32 get_num_waiters() { return num_waiting_threads; }
- void set_waiter(kmp_info_t *thr) {
- waiting_threads[0] = thr;
- num_waiting_threads = 1;
- }
- bool done_check_val(kmp_uint64 old_loc) {
+ virtual ~kmp_flag_oncore() override {}
+ void *operator new(size_t size) { return __kmp_allocate(size); }
+ void operator delete(void *p) { __kmp_free(p); }
+ bool done_check_val(kmp_uint64 old_loc) override {
return byteref(&old_loc, offset) == checker;
}
- bool done_check() { return done_check_val(*get()); }
- bool notdone_check() {
+ bool done_check() override { return done_check_val(*get()); }
+ bool notdone_check() override {
// Calculate flag_switch
if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG)
flag_switch = true;
@@ -997,17 +987,6 @@ public:
KMP_TEST_THEN_OR64(get(), mask);
}
}
- kmp_uint64 set_sleeping() {
- return KMP_TEST_THEN_OR64(get(), KMP_BARRIER_SLEEP_STATE);
- }
- kmp_uint64 unset_sleeping() {
- return KMP_TEST_THEN_AND64(get(), ~KMP_BARRIER_SLEEP_STATE);
- }
- bool is_sleeping_val(kmp_uint64 old_loc) {
- return old_loc & KMP_BARRIER_SLEEP_STATE;
- }
- bool is_sleeping() { return is_sleeping_val(*get()); }
- bool is_any_sleeping() { return is_sleeping_val(*get()); }
void wait(kmp_info_t *this_thr, int final_spin) {
if (final_spin)
__kmp_wait_template<kmp_flag_oncore, TRUE>(
@@ -1038,27 +1017,39 @@ public:
thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
#endif
}
- kmp_uint8 *get_stolen() { return NULL; }
enum barrier_type get_bt() { return bt; }
flag_type get_ptr_type() { return flag_oncore; }
};
-// Used to wake up threads, volatile void* flag is usually the th_sleep_loc
-// associated with int gtid.
-static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
+static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) {
+ int gtid = __kmp_gtid_from_thread(thr);
+ void *flag = CCAST(void *, thr->th.th_sleep_loc);
+ flag_type type = thr->th.th_sleep_loc_type;
if (!flag)
return;
-
- switch (RCAST(kmp_flag_64<> *, CCAST(void *, flag))->get_type()) {
+ // Attempt to wake up a thread: examine its type and call appropriate template
+ switch (type) {
case flag32:
- __kmp_resume_32(gtid, (kmp_flag_32<> *)NULL);
+ __kmp_resume_32(gtid, RCAST(kmp_flag_32<> *, flag));
break;
case flag64:
- __kmp_resume_64(gtid, (kmp_flag_64<> *)NULL);
+ __kmp_resume_64(gtid, RCAST(kmp_flag_64<> *, flag));
+ break;
+ case atomic_flag64:
+ __kmp_atomic_resume_64(gtid, RCAST(kmp_atomic_flag_64<> *, flag));
break;
case flag_oncore:
- __kmp_resume_oncore(gtid, (kmp_flag_oncore *)NULL);
+ __kmp_resume_oncore(gtid, RCAST(kmp_flag_oncore *, flag));
+ break;
+#ifdef KMP_DEBUG
+ case flag_unset:
+ KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d is unset\n", type));
break;
+ default:
+ KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d does not match any "
+ "known flag type\n",
+ type));
+#endif
}
}
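
The rewritten header selects the flag's storage type and primitives through flag_traits keyed on the flag_type enum, and the concrete flag classes inherit from kmp_flag_native or kmp_flag_atomic accordingly. A heavily reduced sketch of that traits-by-enum pattern, with hypothetical names and only an acquire load standing in for the real atomic helpers:

#include <atomic>
#include <cstdint>

enum flag_kind { kind32, kind64 }; // hypothetical, mirrors flag32 / flag64

template <flag_kind K> struct flag_traits;

template <> struct flag_traits<kind32> {
  typedef std::uint32_t flag_t;
  static flag_t load(const std::atomic<flag_t> *f) {
    return f->load(std::memory_order_acquire);
  }
};

template <> struct flag_traits<kind64> {
  typedef std::uint64_t flag_t;
  static flag_t load(const std::atomic<flag_t> *f) {
    return f->load(std::memory_order_acquire);
  }
};

// The flag class picks its storage width and helpers from the traits.
template <flag_kind K> class flag {
  typedef typename flag_traits<K>::flag_t flag_t;
  std::atomic<flag_t> loc{0};
  flag_t checker;

public:
  explicit flag(flag_t c) : checker(c) {}
  bool done_check() const { return flag_traits<K>::load(&loc) == checker; }
};
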
diff --git a/contrib/libs/cxxsupp/openmp/omp.h b/contrib/libs/cxxsupp/openmp/omp.h
index cb2fe49599..2ddf4f630b 100644
--- a/contrib/libs/cxxsupp/openmp/omp.h
+++ b/contrib/libs/cxxsupp/openmp/omp.h
@@ -437,14 +437,23 @@
extern omp_allocator_handle_t __KAI_KMPC_CONVENTION omp_get_default_allocator(void);
# ifdef __cplusplus
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a = omp_null_allocator);
- extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t a = omp_null_allocator);
+ extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
+ omp_allocator_handle_t a = omp_null_allocator);
+ extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size,
+ omp_allocator_handle_t a = omp_null_allocator);
+ extern void *__KAI_KMPC_CONVENTION omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
+ omp_allocator_handle_t a = omp_null_allocator);
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size,
omp_allocator_handle_t allocator = omp_null_allocator,
omp_allocator_handle_t free_allocator = omp_null_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
# else
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
+ extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
+ omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t a);
+ extern void *__KAI_KMPC_CONVENTION omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
+ omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
omp_allocator_handle_t free_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
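
The omp.h hunk adds the OpenMP 5.1 aligned allocation entry points next to the existing ones. A small usage sketch against the C++ declarations above, assuming a runtime that ships these routines; buffer size and alignment are arbitrary:

#include <omp.h>
#include <cstdint>
#include <cstdio>

int main() {
  // 64-byte aligned buffer of 1024 doubles from the default allocator.
  double *buf =
      static_cast<double *>(omp_aligned_alloc(64, 1024 * sizeof(double)));
  if (buf == nullptr)
    return 1;
  buf[0] = 1.0;
  bool aligned = reinterpret_cast<std::uintptr_t>(buf) % 64 == 0;
  std::printf("64-byte aligned: %d\n", aligned ? 1 : 0);
  omp_free(buf); // freed through the same (default) allocator
  return 0;
}
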
diff --git a/contrib/libs/cxxsupp/openmp/ompt-general.cpp b/contrib/libs/cxxsupp/openmp/ompt-general.cpp
index 3d8ef041f7..c1468c0c32 100644
--- a/contrib/libs/cxxsupp/openmp/ompt-general.cpp
+++ b/contrib/libs/cxxsupp/openmp/ompt-general.cpp
@@ -295,9 +295,16 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) {
OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success. \n");
OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... ",
fname);
+ dlerror(); // Clear any existing error
start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool");
if (!start_tool) {
- OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", dlerror());
+ char *error = dlerror();
+ if (error != NULL) {
+ OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", error);
+ } else {
+ OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n",
+ "ompt_start_tool = NULL");
+ }
} else
#elif KMP_OS_WINDOWS
OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname);
diff --git a/contrib/libs/cxxsupp/openmp/ompt-specific.cpp b/contrib/libs/cxxsupp/openmp/ompt-specific.cpp
index 1ad0e17ed4..c28b9bd1a6 100644
--- a/contrib/libs/cxxsupp/openmp/ompt-specific.cpp
+++ b/contrib/libs/cxxsupp/openmp/ompt-specific.cpp
@@ -283,10 +283,6 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
link_lwt->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
*OMPT_CUR_TEAM_INFO(thr) = tmp_team;
- ompt_task_info_t tmp_task = lwt->ompt_task_info;
- link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
- *OMPT_CUR_TASK_INFO(thr) = tmp_task;
-
// link the taskteam into the list of taskteams:
ompt_lw_taskteam_t *my_parent =
thr->th.th_team->t.ompt_serialized_team_info;
@@ -297,6 +293,10 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
ompd_bp_parallel_begin();
}
#endif
+
+ ompt_task_info_t tmp_task = lwt->ompt_task_info;
+ link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
+ *OMPT_CUR_TASK_INFO(thr) = tmp_task;
} else {
// this is the first serialized team, so we just store the values in the
// team and drop the taskteam-object
@@ -313,6 +313,9 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
void __ompt_lw_taskteam_unlink(kmp_info_t *thr) {
ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info;
if (lwtask) {
+ ompt_task_info_t tmp_task = lwtask->ompt_task_info;
+ lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
+ *OMPT_CUR_TASK_INFO(thr) = tmp_task;
#if OMPD_SUPPORT
if (ompd_state & OMPD_ENABLE_BP) {
ompd_bp_parallel_end();
@@ -324,10 +327,6 @@ void __ompt_lw_taskteam_unlink(kmp_info_t *thr) {
lwtask->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
*OMPT_CUR_TEAM_INFO(thr) = tmp_team;
- ompt_task_info_t tmp_task = lwtask->ompt_task_info;
- lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
- *OMPT_CUR_TASK_INFO(thr) = tmp_task;
-
if (lwtask->heap) {
__kmp_free(lwtask);
lwtask = NULL;
@@ -365,13 +364,9 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
if (team == NULL)
return 0;
ompt_lw_taskteam_t *lwt = NULL,
- *next_lwt = LWT_FROM_TEAM(taskdata->td_team),
- *prev_lwt = NULL;
+ *next_lwt = LWT_FROM_TEAM(taskdata->td_team);
while (ancestor_level > 0) {
- // needed for thread_num
- prev_team = team;
- prev_lwt = lwt;
// next lightweight team (if any)
if (lwt)
lwt = lwt->parent;
@@ -390,6 +385,7 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
taskdata = taskdata->td_parent;
if (team == NULL)
return 0;
+ prev_team = team;
team = team->t.t_parent;
if (taskdata) {
next_lwt = LWT_FROM_TEAM(taskdata->td_team);
@@ -431,9 +427,18 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
if (thread_num) {
if (level == 0)
*thread_num = __kmp_get_tid();
- else if (prev_lwt)
+ else if (lwt)
*thread_num = 0;
- else
+ else if (!prev_team) {
+ // The innermost parallel region contains at least one explicit task.
+ // The task at level > 0 is either an implicit task that
+ // corresponds to the mentioned region or one of the explicit tasks
+ // nested inside the same region. Note that the task isn't the
+        // innermost explicit task (because of the condition level > 0).
+ // Since the task at this level still belongs to the innermost parallel
+ // region, thread_num is determined the same way as for level==0.
+ *thread_num = __kmp_get_tid();
+ } else
*thread_num = prev_team->t.t_master_tid;
// *thread_num = team->t.t_master_tid;
}
diff --git a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp
index 42ad1d56f9..5cd6ad6a03 100644
--- a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp
+++ b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp
@@ -1051,6 +1051,8 @@ void __kmp_reap_worker(kmp_info_t *th) {
"exit_val = %p\n",
th->th.th_info.ds.ds_gtid, exit_val));
}
+#else
+ (void)status; // unused variable
#endif /* KMP_DEBUG */
KA_TRACE(10, ("__kmp_reap_worker: done reaping T#%d\n",
@@ -1232,7 +1234,7 @@ static void __kmp_atfork_child(void) {
// affinity in the parent
kmp_set_thread_affinity_mask_initial();
#endif
- // Set default not to bind threads tightly in the child (we’re expecting
+ // Set default not to bind threads tightly in the child (we're expecting
// over-subscription after the fork and this can improve things for
// scripting languages that use OpenMP inside process-parallel code).
__kmp_affinity_type = affinity_none;
@@ -1407,9 +1409,13 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) {
/* TODO: shouldn't this use release semantics to ensure that
__kmp_suspend_initialize_thread gets called first? */
old_spin = flag->set_sleeping();
+ TCW_PTR(th->th.th_sleep_loc, (void *)flag);
+ th->th.th_sleep_loc_type = flag->get_type();
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
__kmp_pause_status != kmp_soft_paused) {
flag->unset_sleeping();
+ TCW_PTR(th->th.th_sleep_loc, NULL);
+ th->th.th_sleep_loc_type = flag_unset;
__kmp_unlock_suspend_mx(th);
return;
}
@@ -1417,8 +1423,10 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) {
" was %x\n",
th_gtid, flag->get(), flag->load(), old_spin));
- if (flag->done_check_val(old_spin)) {
- old_spin = flag->unset_sleeping();
+ if (flag->done_check_val(old_spin) || flag->done_check()) {
+ flag->unset_sleeping();
+ TCW_PTR(th->th.th_sleep_loc, NULL);
+ th->th.th_sleep_loc_type = flag_unset;
KF_TRACE(5, ("__kmp_suspend_template: T#%d false alarm, reset sleep bit "
"for spin(%p)\n",
th_gtid, flag->get()));
@@ -1427,7 +1435,6 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) {
"with low probability" return when the condition variable has
not been signaled or broadcast */
int deactivated = FALSE;
- TCW_PTR(th->th.th_sleep_loc, (void *)flag);
while (flag->is_sleeping()) {
#ifdef DEBUG_SUSPEND
@@ -1449,6 +1456,9 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) {
deactivated = TRUE;
}
+ KMP_DEBUG_ASSERT(th->th.th_sleep_loc);
+ KMP_DEBUG_ASSERT(flag->get_type() == th->th.th_sleep_loc_type);
+
#if USE_SUSPEND_TIMEOUT
struct timespec now;
struct timeval tval;
@@ -1478,6 +1488,18 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) {
if ((status != 0) && (status != EINTR) && (status != ETIMEDOUT)) {
KMP_SYSFAIL("pthread_cond_wait", status);
}
+
+ KMP_DEBUG_ASSERT(flag->get_type() == flag->get_ptr_type());
+
+ if (!flag->is_sleeping() &&
+ ((status == EINTR) || (status == ETIMEDOUT))) {
+ // if interrupt or timeout, and thread is no longer sleeping, we need to
+ // make sure sleep_loc gets reset; however, this shouldn't be needed if
+ // we woke up with resume
+ flag->unset_sleeping();
+ TCW_PTR(th->th.th_sleep_loc, NULL);
+ th->th.th_sleep_loc_type = flag_unset;
+ }
#ifdef KMP_DEBUG
if (status == ETIMEDOUT) {
if (flag->is_sleeping()) {
@@ -1487,6 +1509,8 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) {
KF_TRACE(2, ("__kmp_suspend_template: T#%d timeout wakeup, sleep bit "
"not set!\n",
th_gtid));
+ TCW_PTR(th->th.th_sleep_loc, NULL);
+ th->th.th_sleep_loc_type = flag_unset;
}
} else if (flag->is_sleeping()) {
KF_TRACE(100,
@@ -1504,6 +1528,13 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) {
}
}
}
+ // We may have had the loop variable set before entering the loop body;
+ // so we need to reset sleep_loc.
+ TCW_PTR(th->th.th_sleep_loc, NULL);
+ th->th.th_sleep_loc_type = flag_unset;
+
+ KMP_DEBUG_ASSERT(!flag->is_sleeping());
+ KMP_DEBUG_ASSERT(!th->th.th_sleep_loc);
#ifdef DEBUG_SUSPEND
{
char buffer[128];
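
Taken together, the suspend hunks above make the sleep-location bookkeeping symmetric: th_sleep_loc and th_sleep_loc_type are published before the wait and cleared on every exit path. A compressed, standalone sketch of that shape, using std::condition_variable instead of the runtime's own primitives and entirely invented names:

#include <atomic>
#include <condition_variable>
#include <mutex>

struct SleeperState {
  std::mutex mx;
  std::condition_variable cv;
  std::atomic<void *> sleep_loc{nullptr}; // "where am I sleeping", cf. th_sleep_loc
};

// Park the calling thread on 'flag' until another thread clears it.
static void suspend_on(SleeperState &st, std::atomic<bool> &flag) {
  std::unique_lock<std::mutex> lk(st.mx);
  st.sleep_loc.store(&flag);   // advertise the location before sleeping
  flag.store(true);            // set the "sleeping" bit
  while (flag.load())
    st.cv.wait(lk);            // spurious wakeups loop back and re-check the bit
  st.sleep_loc.store(nullptr); // cleared on every exit path, as in the patch
}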
@@ -1525,6 +1556,10 @@ template <bool C, bool S>
void __kmp_suspend_64(int th_gtid, kmp_flag_64<C, S> *flag) {
__kmp_suspend_template(th_gtid, flag);
}
+template <bool C, bool S>
+void __kmp_atomic_suspend_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag) {
+ __kmp_suspend_template(th_gtid, flag);
+}
void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) {
__kmp_suspend_template(th_gtid, flag);
}
@@ -1532,6 +1567,10 @@ void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) {
template void __kmp_suspend_32<false, false>(int, kmp_flag_32<false, false> *);
template void __kmp_suspend_64<false, true>(int, kmp_flag_64<false, true> *);
template void __kmp_suspend_64<true, false>(int, kmp_flag_64<true, false> *);
+template void
+__kmp_atomic_suspend_64<false, true>(int, kmp_atomic_flag_64<false, true> *);
+template void
+__kmp_atomic_suspend_64<true, false>(int, kmp_atomic_flag_64<true, false> *);
/* This routine signals the thread specified by target_gtid to wake up
after setting the sleep bit indicated by the flag argument to FALSE.
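
The new __kmp_atomic_suspend_64 lines follow the explicit-instantiation pattern already used for __kmp_suspend_32/64: the template body stays in this translation unit and only the <C, S> combinations that other files link against are emitted. A tiny self-contained illustration with invented names:

template <bool C, bool S> struct flag64 {};

template <bool C, bool S>
void suspend64(int gtid, flag64<C, S> *f) {
  (void)gtid;
  (void)f; // body compiled once per instantiated <C, S> pair
}

// Emit exactly the combinations callers elsewhere need.
template void suspend64<false, true>(int, flag64<false, true> *);
template void suspend64<true, false>(int, flag64<true, false> *);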
@@ -1554,36 +1593,50 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) {
__kmp_lock_suspend_mx(th);
- if (!flag) { // coming from __kmp_null_resume_wrapper
+ if (!flag || flag != th->th.th_sleep_loc) {
+ // coming from __kmp_null_resume_wrapper, or thread is now sleeping on a
+ // different location; wake up at new location
flag = (C *)CCAST(void *, th->th.th_sleep_loc);
}
// First, check if the flag is null or its type has changed. If so, someone
// else woke it up.
- if (!flag || flag->get_type() != flag->get_ptr_type()) { // get_ptr_type
- // simply shows what flag was cast to
+ if (!flag) { // Thread doesn't appear to be sleeping on anything
KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
"awake: flag(%p)\n",
- gtid, target_gtid, NULL));
+ gtid, target_gtid, (void *)NULL));
__kmp_unlock_suspend_mx(th);
return;
+ } else if (flag->get_type() != th->th.th_sleep_loc_type) {
+ // Flag type does not appear to match this function template; possibly the
+ // thread is sleeping on something else. Try null resume again.
+ KF_TRACE(
+ 5,
+ ("__kmp_resume_template: T#%d retrying, thread T#%d Mismatch flag(%p), "
+ "spin(%p) type=%d ptr_type=%d\n",
+ gtid, target_gtid, flag, flag->get(), flag->get_type(),
+ th->th.th_sleep_loc_type));
+ __kmp_unlock_suspend_mx(th);
+ __kmp_null_resume_wrapper(th);
+ return;
} else { // if multiple threads are sleeping, flag should be internally
// referring to a specific thread here
- typename C::flag_t old_spin = flag->unset_sleeping();
- if (!flag->is_sleeping_val(old_spin)) {
+ if (!flag->is_sleeping()) {
KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
- "awake: flag(%p): "
- "%u => %u\n",
- gtid, target_gtid, flag->get(), old_spin, flag->load()));
+ "awake: flag(%p): %u\n",
+ gtid, target_gtid, flag->get(), (unsigned int)flag->load()));
__kmp_unlock_suspend_mx(th);
return;
}
- KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset "
- "sleep bit for flag's loc(%p): "
- "%u => %u\n",
- gtid, target_gtid, flag->get(), old_spin, flag->load()));
}
+ KMP_DEBUG_ASSERT(flag);
+ flag->unset_sleeping();
TCW_PTR(th->th.th_sleep_loc, NULL);
+ th->th.th_sleep_loc_type = flag_unset;
+
+ KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset "
+ "sleep bit for flag's loc(%p): %u\n",
+ gtid, target_gtid, flag->get(), (unsigned int)flag->load()));
#ifdef DEBUG_SUSPEND
{
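
The resume side mirrors the suspend sketch given earlier: under the same mutex it re-reads the advertised location, bails out if the target is already awake, and only then clears the sleep bit and signals. Continuing the invented SleeperState example:

// Wake a thread parked by suspend_on(); 'expected' is what the caller thinks
// the target sleeps on and may be stale, as in the flag != th_sleep_loc case.
static void resume_target(SleeperState &st, std::atomic<bool> *expected) {
  std::lock_guard<std::mutex> lk(st.mx);
  auto *loc = static_cast<std::atomic<bool> *>(st.sleep_loc.load());
  if (!loc)
    return; // target already woke up and cleared its location
  if (expected && expected != loc) {
    // Mismatch: fall back to the location the sleeper actually advertised.
  }
  loc->store(false);           // clear the sleep bit ...
  st.sleep_loc.store(nullptr); // ... reset the bookkeeping ...
  st.cv.notify_one();          // ... then signal while still holding the mutex
}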
@@ -1609,12 +1662,19 @@ template <bool C, bool S>
void __kmp_resume_64(int target_gtid, kmp_flag_64<C, S> *flag) {
__kmp_resume_template(target_gtid, flag);
}
+template <bool C, bool S>
+void __kmp_atomic_resume_64(int target_gtid, kmp_atomic_flag_64<C, S> *flag) {
+ __kmp_resume_template(target_gtid, flag);
+}
void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) {
__kmp_resume_template(target_gtid, flag);
}
template void __kmp_resume_32<false, true>(int, kmp_flag_32<false, true> *);
+template void __kmp_resume_32<false, false>(int, kmp_flag_32<false, false> *);
template void __kmp_resume_64<false, true>(int, kmp_flag_64<false, true> *);
+template void
+__kmp_atomic_resume_64<false, true>(int, kmp_atomic_flag_64<false, true> *);
#if KMP_USE_MONITOR
void __kmp_resume_monitor() {
@@ -1741,8 +1801,12 @@ static int __kmp_get_xproc(void) {
int r = 0;
-#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_HURD
+#if KMP_OS_LINUX
+
+ __kmp_type_convert(sysconf(_SC_NPROCESSORS_CONF), &(r));
+
+#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || \
+ KMP_OS_HURD
__kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r));
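
On Linux this hunk switches the processor count from _SC_NPROCESSORS_ONLN (CPUs currently online) to _SC_NPROCESSORS_CONF (CPUs configured on the system, including any currently offline). A minimal standalone probe showing both values, not taken from the patch:

#include <cstdio>
#include <unistd.h>

int main() {
  long configured = sysconf(_SC_NPROCESSORS_CONF); // all CPUs the system knows about
  long online = sysconf(_SC_NPROCESSORS_ONLN);     // CPUs currently online
  std::printf("configured=%ld online=%ld\n", configured, online);
  return 0;
}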