author | thegeorg <thegeorg@yandex-team.ru> | 2022-06-03 10:53:07 +0300
---|---|---
committer | thegeorg <thegeorg@yandex-team.ru> | 2022-06-03 10:53:07 +0300
commit | a1d4361e379e2c72a469ad1bd64569cbc2db131f (patch) |
tree | 0caddb240a10132376e4653a31578e117d33f9fd |
parent | 41f55a521834080d9d703c099c0418cfff3a0546 (diff) |
download | ydb-a1d4361e379e2c72a469ad1bd64569cbc2db131f.tar.gz |
Update contrib/libs/cxxsupp/openmp to 14.0.4
ref:77c6cdda99b217d50c4deadca11f5611fa0dc168
40 files changed, 3980 insertions, 998 deletions
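This revision brings the vendored OpenMP runtime up to the 14.0.4 sources; as the `kmp.h` hunk below shows, it adds, among other things, the distributed barrier, hybrid-CPU topology attributes, and new allocator entry points (`__kmpc_aligned_alloc` plus the internal `__kmp_alloc`/`__kmp_calloc`/`__kmp_realloc`). As a hedged illustration only — not part of this commit — the sketch below uses the standard OpenMP 5.1 user-level calls (`omp_aligned_alloc`/`omp_free`) that this allocator machinery is meant to serve; the alignment, size, and allocator handle are arbitrary example values.

```c
/* Sketch: OpenMP 5.1 aligned allocation against the default memory allocator.
 * Illustrative only; values and variable names are not taken from this commit. */
#include <omp.h>
#include <stdio.h>

int main(void) {
  /* Request a 64-byte-aligned buffer (typical cache-line size on x86_64). */
  double *buf = (double *)omp_aligned_alloc(64, 1024 * sizeof(double),
                                            omp_default_mem_alloc);
  if (!buf)
    return 1;

#pragma omp parallel for
  for (int i = 0; i < 1024; ++i)
    buf[i] = i * 0.5;

  printf("buf[42] = %f\n", buf[42]);
  omp_free(buf, omp_default_mem_alloc);
  return 0;
}
```

Built with `-fopenmp` and linked against this libomp, the allocation request is routed through the runtime's allocator implementation rather than plain `malloc`, which is what the new `__kmp_alloc`-family internals declared in this update implement.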
diff --git a/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report b/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report index 7e4845f2b7..7fc086467b 100644 --- a/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report +++ b/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report @@ -126,6 +126,7 @@ BELONGS ya.make kmp_atomic.cpp [7:8] kmp_atomic.h [7:8] kmp_barrier.cpp [7:8] + kmp_barrier.h [7:8] kmp_cancel.cpp [4:5] kmp_config.h [6:7] kmp_csupport.cpp [7:8] @@ -193,6 +194,7 @@ BELONGS ya.make kmp_atomic.cpp [7:8] kmp_atomic.h [7:8] kmp_barrier.cpp [7:8] + kmp_barrier.h [7:8] kmp_cancel.cpp [4:5] kmp_config.h [6:7] kmp_csupport.cpp [7:8] @@ -326,6 +328,7 @@ BELONGS ya.make kmp_atomic.cpp [9:9] kmp_atomic.h [9:9] kmp_barrier.cpp [9:9] + kmp_barrier.h [9:9] kmp_cancel.cpp [6:6] kmp_config.h [8:8] kmp_csupport.cpp [9:9] @@ -393,6 +396,7 @@ BELONGS ya.make kmp_atomic.cpp [9:9] kmp_atomic.h [9:9] kmp_barrier.cpp [9:9] + kmp_barrier.h [9:9] kmp_cancel.cpp [6:6] kmp_config.h [8:8] kmp_csupport.cpp [9:9] diff --git a/contrib/libs/cxxsupp/openmp/exports_so.txt b/contrib/libs/cxxsupp/openmp/exports_so.txt index cb79ae72e6..ac188af310 100644 --- a/contrib/libs/cxxsupp/openmp/exports_so.txt +++ b/contrib/libs/cxxsupp/openmp/exports_so.txt @@ -120,5 +120,7 @@ GOMP_4.5 { } GOMP_4.0; GOMP_5.0 { } GOMP_4.5; +GOMP_5.0.1 { +} GOMP_5.0; # end of file # diff --git a/contrib/libs/cxxsupp/openmp/kmp.h b/contrib/libs/cxxsupp/openmp/kmp.h index 0652080277..9502167474 100644 --- a/contrib/libs/cxxsupp/openmp/kmp.h +++ b/contrib/libs/cxxsupp/openmp/kmp.h @@ -115,6 +115,7 @@ typedef unsigned int kmp_hwloc_depth_t; #include "kmp_debug.h" #include "kmp_lock.h" #include "kmp_version.h" +#include "kmp_barrier.h" #if USE_DEBUGGER #error #include "kmp_debugger.h" #endif @@ -263,6 +264,7 @@ typedef union kmp_root kmp_root_p; template <bool C = false, bool S = true> class kmp_flag_32; template <bool C = false, bool S = true> class kmp_flag_64; +template <bool C = false, bool S = true> class kmp_atomic_flag_64; class kmp_flag_oncore; #ifdef __cplusplus @@ -616,6 +618,19 @@ enum kmp_hw_t : int { KMP_HW_LAST }; +typedef enum kmp_hw_core_type_t { + KMP_HW_CORE_TYPE_UNKNOWN = 0x0, +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + KMP_HW_CORE_TYPE_ATOM = 0x20, + KMP_HW_CORE_TYPE_CORE = 0x40, + KMP_HW_MAX_NUM_CORE_TYPES = 3, +#else + KMP_HW_MAX_NUM_CORE_TYPES = 1, +#endif +} kmp_hw_core_type_t; + +#define KMP_HW_MAX_NUM_CORE_EFFS 8 + #define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type) \ KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST) #define KMP_ASSERT_VALID_HW_TYPE(type) \ @@ -627,6 +642,7 @@ enum kmp_hw_t : int { const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false); const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false); +const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type); /* Only Linux* OS and Windows* OS support thread affinity. 
*/ #if KMP_AFFINITY_SUPPORTED @@ -847,6 +863,7 @@ typedef struct kmp_nested_proc_bind_t { } kmp_nested_proc_bind_t; extern kmp_nested_proc_bind_t __kmp_nested_proc_bind; +extern kmp_proc_bind_t __kmp_teams_proc_bind; extern int __kmp_display_affinity; extern char *__kmp_affinity_format; @@ -987,7 +1004,7 @@ typedef omp_memspace_handle_t kmp_memspace_t; // placeholder typedef struct kmp_allocator_t { omp_memspace_handle_t memspace; void **memkind; // pointer to memkind - int alignment; + size_t alignment; omp_alloctrait_value_t fb; kmp_allocator_t *fb_data; kmp_uint64 pool_size; @@ -1001,13 +1018,25 @@ extern omp_allocator_handle_t __kmpc_init_allocator(int gtid, extern void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t al); extern void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t al); extern omp_allocator_handle_t __kmpc_get_default_allocator(int gtid); +// external interfaces, may be used by compiler extern void *__kmpc_alloc(int gtid, size_t sz, omp_allocator_handle_t al); +extern void *__kmpc_aligned_alloc(int gtid, size_t align, size_t sz, + omp_allocator_handle_t al); extern void *__kmpc_calloc(int gtid, size_t nmemb, size_t sz, omp_allocator_handle_t al); extern void *__kmpc_realloc(int gtid, void *ptr, size_t sz, omp_allocator_handle_t al, omp_allocator_handle_t free_al); extern void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al); +// internal interfaces, contain real implementation +extern void *__kmp_alloc(int gtid, size_t align, size_t sz, + omp_allocator_handle_t al); +extern void *__kmp_calloc(int gtid, size_t align, size_t nmemb, size_t sz, + omp_allocator_handle_t al); +extern void *__kmp_realloc(int gtid, void *ptr, size_t sz, + omp_allocator_handle_t al, + omp_allocator_handle_t free_al); +extern void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al); extern void __kmp_init_memkind(); extern void __kmp_fini_memkind(); @@ -1066,7 +1095,9 @@ extern void __kmp_init_target_mem(); #define KMP_MIN_BLOCKTIME (0) #define KMP_MAX_BLOCKTIME \ (INT_MAX) /* Must be this for "infinite" setting the work */ -#define KMP_DEFAULT_BLOCKTIME (200) /* __kmp_blocktime is in milliseconds */ + +/* __kmp_blocktime is in milliseconds */ +#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200)) #if KMP_USE_MONITOR #define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(64 * 1024)) @@ -1204,6 +1235,13 @@ typedef struct kmp_cpuid { kmp_uint32 edx; } kmp_cpuid_t; +typedef struct kmp_cpuinfo_flags_t { + unsigned sse2 : 1; // 0 if SSE2 instructions are not supported, 1 otherwise. + unsigned rtm : 1; // 0 if RTM instructions are not supported, 1 otherwise. + unsigned hybrid : 1; + unsigned reserved : 29; // Ensure size of 32 bits +} kmp_cpuinfo_flags_t; + typedef struct kmp_cpuinfo { int initialized; // If 0, other fields are not initialized. int signature; // CPUID(1).EAX @@ -1211,8 +1249,7 @@ typedef struct kmp_cpuinfo { int model; // ( CPUID(1).EAX[19:16] << 4 ) + CPUID(1).EAX[7:4] ( ( Extended // Model << 4 ) + Model) int stepping; // CPUID(1).EAX[3:0] ( Stepping ) - int sse2; // 0 if SSE2 instructions are not supported, 1 otherwise. - int rtm; // 0 if RTM instructions are not supported, 1 otherwise. 
+ kmp_cpuinfo_flags_t flags; int apic_id; int physical_id; int logical_id; @@ -1278,6 +1315,82 @@ static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); } #define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */ +// User-level Monitor/Mwait +#if KMP_HAVE_UMWAIT +// We always try for UMWAIT first +#if KMP_HAVE_WAITPKG_INTRINSICS +#if KMP_HAVE_IMMINTRIN_H +#include <immintrin.h> +#elif KMP_HAVE_INTRIN_H +#include <intrin.h> +#endif +#endif // KMP_HAVE_WAITPKG_INTRINSICS + +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline int __kmp_tpause(uint32_t hint, uint64_t counter) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _tpause(hint, counter); +#endif +} +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline void __kmp_umonitor(void *cacheline) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 " + : + : "a"(cacheline) + :); +#else + _umonitor(cacheline); +#endif +} +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline int __kmp_umwait(uint32_t hint, uint64_t counter) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _umwait(hint, counter); +#endif +} +#elif KMP_HAVE_MWAIT +#if KMP_OS_UNIX +#include <pmmintrin.h> +#else +#include <intrin.h> +#endif +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void +__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) { + _mm_monitor(cacheline, extensions, hints); +} +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void +__kmp_mm_mwait(unsigned extensions, unsigned hints) { + _mm_mwait(extensions, hints); +} +#endif // KMP_HAVE_UMWAIT + #if KMP_ARCH_X86 extern void __kmp_x86_pause(void); #elif KMP_MIC @@ -1307,6 +1420,9 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); } #define KMP_INIT_YIELD(count) \ { (count) = __kmp_yield_init; } +#define KMP_INIT_BACKOFF(time) \ + { (time) = __kmp_pause_init; } + #define KMP_OVERSUBSCRIBED \ (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) @@ -1344,7 +1460,36 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); } } \ } -#define KMP_YIELD_OVERSUB_ELSE_SPIN(count) \ +// If TPAUSE is available & enabled, use it. If oversubscribed, use the slower +// (C0.2) state, which improves performance of other SMT threads on the same +// core, otherwise, use the fast (C0.1) default state, or whatever the user has +// requested. Uses a timed TPAUSE, and exponential backoff. If TPAUSE isn't +// available, fall back to the regular CPU pause and yield combination. 
+#if KMP_HAVE_UMWAIT +#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \ + { \ + if (__kmp_tpause_enabled) { \ + if (KMP_OVERSUBSCRIBED) { \ + __kmp_tpause(0, (time)); \ + } else { \ + __kmp_tpause(__kmp_tpause_hint, (time)); \ + } \ + (time) *= 2; \ + } else { \ + KMP_CPU_PAUSE(); \ + if ((KMP_TRY_YIELD_OVERSUB)) { \ + __kmp_yield(); \ + } else if (__kmp_use_yield == 1) { \ + (count) -= 2; \ + if (!(count)) { \ + __kmp_yield(); \ + (count) = __kmp_yield_next; \ + } \ + } \ + } \ + } +#else +#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \ { \ KMP_CPU_PAUSE(); \ if ((KMP_TRY_YIELD_OVERSUB)) \ @@ -1357,86 +1502,14 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); } } \ } \ } - -// User-level Monitor/Mwait -#if KMP_HAVE_UMWAIT -// We always try for UMWAIT first -#if KMP_HAVE_WAITPKG_INTRINSICS -#if KMP_HAVE_IMMINTRIN_H -#include <immintrin.h> -#elif KMP_HAVE_INTRIN_H -#include <intrin.h> -#endif -#endif // KMP_HAVE_WAITPKG_INTRINSICS -KMP_ATTRIBUTE_TARGET_WAITPKG -static inline int __kmp_tpause(uint32_t hint, uint64_t counter) { -#if !KMP_HAVE_WAITPKG_INTRINSICS - uint32_t timeHi = uint32_t(counter >> 32); - uint32_t timeLo = uint32_t(counter & 0xffffffff); - char flag; - __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n" - "setb %0" - : "=r"(flag) - : "a"(timeLo), "d"(timeHi), "c"(hint) - :); - return flag; -#else - return _tpause(hint, counter); -#endif -} -KMP_ATTRIBUTE_TARGET_WAITPKG -static inline void __kmp_umonitor(void *cacheline) { -#if !KMP_HAVE_WAITPKG_INTRINSICS - __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 " - : - : "a"(cacheline) - :); -#else - _umonitor(cacheline); -#endif -} -KMP_ATTRIBUTE_TARGET_WAITPKG -static inline int __kmp_umwait(uint32_t hint, uint64_t counter) { -#if !KMP_HAVE_WAITPKG_INTRINSICS - uint32_t timeHi = uint32_t(counter >> 32); - uint32_t timeLo = uint32_t(counter & 0xffffffff); - char flag; - __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n" - "setb %0" - : "=r"(flag) - : "a"(timeLo), "d"(timeHi), "c"(hint) - :); - return flag; -#else - return _umwait(hint, counter); -#endif -} -#elif KMP_HAVE_MWAIT -#if KMP_OS_UNIX -#include <pmmintrin.h> -#else -#include <intrin.h> -#endif -#if KMP_OS_UNIX -__attribute__((target("sse3"))) -#endif -static inline void -__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) { - _mm_monitor(cacheline, extensions, hints); -} -#if KMP_OS_UNIX -__attribute__((target("sse3"))) -#endif -static inline void -__kmp_mm_mwait(unsigned extensions, unsigned hints) { - _mm_mwait(extensions, hints); -} #endif // KMP_HAVE_UMWAIT /* ------------------------------------------------------------------------ */ /* Support datatypes for the orphaned construct nesting checks. */ /* ------------------------------------------------------------------------ */ +/* When adding to this enum, add its corresponding string in cons_text_c[] + * array in kmp_error.cpp */ enum cons_type { ct_none, ct_parallel, @@ -1879,6 +1952,15 @@ typedef struct kmp_disp { 0 // Thread th_reap_state: not safe to reap (tasking) #define KMP_SAFE_TO_REAP 1 // Thread th_reap_state: safe to reap (not tasking) +// The flag_type describes the storage used for the flag. 
+enum flag_type { + flag32, /**< atomic 32 bit flags */ + flag64, /**< 64 bit flags */ + atomic_flag64, /**< atomic 64 bit flags */ + flag_oncore, /**< special 64-bit flag for on-core barrier (hierarchical) */ + flag_unset +}; + enum barrier_type { bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction barriers if enabled) */ @@ -1902,6 +1984,7 @@ typedef enum kmp_bar_pat { /* Barrier communication patterns */ bp_hyper_bar = 2, /* Hypercube-embedded tree with min branching factor 2^n */ bp_hierarchical_bar = 3, /* Machine hierarchy tree */ + bp_dist_bar = 4, /* Distributed barrier */ bp_last_bar /* Placeholder to mark the end */ } kmp_bar_pat_e; @@ -2241,22 +2324,26 @@ typedef union kmp_depnode kmp_depnode_t; typedef struct kmp_depnode_list kmp_depnode_list_t; typedef struct kmp_dephash_entry kmp_dephash_entry_t; +// macros for checking dep flag as an integer #define KMP_DEP_IN 0x1 #define KMP_DEP_OUT 0x2 #define KMP_DEP_INOUT 0x3 #define KMP_DEP_MTX 0x4 #define KMP_DEP_SET 0x8 +#define KMP_DEP_ALL 0x80 // Compiler sends us this info: typedef struct kmp_depend_info { kmp_intptr_t base_addr; size_t len; union { - kmp_uint8 flag; - struct { + kmp_uint8 flag; // flag as an unsigned char + struct { // flag as a set of 8 bits unsigned in : 1; unsigned out : 1; unsigned mtx : 1; unsigned set : 1; + unsigned unused : 3; + unsigned all : 1; } flags; }; } kmp_depend_info_t; @@ -2302,6 +2389,7 @@ struct kmp_dephash_entry { typedef struct kmp_dephash { kmp_dephash_entry_t **buckets; size_t size; + kmp_depnode_t *last_all; size_t generation; kmp_uint32 nelements; kmp_uint32 nconflicts; @@ -2409,13 +2497,6 @@ struct kmp_taskdata { /* aligned during dynamic allocation */ kmp_depnode_t *td_depnode; // Pointer to graph node if this task has dependencies kmp_task_team_t *td_task_team; - // The global thread id of the encountering thread. We need it because when a - // regular task depends on a hidden helper task, and the hidden helper task - // is finished on a hidden helper thread, it will call __kmp_release_deps to - // release all dependences. If now the task is a regular task, we need to pass - // the encountering gtid such that the task will be picked up and executed by - // its encountering team instead of hidden helper team. - kmp_int32 encountering_gtid; size_t td_size_alloc; // Size of task structure, including shareds etc. #if defined(KMP_GOMP_COMPAT) // 4 or 8 byte integers for the loop bounds in GOMP_taskloop @@ -2626,6 +2707,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { /* while awaiting queuing lock acquire */ volatile void *th_sleep_loc; // this points at a kmp_flag<T> + flag_type th_sleep_loc_type; // enum type of flag stored in th_sleep_loc ident_t *th_ident; unsigned th_x; // Random number generator data @@ -2646,6 +2728,9 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { written by the worker thread) */ kmp_uint8 th_active_in_pool; // included in count of #active threads in pool int th_active; // ! 
sleeping; 32 bits for TCR/TCW + std::atomic<kmp_uint32> th_used_in_team; // Flag indicating use in team + // 0 = not used in team; 1 = used in team; + // 2 = transitioning to not used in team; 3 = transitioning to used in team struct cons_header *th_cons; // used for consistency check #if KMP_USE_HIER_SCHED // used for hierarchical scheduling @@ -2825,6 +2910,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team { #if USE_ITT_BUILD void *t_stack_id; // team specific stack stitching id (for ittnotify) #endif /* USE_ITT_BUILD */ + distributedBarrier *b; // Distributed barrier data associated with team } kmp_base_team_t; union KMP_ALIGN_CACHE kmp_team { @@ -2949,6 +3035,9 @@ extern int __kmp_storage_map_verbose_specified; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 extern kmp_cpuinfo_t __kmp_cpuinfo; +static inline bool __kmp_is_hybrid_cpu() { return __kmp_cpuinfo.flags.hybrid; } +#else +static inline bool __kmp_is_hybrid_cpu() { return false; } #endif extern volatile int __kmp_init_serial; @@ -3033,6 +3122,7 @@ extern kmp_int32 __kmp_use_yield; extern kmp_int32 __kmp_use_yield_exp_set; extern kmp_uint32 __kmp_yield_init; extern kmp_uint32 __kmp_yield_next; +extern kmp_uint64 __kmp_pause_init; /* ------------------------------------------------------------------------- */ extern int __kmp_allThreadsSpecified; @@ -3235,6 +3325,13 @@ extern int __kmp_mwait_enabled; // Runtime check if ring3 mwait is enabled extern int __kmp_mwait_hints; // Hints to pass in to mwait #endif +#if KMP_HAVE_UMWAIT +extern int __kmp_waitpkg_enabled; // Runtime check if waitpkg exists +extern int __kmp_tpause_state; // 0 (default), 1=C0.1, 2=C0.2; from KMP_TPAUSE +extern int __kmp_tpause_hint; // 1=C0.1 (default), 0=C0.2; from KMP_TPAUSE +extern int __kmp_tpause_enabled; // 0 (default), 1 (KMP_TPAUSE is non-zero) +#endif + /* ------------------------------------------------------------------------- */ extern kmp_global_t __kmp_global; /* global status */ @@ -4118,6 +4215,10 @@ typedef enum kmp_severity_t { } kmp_severity_t; extern void __kmpc_error(ident_t *loc, int severity, const char *message); +// Support for scope directive +KMP_EXPORT void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved); +KMP_EXPORT void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved); + #ifdef __cplusplus } #endif @@ -4126,18 +4227,26 @@ template <bool C, bool S> extern void __kmp_suspend_32(int th_gtid, kmp_flag_32<C, S> *flag); template <bool C, bool S> extern void __kmp_suspend_64(int th_gtid, kmp_flag_64<C, S> *flag); +template <bool C, bool S> +extern void __kmp_atomic_suspend_64(int th_gtid, + kmp_atomic_flag_64<C, S> *flag); extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag); #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT template <bool C, bool S> extern void __kmp_mwait_32(int th_gtid, kmp_flag_32<C, S> *flag); template <bool C, bool S> extern void __kmp_mwait_64(int th_gtid, kmp_flag_64<C, S> *flag); +template <bool C, bool S> +extern void __kmp_atomic_mwait_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag); extern void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag); #endif template <bool C, bool S> extern void __kmp_resume_32(int target_gtid, kmp_flag_32<C, S> *flag); template <bool C, bool S> extern void __kmp_resume_64(int target_gtid, kmp_flag_64<C, S> *flag); +template <bool C, bool S> +extern void __kmp_atomic_resume_64(int target_gtid, + kmp_atomic_flag_64<C, S> *flag); extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag); template <bool C, bool S> @@ -4156,6 +4265,14 @@ int 
__kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, void *itt_sync_obj, #endif /* USE_ITT_BUILD */ kmp_int32 is_constrained); +template <bool C, bool S> +int __kmp_atomic_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, + kmp_atomic_flag_64<C, S> *flag, + int final_spin, int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, int *thread_finished, @@ -4213,6 +4330,15 @@ public: } } } + /// Instead of erroring out, return non-zero when + /// unsuccessful fopen() for any reason + int try_open(const char *filename, const char *mode) { + KMP_ASSERT(!f); + f = fopen(filename, mode); + if (!f) + return errno; + return 0; + } /// Set the FILE* object to stdout and output there /// No open call should happen before this call. void set_stdout() { diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp index 8b40bd7ecd..414a27fb05 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp @@ -26,6 +26,7 @@ #define HWLOC_GROUP_KIND_INTEL_DIE 104 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 #endif +#include <ctype.h> // The machine topology kmp_topology_t *__kmp_topology = nullptr; @@ -123,6 +124,20 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { return ((plural) ? "unknowns" : "unknown"); } +const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "Intel Atom(R) processor"; + case KMP_HW_CORE_TYPE_CORE: + return "Intel(R) Core(TM) processor"; +#endif + } + return "unknown"; +} + //////////////////////////////////////////////////////////////////////////////// // kmp_hw_thread_t methods int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { @@ -174,20 +189,94 @@ void kmp_hw_thread_t::print() const { for (int i = 0; i < depth; ++i) { printf("%4d ", ids[i]); } + if (attrs) { + if (attrs.is_core_type_valid()) + printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type())); + if (attrs.is_core_eff_valid()) + printf(" (eff=%d)", attrs.get_core_eff()); + } printf("\n"); } //////////////////////////////////////////////////////////////////////////////// // kmp_topology_t methods +// Add a layer to the topology based on the ids. 
Assume the topology +// is perfectly nested (i.e., so no object has more than one parent) +void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) { + // Figure out where the layer should go by comparing the ids of the current + // layers with the new ids + int target_layer; + int previous_id = kmp_hw_thread_t::UNKNOWN_ID; + int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID; + + // Start from the highest layer and work down to find target layer + // If new layer is equal to another layer then put the new layer above + for (target_layer = 0; target_layer < depth; ++target_layer) { + bool layers_equal = true; + bool strictly_above_target_layer = false; + for (int i = 0; i < num_hw_threads; ++i) { + int id = hw_threads[i].ids[target_layer]; + int new_id = ids[i]; + if (id != previous_id && new_id == previous_new_id) { + // Found the layer we are strictly above + strictly_above_target_layer = true; + layers_equal = false; + break; + } else if (id == previous_id && new_id != previous_new_id) { + // Found a layer we are below. Move to next layer and check. + layers_equal = false; + break; + } + previous_id = id; + previous_new_id = new_id; + } + if (strictly_above_target_layer || layers_equal) + break; + } + + // Found the layer we are above. Now move everything to accommodate the new + // layer. And put the new ids and type into the topology. + for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + types[j] = types[i]; + types[target_layer] = type; + for (int k = 0; k < num_hw_threads; ++k) { + for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + hw_threads[k].ids[j] = hw_threads[k].ids[i]; + hw_threads[k].ids[target_layer] = ids[k]; + } + equivalent[type] = type; + depth++; +} + +#if KMP_GROUP_AFFINITY +// Insert the Windows Processor Group structure into the topology +void kmp_topology_t::_insert_windows_proc_groups() { + // Do not insert the processor group structure for a single group + if (__kmp_num_proc_groups == 1) + return; + kmp_affin_mask_t *mask; + int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads); + KMP_CPU_ALLOC(mask); + for (int i = 0; i < num_hw_threads; ++i) { + KMP_CPU_ZERO(mask); + KMP_CPU_SET(hw_threads[i].os_id, mask); + ids[i] = __kmp_get_proc_group(mask); + } + KMP_CPU_FREE(mask); + _insert_layer(KMP_HW_PROC_GROUP, ids); + __kmp_free(ids); +} +#endif + // Remove layers that don't add information to the topology. 
// This is done by having the layer take on the id = UNKNOWN_ID (-1) void kmp_topology_t::_remove_radix1_layers() { int preference[KMP_HW_LAST]; int top_index1, top_index2; // Set up preference associative array - preference[KMP_HW_PROC_GROUP] = 110; - preference[KMP_HW_SOCKET] = 100; + preference[KMP_HW_SOCKET] = 110; + preference[KMP_HW_PROC_GROUP] = 100; preference[KMP_HW_CORE] = 95; preference[KMP_HW_THREAD] = 90; preference[KMP_HW_NUMA] = 85; @@ -305,6 +394,7 @@ void kmp_topology_t::_gather_enumeration_information() { count[i] = 0; ratio[i] = 0; } + int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; for (int layer = 0; layer < depth; ++layer) { @@ -320,6 +410,29 @@ void kmp_topology_t::_gather_enumeration_information() { ratio[l] = max[l]; max[l] = 1; } + // Figure out the number of different core types + // and efficiencies for hybrid CPUs + if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) { + if (hw_thread.attrs.is_core_eff_valid() && + hw_thread.attrs.core_eff >= num_core_efficiencies) { + // Because efficiencies can range from 0 to max efficiency - 1, + // the number of efficiencies is max efficiency + 1 + num_core_efficiencies = hw_thread.attrs.core_eff + 1; + } + if (hw_thread.attrs.is_core_type_valid()) { + bool found = false; + for (int j = 0; j < num_core_types; ++j) { + if (hw_thread.attrs.get_core_type() == core_types[j]) { + found = true; + break; + } + } + if (!found) { + KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES); + core_types[num_core_types++] = hw_thread.attrs.get_core_type(); + } + } + } break; } } @@ -333,6 +446,42 @@ void kmp_topology_t::_gather_enumeration_information() { } } +int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr, + int above_level, + bool find_all) const { + int current, current_max; + int previous_id[KMP_HW_LAST]; + for (int i = 0; i < depth; ++i) + previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; + int core_level = get_level(KMP_HW_CORE); + if (find_all) + above_level = -1; + KMP_ASSERT(above_level < core_level); + current_max = 0; + current = 0; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) { + if (current > current_max) + current_max = current; + current = hw_thread.attrs.contains(attr); + } else { + for (int level = above_level + 1; level <= core_level; ++level) { + if (hw_thread.ids[level] != previous_id[level]) { + if (hw_thread.attrs.contains(attr)) + current++; + break; + } + } + } + for (int level = 0; level < depth; ++level) + previous_id[level] = hw_thread.ids[level]; + } + if (current > current_max) + current_max = current; + return current_max; +} + // Find out if the topology is uniform void kmp_topology_t::_discover_uniformity() { int num = 1; @@ -406,7 +555,7 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, kmp_topology_t *retval; // Allocate all data in one large allocation size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc + - sizeof(int) * ndepth * 3; + sizeof(int) * (size_t)KMP_HW_LAST * 3; char *bytes = (char *)__kmp_allocate(size); retval = (kmp_topology_t *)bytes; if (nproc > 0) { @@ -419,8 +568,12 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, int *arr = (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc); retval->types = (kmp_hw_t *)arr; - retval->ratio = arr + ndepth; - retval->count = arr + 2 * ndepth; + 
retval->ratio = arr + (size_t)KMP_HW_LAST; + retval->count = arr + 2 * (size_t)KMP_HW_LAST; + retval->num_core_efficiencies = 0; + retval->num_core_types = 0; + for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) + retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN; KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; } for (int i = 0; i < ndepth; ++i) { retval->types[i] = types[i]; @@ -478,6 +631,13 @@ void kmp_topology_t::dump() const { } printf("\n"); + printf("* num_core_eff: %d\n", num_core_efficiencies); + printf("* num_core_types: %d\n", num_core_types); + printf("* core_types: "); + for (int i = 0; i < num_core_types; ++i) + printf("%3d ", core_types[i]); + printf("\n"); + printf("* equivalent map:\n"); KMP_FOREACH_HW_TYPE(i) { const char *key = __kmp_hw_get_keyword(i); @@ -571,6 +731,29 @@ void kmp_topology_t::print(const char *env_var) const { } KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); + // Hybrid topology information + if (__kmp_is_hybrid_cpu()) { + for (int i = 0; i < num_core_types; ++i) { + kmp_hw_core_type_t core_type = core_types[i]; + kmp_hw_attr_t attr; + attr.clear(); + attr.set_core_type(core_type); + int ncores = get_ncores_with_attr(attr); + if (ncores > 0) { + KMP_INFORM(TopologyHybrid, env_var, ncores, + __kmp_hw_get_core_type_string(core_type)); + KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS) + for (int eff = 0; eff < num_core_efficiencies; ++eff) { + attr.set_core_eff(eff); + int ncores_with_eff = get_ncores_with_attr(attr); + if (ncores_with_eff > 0) { + KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff); + } + } + } + } + } + if (num_hw_threads <= 0) { __kmp_str_buf_free(&buf); return; @@ -585,6 +768,10 @@ void kmp_topology_t::print(const char *env_var) const { __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); } + if (__kmp_is_hybrid_cpu()) + __kmp_str_buf_print( + &buf, "(%s)", + __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type())); KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); } @@ -592,6 +779,9 @@ void kmp_topology_t::print(const char *env_var) const { } void kmp_topology_t::canonicalize() { +#if KMP_GROUP_AFFINITY + _insert_windows_proc_groups(); +#endif _remove_radix1_layers(); _gather_enumeration_information(); _discover_uniformity(); @@ -640,6 +830,25 @@ void kmp_topology_t::canonicalize() { __kmp_hw_get_catalog_string(gran_type)); __kmp_affinity_gran = gran_type; } +#if KMP_GROUP_AFFINITY + // If more than one processor group exists, and the level of + // granularity specified by the user is too coarse, then the + // granularity must be adjusted "down" to processor group affinity + // because threads can only exist within one processor group. + // For example, if a user sets granularity=socket and there are two + // processor groups that cover a socket, then the runtime must + // restrict the granularity down to the processor group level. 
+ if (__kmp_num_proc_groups > 1) { + int gran_depth = __kmp_topology->get_level(gran_type); + int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP); + if (gran_depth >= 0 && proc_group_depth >= 0 && + gran_depth < proc_group_depth) { + KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY", + __kmp_hw_get_catalog_string(__kmp_affinity_gran)); + __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP; + } + } +#endif __kmp_affinity_gran_levels = 0; for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) __kmp_affinity_gran_levels++; @@ -673,6 +882,56 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, _discover_uniformity(); } +// Represents running sub IDs for a single core attribute where +// attribute values have SIZE possibilities. +template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t { + int last_level; // last level in topology to consider for sub_ids + int sub_id[SIZE]; // The sub ID for a given attribute value + int prev_sub_id[KMP_HW_LAST]; + IndexFunc indexer; + +public: + kmp_sub_ids_t(int last_level) : last_level(last_level) { + KMP_ASSERT(last_level < KMP_HW_LAST); + for (size_t i = 0; i < SIZE; ++i) + sub_id[i] = -1; + for (size_t i = 0; i < KMP_HW_LAST; ++i) + prev_sub_id[i] = -1; + } + void update(const kmp_hw_thread_t &hw_thread) { + int idx = indexer(hw_thread); + KMP_ASSERT(idx < (int)SIZE); + for (int level = 0; level <= last_level; ++level) { + if (hw_thread.sub_ids[level] != prev_sub_id[level]) { + if (level < last_level) + sub_id[idx] = -1; + sub_id[idx]++; + break; + } + } + for (int level = 0; level <= last_level; ++level) + prev_sub_id[level] = hw_thread.sub_ids[level]; + } + int get_sub_id(const kmp_hw_thread_t &hw_thread) const { + return sub_id[indexer(hw_thread)]; + } +}; + +static kmp_str_buf_t * +__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, + bool plural) { + __kmp_str_buf_init(buf); + if (attr.is_core_type_valid()) + __kmp_str_buf_print(buf, "%s %s", + __kmp_hw_get_core_type_string(attr.get_core_type()), + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural)); + else + __kmp_str_buf_print(buf, "%s eff=%d", + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural), + attr.get_core_eff()); + return buf; +} + // Apply the KMP_HW_SUBSET envirable to the topology // Returns true if KMP_HW_SUBSET filtered any processors // otherwise, returns false @@ -681,18 +940,27 @@ bool kmp_topology_t::filter_hw_subset() { if (!__kmp_hw_subset) return false; + // First, sort the KMP_HW_SUBSET items by the machine topology + __kmp_hw_subset->sort(); + // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology + bool using_core_types = false; + bool using_core_effs = false; int hw_subset_depth = __kmp_hw_subset->get_depth(); kmp_hw_t specified[KMP_HW_LAST]; + int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth); KMP_ASSERT(hw_subset_depth > 0); KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; } + int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < hw_subset_depth; ++i) { int max_count; - int num = __kmp_hw_subset->at(i).num; - int offset = __kmp_hw_subset->at(i).offset; - kmp_hw_t type = __kmp_hw_subset->at(i).type; + const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i); + int num = item.num[0]; + int offset = item.offset[0]; + kmp_hw_t type = item.type; kmp_hw_t equivalent_type = equivalent[type]; int level = get_level(type); + topology_levels[i] = level; // Check to see if current layer is in detected machine topology if (equivalent_type != 
KMP_HW_UNKNOWN) { @@ -703,8 +971,8 @@ bool kmp_topology_t::filter_hw_subset() { return false; } - // Check to see if current layer has already been specified - // either directly or through an equivalent type + // Check to see if current layer has already been + // specified either directly or through an equivalent type if (specified[equivalent_type] != KMP_HW_UNKNOWN) { KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), __kmp_hw_get_catalog_string(specified[equivalent_type])); @@ -712,66 +980,247 @@ bool kmp_topology_t::filter_hw_subset() { } specified[equivalent_type] = type; - // Check to see if layers are in order - if (i + 1 < hw_subset_depth) { - kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type); - if (next_type == KMP_HW_UNKNOWN) { - KMP_WARNING( - AffHWSubsetNotExistGeneric, - __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type)); - return false; - } - int next_topology_level = get_level(next_type); - if (level > next_topology_level) { - KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type), - __kmp_hw_get_catalog_string(next_type)); - return false; - } - } - // Check to see if each layer's num & offset parameters are valid max_count = get_ratio(level); - if (max_count < 0 || num + offset > max_count) { + if (max_count < 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { bool plural = (num > 1); KMP_WARNING(AffHWSubsetManyGeneric, __kmp_hw_get_catalog_string(type, plural)); return false; } + + // Check to see if core attributes are consistent + if (core_level == level) { + // Determine which core attributes are specified + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_type_valid()) + using_core_types = true; + if (item.attr[j].is_core_eff_valid()) + using_core_effs = true; + } + + // Check if using a single core attribute on non-hybrid arch. + // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute. + // + // Check if using multiple core attributes on non-hyrbid arch. + // Ignore all of KMP_HW_SUBSET if this is the case. 
+ if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) { + if (item.num_attrs == 1) { + if (using_core_effs) { + KMP_WARNING(AffHWSubsetIgnoringAttr, "efficiency"); + } else { + KMP_WARNING(AffHWSubsetIgnoringAttr, "core_type"); + } + using_core_effs = false; + using_core_types = false; + } else { + KMP_WARNING(AffHWSubsetAttrsNonHybrid); + return false; + } + } + + // Check if using both core types and core efficiencies together + if (using_core_types && using_core_effs) { + KMP_WARNING(AffHWSubsetIncompat, "core_type", "efficiency"); + return false; + } + + // Check that core efficiency values are valid + if (using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_eff_valid()) { + int core_eff = item.attr[j].get_core_eff(); + if (core_eff < 0 || core_eff >= num_core_efficiencies) { + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff()); + __kmp_msg(kmp_ms_warning, + KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str), + KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1), + __kmp_msg_null); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + // Check that the number of requested cores with attributes is valid + if (using_core_types || using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + int num = item.num[j]; + int offset = item.offset[j]; + int level_above = core_level - 1; + if (level_above >= 0) { + max_count = get_ncores_with_attr_per(item.attr[j], level_above); + if (max_count <= 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0); + KMP_WARNING(AffHWSubsetManyGeneric, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + if ((using_core_types || using_core_effs) && item.num_attrs > 1) { + for (int j = 0; j < item.num_attrs; ++j) { + // Ambiguous use of specific core attribute + generic core + // e.g., 4c & 3c:intel_core or 4c & 3c:eff1 + if (!item.attr[j]) { + kmp_hw_attr_t other_attr; + for (int k = 0; k < item.num_attrs; ++k) { + if (item.attr[k] != item.attr[j]) { + other_attr = item.attr[k]; + break; + } + } + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0); + KMP_WARNING(AffHWSubsetIncompat, + __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str); + __kmp_str_buf_free(&buf); + return false; + } + // Allow specifying a specific core type or core eff exactly once + for (int k = 0; k < j; ++k) { + if (!item.attr[j] || !item.attr[k]) + continue; + if (item.attr[k] == item.attr[j]) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, + item.num[j] > 0); + KMP_WARNING(AffHWSubsetAttrRepeat, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + } } - // Apply the filtered hardware subset - int new_index = 0; + struct core_type_indexer { + int operator()(const kmp_hw_thread_t &t) const { + switch (t.attrs.get_core_type()) { +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return 1; + case KMP_HW_CORE_TYPE_CORE: + return 2; +#endif + case KMP_HW_CORE_TYPE_UNKNOWN: + return 0; + } + KMP_ASSERT(0); + return 0; + } + }; + struct core_eff_indexer { + int operator()(const kmp_hw_thread_t &t) const { + return t.attrs.get_core_eff(); + } + }; + + kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids( + core_level); + kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids( + core_level); + + 
// Determine which hardware threads should be filtered. + int num_filtered = 0; + bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Update type_sub_id + if (using_core_types) + core_type_sub_ids.update(hw_thread); + if (using_core_effs) + core_eff_sub_ids.update(hw_thread); + // Check to see if this hardware thread should be filtered bool should_be_filtered = false; - for (int level = 0, hw_subset_index = 0; - level < depth && hw_subset_index < hw_subset_depth; ++level) { - kmp_hw_t topology_type = types[level]; - auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index); - kmp_hw_t hw_subset_type = hw_subset_item.type; - if (topology_type != hw_subset_type) + for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth; + ++hw_subset_index) { + const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index); + int level = topology_levels[hw_subset_index]; + if (level == -1) continue; - int num = hw_subset_item.num; - int offset = hw_subset_item.offset; - hw_subset_index++; - if (hw_thread.sub_ids[level] < offset || - hw_thread.sub_ids[level] >= offset + num) { - should_be_filtered = true; - break; + if ((using_core_effs || using_core_types) && level == core_level) { + // Look for the core attribute in KMP_HW_SUBSET which corresponds + // to this hardware thread's core attribute. Use this num,offset plus + // the running sub_id for the particular core attribute of this hardware + // thread to determine if the hardware thread should be filtered or not. + int attr_idx; + kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type(); + int core_eff = hw_thread.attrs.get_core_eff(); + for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) { + if (using_core_types && + hw_subset_item.attr[attr_idx].get_core_type() == core_type) + break; + if (using_core_effs && + hw_subset_item.attr[attr_idx].get_core_eff() == core_eff) + break; + } + // This core attribute isn't in the KMP_HW_SUBSET so always filter it. 
+ if (attr_idx == hw_subset_item.num_attrs) { + should_be_filtered = true; + break; + } + int sub_id; + int num = hw_subset_item.num[attr_idx]; + int offset = hw_subset_item.offset[attr_idx]; + if (using_core_types) + sub_id = core_type_sub_ids.get_sub_id(hw_thread); + else + sub_id = core_eff_sub_ids.get_sub_id(hw_thread); + if (sub_id < offset || + (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) { + should_be_filtered = true; + break; + } + } else { + int num = hw_subset_item.num[0]; + int offset = hw_subset_item.offset[0]; + if (hw_thread.sub_ids[level] < offset || + (num != kmp_hw_subset_t::USE_ALL && + hw_thread.sub_ids[level] >= offset + num)) { + should_be_filtered = true; + break; + } } } - if (!should_be_filtered) { + // Collect filtering information + filtered[i] = should_be_filtered; + if (should_be_filtered) + num_filtered++; + } + + // One last check that we shouldn't allow filtering entire machine + if (num_filtered == num_hw_threads) { + KMP_WARNING(AffHWSubsetAllFiltered); + __kmp_free(filtered); + return false; + } + + // Apply the filter + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + if (!filtered[i]) { if (i != new_index) - hw_threads[new_index] = hw_thread; + hw_threads[new_index] = hw_threads[i]; new_index++; } else { #if KMP_AFFINITY_SUPPORTED - KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask); + KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask); #endif __kmp_avail_proc--; } } + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); num_hw_threads = new_index; @@ -780,6 +1229,7 @@ bool kmp_topology_t::filter_hw_subset() { _discover_uniformity(); _set_globals(); _set_last_level_cache(); + __kmp_free(filtered); return true; } @@ -986,7 +1436,67 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, return buf; } -void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { +// Return (possibly empty) affinity mask representing the offline CPUs +// Caller must free the mask +kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { + kmp_affin_mask_t *offline; + KMP_CPU_ALLOC(offline); + KMP_CPU_ZERO(offline); +#if KMP_OS_LINUX + int n, begin_cpu, end_cpu; + kmp_safe_raii_file_t offline_file; + auto skip_ws = [](FILE *f) { + int c; + do { + c = fgetc(f); + } while (isspace(c)); + if (c != EOF) + ungetc(c, f); + }; + // File contains CSV of integer ranges representing the offline CPUs + // e.g., 1,2,4-7,9,11-15 + int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r"); + if (status != 0) + return offline; + while (!feof(offline_file)) { + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &begin_cpu); + if (n != 1) + break; + skip_ws(offline_file); + int c = fgetc(offline_file); + if (c == EOF || c == ',') { + // Just single CPU + end_cpu = begin_cpu; + } else if (c == '-') { + // Range of CPUs + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &end_cpu); + if (n != 1) + break; + skip_ws(offline_file); + c = fgetc(offline_file); // skip ',' + } else { + // Syntax problem + break; + } + // Ensure a valid range of CPUs + if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 || + end_cpu >= __kmp_xproc || begin_cpu > end_cpu) { + continue; + } + // Insert [begin_cpu, end_cpu] into offline mask + for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) { + KMP_CPU_SET(cpu, offline); + } + } +#endif + return offline; +} + +// Return the number of available procs +int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { + int avail_proc = 0; KMP_CPU_ZERO(mask); #if KMP_GROUP_AFFINITY @@ -999,6 
+1509,7 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { int num = __kmp_GetActiveProcessorCount(group); for (i = 0; i < num; i++) { KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); + avail_proc++; } } } else @@ -1007,10 +1518,18 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { { int proc; + kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus(); for (proc = 0; proc < __kmp_xproc; proc++) { + // Skip offline CPUs + if (KMP_CPU_ISSET(proc, offline_cpus)) + continue; KMP_CPU_SET(proc, mask); + avail_proc++; } + KMP_CPU_FREE(offline_cpus); } + + return avail_proc; } // All of the __kmp_affinity_create_*_map() routines should allocate the @@ -1156,6 +1675,45 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { return true; } + // Handle multiple types of cores if they exist on the system + int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0); + + typedef struct kmp_hwloc_cpukinds_info_t { + int efficiency; + kmp_hw_core_type_t core_type; + hwloc_bitmap_t mask; + } kmp_hwloc_cpukinds_info_t; + kmp_hwloc_cpukinds_info_t *cpukinds = nullptr; + + if (nr_cpu_kinds > 0) { + unsigned nr_infos; + struct hwloc_info_s *infos; + cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate( + sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds); + for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) { + cpukinds[idx].efficiency = -1; + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN; + cpukinds[idx].mask = hwloc_bitmap_alloc(); + if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask, + &cpukinds[idx].efficiency, &nr_infos, &infos, + 0) == 0) { + for (unsigned i = 0; i < nr_infos; ++i) { + if (__kmp_str_match("CoreType", 8, infos[i].name)) { +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_str_match("IntelAtom", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM; + break; + } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE; + break; + } +#endif + } + } + } + } + } + root = hwloc_get_root_obj(tp); // Figure out the depth and types in the topology @@ -1215,6 +1773,20 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hw_thread.clear(); hw_thread.ids[index] = pu->logical_index; hw_thread.os_id = pu->os_index; + // If multiple core types, then set that attribute for the hardware thread + if (cpukinds) { + int cpukind_index = -1; + for (int i = 0; i < nr_cpu_kinds; ++i) { + if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) { + cpukind_index = i; + break; + } + } + if (cpukind_index >= 0) { + hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type); + hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency); + } + } index--; } obj = pu; @@ -1258,6 +1830,13 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { if (included) hw_thread_index++; } + + // Free the core types information + if (cpukinds) { + for (int idx = 0; idx < nr_cpu_kinds; ++idx) + hwloc_bitmap_free(cpukinds[idx].mask); + __kmp_free(cpukinds); + } __kmp_topology->sort_ids(); return true; } @@ -1782,6 +2361,26 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { return true; } +// Hybrid cpu detection using CPUID.1A +// Thread should be pinned to processor already +static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency, + unsigned *native_model_id) { + kmp_cpuid buf; + __kmp_x86_cpuid(0x1a, 0, &buf); + *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 
31>(buf.eax); + switch (*type) { + case KMP_HW_CORE_TYPE_ATOM: + *efficiency = 0; + break; + case KMP_HW_CORE_TYPE_CORE: + *efficiency = 1; + break; + default: + *efficiency = 0; + } + *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); +} + // Intel(R) microarchitecture code name Nehalem, Dunnington and later // architectures support a newer interface for specifying the x2APIC Ids, // based on CPUID.B or CPUID.1F @@ -2051,6 +2650,15 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; } } + // Hybrid information + if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { + kmp_hw_core_type_t type; + unsigned native_model_id; + int efficiency; + __kmp_get_hybrid_info(&type, &efficiency, &native_model_id); + hw_thread.attrs.set_core_type(type); + hw_thread.attrs.set_core_eff(efficiency); + } hw_thread_index++; } KMP_ASSERT(hw_thread_index > 0); @@ -2386,7 +2994,10 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; - KMP_ASSERT(nodeIdIndex + level <= maxIndex); + // validate the input before using level: + if (level > (unsigned)__kmp_xproc) { // level is too big + level = __kmp_xproc; + } if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; threadInfo[num_avail][nodeIdIndex + level] = val; @@ -3497,8 +4108,8 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_affin_fullMask); KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); } - __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); - __kmp_avail_proc = __kmp_xproc; + __kmp_avail_proc = + __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); #if KMP_OS_WINDOWS // Set the process affinity mask since threads' affinity // masks must be subset of process mask in Windows* OS @@ -4145,14 +4756,19 @@ int __kmp_aux_set_affinity(void **mask) { int __kmp_aux_get_affinity(void **mask) { int gtid; int retval; +#if KMP_OS_WINDOWS || KMP_DEBUG kmp_info_t *th; - +#endif if (!KMP_AFFINITY_CAPABLE()) { return -1; } gtid = __kmp_entry_gtid(); +#if KMP_OS_WINDOWS || KMP_DEBUG th = __kmp_threads[gtid]; +#else + (void)gtid; // unused variable +#endif KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); KA_TRACE( diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.h b/contrib/libs/cxxsupp/openmp/kmp_affinity.h index 8e72922d2c..ce00362f04 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_affinity.h +++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.h @@ -15,6 +15,7 @@ #include "kmp.h" #include "kmp_os.h" +#include <limits> #if KMP_AFFINITY_SUPPORTED #if KMP_USE_HWLOC @@ -598,6 +599,63 @@ class KMPNativeAffinity : public KMPAffinity { #endif /* KMP_OS_WINDOWS */ #endif /* KMP_AFFINITY_SUPPORTED */ +// Describe an attribute for a level in the machine topology +struct kmp_hw_attr_t { + int core_type : 8; + int core_eff : 8; + unsigned valid : 1; + unsigned reserved : 15; + + static const int UNKNOWN_CORE_EFF = -1; + + kmp_hw_attr_t() + : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF), + valid(0), reserved(0) {} + void set_core_type(kmp_hw_core_type_t type) { + valid = 1; + core_type = type; + } + void set_core_eff(int eff) { + valid = 1; + core_eff = eff; + } + kmp_hw_core_type_t get_core_type() const { + return (kmp_hw_core_type_t)core_type; + } + int get_core_eff() const { return core_eff; } + bool is_core_type_valid() const { + return core_type != KMP_HW_CORE_TYPE_UNKNOWN; + } + bool is_core_eff_valid() const { return core_eff != 
UNKNOWN_CORE_EFF; } + operator bool() const { return valid; } + void clear() { + core_type = KMP_HW_CORE_TYPE_UNKNOWN; + core_eff = UNKNOWN_CORE_EFF; + valid = 0; + } + bool contains(const kmp_hw_attr_t &other) const { + if (!valid && !other.valid) + return true; + if (valid && other.valid) { + if (other.is_core_type_valid()) { + if (!is_core_type_valid() || (get_core_type() != other.get_core_type())) + return false; + } + if (other.is_core_eff_valid()) { + if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff())) + return false; + } + return true; + } + return false; + } + bool operator==(const kmp_hw_attr_t &rhs) const { + return (rhs.valid == valid && rhs.core_eff == core_eff && + rhs.core_type == core_type); + } + bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); } +}; + class kmp_hw_thread_t { public: static const int UNKNOWN_ID = -1; @@ -607,11 +665,14 @@ public: int sub_ids[KMP_HW_LAST]; bool leader; int os_id; + kmp_hw_attr_t attrs; + void print() const; void clear() { for (int i = 0; i < (int)KMP_HW_LAST; ++i) ids[i] = UNKNOWN_ID; leader = false; + attrs.clear(); } }; @@ -624,7 +685,9 @@ class kmp_topology_t { int depth; - // The following arrays are all 'depth' long + // The following arrays are all 'depth' long and have been + // allocated to hold up to KMP_HW_LAST number of objects if + // needed so layers can be added without reallocation of any array // Orderd array of the types in the topology kmp_hw_t *types; @@ -637,6 +700,12 @@ class kmp_topology_t { // Storage containing the absolute number of each topology layer int *count; + // The number of core efficiencies. This is only useful for hybrid + // topologies. Core efficiencies will range from 0 to num efficiencies - 1 + int num_core_efficiencies; + int num_core_types; + kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES]; + // The hardware threads array // hw_threads is num_hw_threads long // Each hw_thread's ids and sub_ids are depth deep @@ -652,6 +721,14 @@ class kmp_topology_t { // Flags describing the topology flags_t flags; + // Insert a new topology layer after allocation + void _insert_layer(kmp_hw_t type, const int *ids); + +#if KMP_GROUP_AFFINITY + // Insert topology information about Windows Processor groups + void _insert_windows_proc_groups(); +#endif + // Count each item & get the num x's per y // e.g., get the number of cores and the number of threads per core // for each (x, y) in (KMP_HW_* , KMP_HW_*) @@ -675,6 +752,12 @@ class kmp_topology_t { // Set the last level cache equivalent type void _set_last_level_cache(); + // Return the number of cores with a particular attribute, 'attr'. 
+ // If 'find_all' is true, then find all cores on the machine, otherwise find + // all cores per the layer 'above' + int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above, + bool find_all = false) const; + public: // Force use of allocate()/deallocate() kmp_topology_t() = delete; @@ -764,6 +847,16 @@ public: KMP_DEBUG_ASSERT(level >= 0 && level < depth); return count[level]; } + // Return the total number of cores with attribute 'attr' + int get_ncores_with_attr(const kmp_hw_attr_t &attr) const { + return _get_ncores_with_attr(attr, -1, true); + } + // Return the number of cores with attribute + // 'attr' per topology level 'above' + int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const { + return _get_ncores_with_attr(attr, above, false); + } + #if KMP_AFFINITY_SUPPORTED void sort_compact() { qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), @@ -773,14 +866,22 @@ public: void print(const char *env_var = "KMP_AFFINITY") const; void dump() const; }; +extern kmp_topology_t *__kmp_topology; class kmp_hw_subset_t { + const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS; + public: + // Describe a machine topology item in KMP_HW_SUBSET struct item_t { - int num; kmp_hw_t type; - int offset; + int num_attrs; + int num[MAX_ATTRS]; + int offset[MAX_ATTRS]; + kmp_hw_attr_t attr[MAX_ATTRS]; }; + // Put parenthesis around max to avoid accidental use of Windows max macro. + const static int USE_ALL = (std::numeric_limits<int>::max)(); private: int depth; @@ -790,6 +891,15 @@ private: bool absolute; // The set must be able to handle up to KMP_HW_LAST number of layers KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST); + // Sorting the KMP_HW_SUBSET items to follow topology order + // All unknown topology types will be at the beginning of the subset + static int hw_subset_compare(const void *i1, const void *i2) { + kmp_hw_t type1 = ((const item_t *)i1)->type; + kmp_hw_t type2 = ((const item_t *)i2)->type; + int level1 = __kmp_topology->get_level(type1); + int level2 = __kmp_topology->get_level(type2); + return level1 - level2; + } public: // Force use of allocate()/deallocate() @@ -816,7 +926,20 @@ public: } void set_absolute() { absolute = true; } bool is_absolute() const { return absolute; } - void push_back(int num, kmp_hw_t type, int offset) { + void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) { + for (int i = 0; i < depth; ++i) { + // Found an existing item for this layer type + // Add the num, offset, and attr to this item + if (items[i].type == type) { + int idx = items[i].num_attrs++; + if ((size_t)idx >= MAX_ATTRS) + return; + items[i].num[idx] = num; + items[i].offset[idx] = offset; + items[i].attr[idx] = attr; + return; + } + } if (depth == capacity - 1) { capacity *= 2; item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity); @@ -825,9 +948,11 @@ public: __kmp_free(items); items = new_items; } - items[depth].num = num; + items[depth].num_attrs = 1; items[depth].type = type; - items[depth].offset = offset; + items[depth].num[0] = num; + items[depth].offset[0] = offset; + items[depth].attr[0] = attr; depth++; set |= (1ull << type); } @@ -848,6 +973,10 @@ public: } depth--; } + void sort() { + KMP_DEBUG_ASSERT(__kmp_topology); + qsort(items, depth, sizeof(item_t), hw_subset_compare); + } bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); } void dump() const { printf("**********************\n"); @@ -855,16 +984,25 @@ public: printf("* depth: %d\n", depth); printf("* items:\n"); for (int i = 0; i 
< depth; ++i) { - printf("num: %d, type: %s, offset: %d\n", items[i].num, - __kmp_hw_get_keyword(items[i].type), items[i].offset); + printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type)); + for (int j = 0; j < items[i].num_attrs; ++j) { + printf(" num: %d, offset: %d, attr: ", items[i].num[j], + items[i].offset[j]); + if (!items[i].attr[j]) { + printf(" (none)\n"); + } else { + printf( + " core_type = %s, core_eff = %d\n", + __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()), + items[i].attr[j].get_core_eff()); + } + } } printf("* set: 0x%llx\n", set); printf("* absolute: %d\n", absolute); printf("**********************\n"); } }; - -extern kmp_topology_t *__kmp_topology; extern kmp_hw_subset_t *__kmp_hw_subset; /* A structure for holding machine-specific hierarchy info to be computed once diff --git a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp index b373353ddd..120cad17c2 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp @@ -895,7 +895,7 @@ static void bpool(kmp_info_t *th, void *buf, bufsize len) { __kmp_bget_dequeue(th); /* Release any queued buffers */ #ifdef SizeQuant - len &= ~(SizeQuant - 1); + len &= ~((bufsize)(SizeQuant - 1)); #endif if (thr->pool_len == 0) { thr->pool_len = len; @@ -1496,31 +1496,74 @@ typedef struct kmp_mem_desc { // Memory block descriptor void *ptr_align; // Pointer to aligned memory, returned kmp_allocator_t *allocator; // allocator } kmp_mem_desc_t; -static int alignment = sizeof(void *); // let's align to pointer size +static int alignment = sizeof(void *); // align to pointer size by default +// external interfaces are wrappers over internal implementation void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator)); + void *ptr = __kmp_alloc(gtid, 0, size, allocator); + KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_aligned_alloc(int gtid, size_t algn, size_t size, + omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_aligned_alloc: T#%d (%d, %d, %p)\n", gtid, (int)algn, + (int)size, allocator)); + void *ptr = __kmp_alloc(gtid, algn, size, allocator); + KE_TRACE(25, ("__kmpc_aligned_alloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_calloc(int gtid, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb, + (int)size, allocator)); + void *ptr = __kmp_calloc(gtid, 0, nmemb, size, allocator); + KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_realloc(int gtid, void *ptr, size_t size, + omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator) { + KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size, + allocator, free_allocator)); + void *nptr = __kmp_realloc(gtid, ptr, size, allocator, free_allocator); + KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid)); + return nptr; +} + +void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator)); + ___kmpc_free(gtid, ptr, allocator); + KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, ptr, allocator)); + return; +} + +// internal implementation, called from inside the library +void *__kmp_alloc(int gtid, size_t algn, size_t size, + omp_allocator_handle_t allocator) 
{ void *ptr = NULL; kmp_allocator_t *al; KMP_DEBUG_ASSERT(__kmp_init_serial); - if (size == 0) return NULL; - if (allocator == omp_null_allocator) allocator = __kmp_threads[gtid]->th.th_def_allocator; - KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator)); - al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator)); + al = RCAST(kmp_allocator_t *, allocator); int sz_desc = sizeof(kmp_mem_desc_t); kmp_mem_desc_t desc; kmp_uintptr_t addr; // address returned by allocator kmp_uintptr_t addr_align; // address to return to caller kmp_uintptr_t addr_descr; // address of memory block descriptor - int align = alignment; // default alignment - if (allocator > kmp_max_mem_alloc && al->alignment > 0) { - align = al->alignment; // alignment requested by user - } + size_t align = alignment; // default alignment + if (allocator > kmp_max_mem_alloc && al->alignment > align) + align = al->alignment; // alignment required by allocator trait + if (align < algn) + align = algn; // max of allocator trait, parameter and sizeof(void*) desc.size_orig = size; desc.size_a = size + sz_desc + align; @@ -1549,7 +1592,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; - return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); } // else ptr == NULL; } else { // pool has enough space @@ -1563,7 +1606,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; - return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); } } } @@ -1579,7 +1622,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; - return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); } } } @@ -1635,7 +1678,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; - return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); } // else ptr == NULL; } else { // pool has enough space @@ -1651,7 +1694,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { KMP_ASSERT(0); // abort fallback requested } // no sense to look for another fallback because of same internal alloc } - KE_TRACE(10, ("__kmpc_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a)); + KE_TRACE(10, ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a)); if (ptr == NULL) return NULL; @@ -1665,12 +1708,11 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { *((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents KMP_MB(); - KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", desc.ptr_align, gtid)); return desc.ptr_align; } -void *__kmpc_calloc(int gtid, size_t nmemb, size_t size, - omp_allocator_handle_t allocator) { +void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { void *ptr = NULL; kmp_allocator_t *al; KMP_DEBUG_ASSERT(__kmp_init_serial); @@ -1678,10 +1720,7 @@ void 
*__kmpc_calloc(int gtid, size_t nmemb, size_t size, if (allocator == omp_null_allocator) allocator = __kmp_threads[gtid]->th.th_def_allocator; - KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb, - (int)size, allocator)); - - al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator)); + al = RCAST(kmp_allocator_t *, allocator); if (nmemb == 0 || size == 0) return ptr; @@ -1693,31 +1732,27 @@ void *__kmpc_calloc(int gtid, size_t nmemb, size_t size, return ptr; } - ptr = __kmpc_alloc(gtid, nmemb * size, allocator); + ptr = __kmp_alloc(gtid, algn, nmemb * size, allocator); if (ptr) { memset(ptr, 0x00, nmemb * size); } - KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid)); return ptr; } -void *__kmpc_realloc(int gtid, void *ptr, size_t size, - omp_allocator_handle_t allocator, - omp_allocator_handle_t free_allocator) { +void *__kmp_realloc(int gtid, void *ptr, size_t size, + omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator) { void *nptr = NULL; KMP_DEBUG_ASSERT(__kmp_init_serial); if (size == 0) { if (ptr != NULL) - __kmpc_free(gtid, ptr, free_allocator); + ___kmpc_free(gtid, ptr, free_allocator); return nptr; } - KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size, - allocator, free_allocator)); - - nptr = __kmpc_alloc(gtid, size, allocator); + nptr = __kmp_alloc(gtid, 0, size, allocator); if (nptr != NULL && ptr != NULL) { kmp_mem_desc_t desc; @@ -1736,15 +1771,13 @@ void *__kmpc_realloc(int gtid, void *ptr, size_t size, } if (nptr != NULL) { - __kmpc_free(gtid, ptr, free_allocator); + ___kmpc_free(gtid, ptr, free_allocator); } - KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid)); return nptr; } -void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) { - KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator)); +void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) { if (ptr == NULL) return; @@ -1804,8 +1837,6 @@ void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) { } __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc); } - KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, desc.ptr_alloc, - allocator)); } /* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes @@ -1939,9 +1970,10 @@ void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) { In debug mode, fill the memory block with 0xEF before call to free(). */ void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) { kmp_mem_descr_t descr; +#if KMP_DEBUG kmp_uintptr_t addr_allocated; // Address returned by malloc(). kmp_uintptr_t addr_aligned; // Aligned address passed by caller. 
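
The wrappers above make every external entry point funnel into __kmp_alloc/__kmp_calloc/__kmp_realloc/___kmpc_free, with the effective alignment taken as the maximum of the allocator's alignment trait, the per-call request, and sizeof(void *). A minimal user-level sketch of what ends up on this code path, assuming an omp.h that declares the OpenMP 5.1 allocator API; the 64/4096 figures and variable names are illustrative only:

#include <omp.h>
#include <cassert>
#include <cstdint>

int main() {
  // Allocator whose trait requests 64-byte alignment.
  omp_alloctrait_t traits[] = {{omp_atk_alignment, 64}};
  omp_allocator_handle_t al =
      omp_init_allocator(omp_default_mem_space, 1, traits);

  void *a = omp_alloc(1024, al);                // trait alignment (64) applies
  void *b = omp_aligned_alloc(4096, 4096, al);  // per-call 4096 beats the trait
  assert(reinterpret_cast<std::uintptr_t>(a) % 64 == 0);
  assert(reinterpret_cast<std::uintptr_t>(b) % 4096 == 0);

  omp_free(a, al);
  omp_free(b, al);
  omp_destroy_allocator(al);
  return 0;
}
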
- +#endif KE_TRACE(25, ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM)); KMP_ASSERT(ptr != NULL); @@ -1953,18 +1985,15 @@ void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) { "ptr_aligned=%p, size_aligned=%d\n", descr.ptr_allocated, (int)descr.size_allocated, descr.ptr_aligned, (int)descr.size_aligned)); - +#if KMP_DEBUG addr_allocated = (kmp_uintptr_t)descr.ptr_allocated; addr_aligned = (kmp_uintptr_t)descr.ptr_aligned; - KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0); KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr); KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned); KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated); KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <= addr_allocated + descr.size_allocated); - -#ifdef KMP_DEBUG memset(descr.ptr_allocated, 0xEF, descr.size_allocated); // Fill memory block with 0xEF, it helps catch using freed memory. #endif diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp index fcc06216a4..0bd7b1a41a 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp @@ -732,7 +732,7 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs, #define OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) \ __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ - (*lhs) = (TYPE)((*lhs)OP((TYPE)rhs)); \ + (*lhs) = (TYPE)((*lhs)OP rhs); \ __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); // ------------------------------------------------------------------------ @@ -791,14 +791,14 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs, { \ TYPE old_value, new_value; \ old_value = *(TYPE volatile *)lhs; \ - new_value = (TYPE)(old_value OP((TYPE)rhs)); \ + new_value = (TYPE)(old_value OP rhs); \ while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ KMP_DO_PAUSE; \ \ old_value = *(TYPE volatile *)lhs; \ - new_value = (TYPE)(old_value OP((TYPE)rhs)); \ + new_value = (TYPE)(old_value OP rhs); \ } \ } @@ -1235,6 +1235,12 @@ MIN_MAX_COMPXCHG(float8, max, kmp_real64, 64, <, 8r, 7, KMP_ARCH_X86) // __kmpc_atomic_float8_max MIN_MAX_COMPXCHG(float8, min, kmp_real64, 64, >, 8r, 7, KMP_ARCH_X86) // __kmpc_atomic_float8_min +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +MIN_MAX_CRITICAL(float10, max, long double, <, 10r, + 1) // __kmpc_atomic_float10_max +MIN_MAX_CRITICAL(float10, min, long double, >, 10r, + 1) // __kmpc_atomic_float10_min +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 #if KMP_HAVE_QUAD MIN_MAX_CRITICAL(float16, max, QUAD_LEGACY, <, 16r, 1) // __kmpc_atomic_float16_max @@ -1313,6 +1319,7 @@ ATOMIC_CMPX_EQV(fixed8, eqv, kmp_int64, 64, ^~, 8i, 7, } /* ------------------------------------------------------------------------- */ +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 // routines for long double type ATOMIC_CRITICAL(float10, add, long double, +, 10r, 1) // __kmpc_atomic_float10_add @@ -1322,6 +1329,7 @@ ATOMIC_CRITICAL(float10, mul, long double, *, 10r, 1) // __kmpc_atomic_float10_mul ATOMIC_CRITICAL(float10, div, long double, /, 10r, 1) // __kmpc_atomic_float10_div +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 #if KMP_HAVE_QUAD // routines for _Quad type ATOMIC_CRITICAL(float16, add, QUAD_LEGACY, +, 16r, @@ -1367,6 +1375,7 @@ ATOMIC_CRITICAL(cmplx8, add, kmp_cmplx64, +, 16c, 1) // __kmpc_atomic_cmplx8_add ATOMIC_CRITICAL(cmplx8, sub, kmp_cmplx64, -, 16c, 1) // __kmpc_atomic_cmplx8_sub ATOMIC_CRITICAL(cmplx8, mul, kmp_cmplx64, 
*, 16c, 1) // __kmpc_atomic_cmplx8_mul ATOMIC_CRITICAL(cmplx8, div, kmp_cmplx64, /, 16c, 1) // __kmpc_atomic_cmplx8_div +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 ATOMIC_CRITICAL(cmplx10, add, kmp_cmplx80, +, 20c, 1) // __kmpc_atomic_cmplx10_add ATOMIC_CRITICAL(cmplx10, sub, kmp_cmplx80, -, 20c, @@ -1375,6 +1384,7 @@ ATOMIC_CRITICAL(cmplx10, mul, kmp_cmplx80, *, 20c, 1) // __kmpc_atomic_cmplx10_mul ATOMIC_CRITICAL(cmplx10, div, kmp_cmplx80, /, 20c, 1) // __kmpc_atomic_cmplx10_div +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 #if KMP_HAVE_QUAD ATOMIC_CRITICAL(cmplx16, add, CPLX128_LEG, +, 32c, 1) // __kmpc_atomic_cmplx16_add @@ -1793,6 +1803,7 @@ ATOMIC_CMPXCHG_MIX(float8, kmp_real64, mul, 64, *, fp, _Quad, 8r, 7, ATOMIC_CMPXCHG_MIX(float8, kmp_real64, div, 64, /, fp, _Quad, 8r, 7, KMP_ARCH_X86) // __kmpc_atomic_float8_div_fp +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 ATOMIC_CRITICAL_FP(float10, long double, add, +, fp, _Quad, 10r, 1) // __kmpc_atomic_float10_add_fp ATOMIC_CRITICAL_FP(float10, long double, sub, -, fp, _Quad, 10r, @@ -1802,7 +1813,6 @@ ATOMIC_CRITICAL_FP(float10, long double, mul, *, fp, _Quad, 10r, ATOMIC_CRITICAL_FP(float10, long double, div, /, fp, _Quad, 10r, 1) // __kmpc_atomic_float10_div_fp -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 // Reverse operations ATOMIC_CMPXCHG_REV_MIX(fixed1, char, sub_rev, 8, -, fp, _Quad, 1i, 0, KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_rev_fp @@ -2717,6 +2727,10 @@ MIN_MAX_COMPXCHG_CPT(float8, max_cpt, kmp_real64, 64, <, KMP_ARCH_X86) // __kmpc_atomic_float8_max_cpt MIN_MAX_COMPXCHG_CPT(float8, min_cpt, kmp_real64, 64, >, KMP_ARCH_X86) // __kmpc_atomic_float8_min_cpt +MIN_MAX_CRITICAL_CPT(float10, max_cpt, long double, <, 10r, + 1) // __kmpc_atomic_float10_max_cpt +MIN_MAX_CRITICAL_CPT(float10, min_cpt, long double, >, 10r, + 1) // __kmpc_atomic_float10_min_cpt #if KMP_HAVE_QUAD MIN_MAX_CRITICAL_CPT(float16, max_cpt, QUAD_LEGACY, <, 16r, 1) // __kmpc_atomic_float16_max_cpt @@ -3586,7 +3600,7 @@ void __kmpc_atomic_8(ident_t *id_ref, int gtid, void *lhs, void *rhs, __kmp_release_atomic_lock(&__kmp_atomic_lock_8i, gtid); } } - +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs, void (*f)(void *, void *, void *)) { KMP_DEBUG_ASSERT(__kmp_init_serial); @@ -3607,6 +3621,7 @@ void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs, #endif /* KMP_GOMP_COMPAT */ __kmp_release_atomic_lock(&__kmp_atomic_lock_10r, gtid); } +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs, void (*f)(void *, void *, void *)) { @@ -3628,7 +3643,7 @@ void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs, #endif /* KMP_GOMP_COMPAT */ __kmp_release_atomic_lock(&__kmp_atomic_lock_16c, gtid); } - +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs, void (*f)(void *, void *, void *)) { KMP_DEBUG_ASSERT(__kmp_init_serial); @@ -3649,7 +3664,7 @@ void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs, #endif /* KMP_GOMP_COMPAT */ __kmp_release_atomic_lock(&__kmp_atomic_lock_20c, gtid); } - +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 void __kmpc_atomic_32(ident_t *id_ref, int gtid, void *lhs, void *rhs, void (*f)(void *, void *, void *)) { KMP_DEBUG_ASSERT(__kmp_init_serial); @@ -3686,6 +3701,171 @@ void __kmpc_atomic_end(void) { __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); } +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +// OpenMP 5.1 compare and swap + +/*! 
+@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@return Result of comparison + +Implements Compare And Swap atomic operation. + +Sample code: +#pragma omp atomic compare update capture + { r = x == e; if(r) { x = d; } } +*/ +bool __kmpc_atomic_bool_1_cas(ident_t *loc, int gtid, char *x, char e, char d) { + return KMP_COMPARE_AND_STORE_ACQ8(x, e, d); +} +bool __kmpc_atomic_bool_2_cas(ident_t *loc, int gtid, short *x, short e, + short d) { + return KMP_COMPARE_AND_STORE_ACQ16(x, e, d); +} +bool __kmpc_atomic_bool_4_cas(ident_t *loc, int gtid, kmp_int32 *x, kmp_int32 e, + kmp_int32 d) { + return KMP_COMPARE_AND_STORE_ACQ32(x, e, d); +} +bool __kmpc_atomic_bool_8_cas(ident_t *loc, int gtid, kmp_int64 *x, kmp_int64 e, + kmp_int64 d) { + return KMP_COMPARE_AND_STORE_ACQ64(x, e, d); +} + +/*! +@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@return Old value of x + +Implements Compare And Swap atomic operation. + +Sample code: +#pragma omp atomic compare update capture + { v = x; if (x == e) { x = d; } } +*/ +char __kmpc_atomic_val_1_cas(ident_t *loc, int gtid, char *x, char e, char d) { + return KMP_COMPARE_AND_STORE_RET8(x, e, d); +} +short __kmpc_atomic_val_2_cas(ident_t *loc, int gtid, short *x, short e, + short d) { + return KMP_COMPARE_AND_STORE_RET16(x, e, d); +} +kmp_int32 __kmpc_atomic_val_4_cas(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d) { + return KMP_COMPARE_AND_STORE_RET32(x, e, d); +} +kmp_int64 __kmpc_atomic_val_8_cas(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d) { + return KMP_COMPARE_AND_STORE_RET64(x, e, d); +} + +/*! +@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@param pv Captured value location +@return Result of comparison + +Implements Compare And Swap + Capture atomic operation. + +v gets old valie of x if comparison failed, untouched otherwise. +Sample code: +#pragma omp atomic compare update capture + { r = x == e; if(r) { x = d; } else { v = x; } } +*/ +bool __kmpc_atomic_bool_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv) { + char old = KMP_COMPARE_AND_STORE_RET8(x, e, d); + if (old == e) + return true; + KMP_ASSERT(pv != NULL); + *pv = old; + return false; +} +bool __kmpc_atomic_bool_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv) { + short old = KMP_COMPARE_AND_STORE_RET16(x, e, d); + if (old == e) + return true; + KMP_ASSERT(pv != NULL); + *pv = old; + return false; +} +bool __kmpc_atomic_bool_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv) { + kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(x, e, d); + if (old == e) + return true; + KMP_ASSERT(pv != NULL); + *pv = old; + return false; +} +bool __kmpc_atomic_bool_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv) { + kmp_int64 old = KMP_COMPARE_AND_STORE_RET64(x, e, d); + if (old == e) + return true; + KMP_ASSERT(pv != NULL); + *pv = old; + return false; +} + +/*! +@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@param pv Captured value location +@return Old value of x + +Implements Compare And Swap + Capture atomic operation. 
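
The sample pragmas quoted in these comments are real OpenMP 5.1 source forms. A self-contained sketch of one of them in context, assuming a compiler that already lowers atomic compare (recent clang with -fopenmp, possibly -fopenmp-version=51); only the winners counter and the initial values are illustrative:

#include <cstdio>

int main() {
  int x = 0;       // shared location, plays the role of a one-shot lock word
  int winners = 0; // how many threads saw their CAS succeed
#pragma omp parallel shared(x) reduction(+ : winners)
  {
    int e = 0, d = 1, r = 0;
    // Same shape as the sample above; a 5.1-aware compiler can emit a call to
    // __kmpc_atomic_bool_4_cas for it.
#pragma omp atomic compare capture
    { r = x == e; if (r) { x = d; } }
    if (r)
      winners++;
  }
  std::printf("x=%d winners=%d\n", x, winners); // x=1, winners=1
  return 0;
}
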
+ +v gets new valie of x. +Sample code: +#pragma omp atomic compare update capture + { if (x == e) { x = d; }; v = x; } +*/ +char __kmpc_atomic_val_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv) { + char old = KMP_COMPARE_AND_STORE_RET8(x, e, d); + KMP_ASSERT(pv != NULL); + *pv = old == e ? d : old; + return old; +} +short __kmpc_atomic_val_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv) { + short old = KMP_COMPARE_AND_STORE_RET16(x, e, d); + KMP_ASSERT(pv != NULL); + *pv = old == e ? d : old; + return old; +} +kmp_int32 __kmpc_atomic_val_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv) { + kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(x, e, d); + KMP_ASSERT(pv != NULL); + *pv = old == e ? d : old; + return old; +} +kmp_int64 __kmpc_atomic_val_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv) { + kmp_int64 old = KMP_COMPARE_AND_STORE_RET64(x, e, d); + KMP_ASSERT(pv != NULL); + *pv = old == e ? d : old; + return old; +} + +// End OpenMP 5.1 compare + capture +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 + /*! @} */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.h b/contrib/libs/cxxsupp/openmp/kmp_atomic.h index 6a0827aaf1..079b917285 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_atomic.h +++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.h @@ -578,6 +578,10 @@ void __kmpc_atomic_float8_max(ident_t *id_ref, int gtid, kmp_real64 *lhs, kmp_real64 rhs); void __kmpc_atomic_float8_min(ident_t *id_ref, int gtid, kmp_real64 *lhs, kmp_real64 rhs); +void __kmpc_atomic_float10_max(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +void __kmpc_atomic_float10_min(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); #if KMP_HAVE_QUAD void __kmpc_atomic_float16_max(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, QUAD_LEGACY rhs); @@ -1254,6 +1258,12 @@ kmp_real64 __kmpc_atomic_float8_max_cpt(ident_t *id_ref, int gtid, kmp_real64 __kmpc_atomic_float8_min_cpt(ident_t *id_ref, int gtid, kmp_real64 *lhs, kmp_real64 rhs, int flag); +long double __kmpc_atomic_float10_max_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +long double __kmpc_atomic_float10_min_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); #if KMP_HAVE_QUAD QUAD_LEGACY __kmpc_atomic_float16_max_cpt(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, QUAD_LEGACY rhs, @@ -1756,6 +1766,78 @@ long double __kmpc_atomic_float10_div_cpt_rev_fp(ident_t *id_ref, int gtid, // End of OpenMP 4.0 capture +// OpenMP 5.1 compare and swap +/* + __kmpc_atomic_bool_1_cas + __kmpc_atomic_bool_2_cas + __kmpc_atomic_bool_4_cas + __kmpc_atomic_bool_8_cas + __kmpc_atomic_val_1_cas + __kmpc_atomic_val_2_cas + __kmpc_atomic_val_4_cas + __kmpc_atomic_val_8_cas + __kmpc_atomic_bool_1_cas_cpt + __kmpc_atomic_bool_2_cas_cpt + __kmpc_atomic_bool_4_cas_cpt + __kmpc_atomic_bool_8_cas_cpt + __kmpc_atomic_val_1_cas_cpt + __kmpc_atomic_val_2_cas_cpt + __kmpc_atomic_val_4_cas_cpt + __kmpc_atomic_val_8_cas_cpt +*/ +// In all interfaces of CAS (Compare And Swap): +// r is the boolean result of comparison +// x is memory location to operate on +// e is expected (old) value +// d is desired (new) value +// pv is pointer to captured value v whose location may coincide with e + +// { r = x == e; if(r) { x = d; } } +// functions return result of comparison +bool __kmpc_atomic_bool_1_cas(ident_t *loc, int gtid, char *x, char e, char d); +bool __kmpc_atomic_bool_2_cas(ident_t *loc, int 
gtid, short *x, short e, + short d); +bool __kmpc_atomic_bool_4_cas(ident_t *loc, int gtid, kmp_int32 *x, kmp_int32 e, + kmp_int32 d); +bool __kmpc_atomic_bool_8_cas(ident_t *loc, int gtid, kmp_int64 *x, kmp_int64 e, + kmp_int64 d); + +// { v = x; if (x == e) { x = d; } } +// functions return old value +char __kmpc_atomic_val_1_cas(ident_t *loc, int gtid, char *x, char e, char d); +short __kmpc_atomic_val_2_cas(ident_t *loc, int gtid, short *x, short e, + short d); +kmp_int32 __kmpc_atomic_val_4_cas(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d); +kmp_int64 __kmpc_atomic_val_8_cas(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d); + +// { r = x == e; if(r) { x = d; } else { v = x; } } +// v gets old value if comparison failed, untouched otherwise +// functions return result of comparison +bool __kmpc_atomic_bool_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv); +bool __kmpc_atomic_bool_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv); +bool __kmpc_atomic_bool_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv); +bool __kmpc_atomic_bool_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv); + +// { if (x == e) { x = d; }; v = x; } +// v gets old value if comparison failed, new value otherwise +// functions return old value +char __kmpc_atomic_val_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv); +short __kmpc_atomic_val_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv); +kmp_int32 __kmpc_atomic_val_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv); +kmp_int64 __kmpc_atomic_val_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv); + +// End OpenMP 5.1 compare + capture + #endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 /* ------------------------------------------------------------------------ */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp index 93112156a1..ee05bb3587 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp @@ -10,12 +10,14 @@ // //===----------------------------------------------------------------------===// -#include "kmp.h" #include "kmp_wait_release.h" +#include "kmp_barrier.h" #include "kmp_itt.h" #include "kmp_os.h" #include "kmp_stats.h" #include "ompt-specific.h" +// for distributed barrier +#include "kmp_affinity.h" #if KMP_MIC #include <immintrin.h> @@ -38,6 +40,516 @@ void __kmp_print_structure(void); // Forward declaration // ---------------------------- Barrier Algorithms ---------------------------- +// Distributed barrier + +// Compute how many threads to have polling each cache-line. +// We want to limit the number of writes to IDEAL_GO_RESOLUTION. 
+void distributedBarrier::computeVarsForN(size_t n) { + int nsockets = 1; + if (__kmp_topology) { + int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET); + int core_level = __kmp_topology->get_level(KMP_HW_CORE); + int ncores_per_socket = + __kmp_topology->calculate_ratio(core_level, socket_level); + nsockets = __kmp_topology->get_count(socket_level); + + if (nsockets <= 0) + nsockets = 1; + if (ncores_per_socket <= 0) + ncores_per_socket = 1; + + threads_per_go = ncores_per_socket >> 1; + if (!fix_threads_per_go) { + // Minimize num_gos + if (threads_per_go > 4) { + if (KMP_OPTIMIZE_FOR_REDUCTIONS) { + threads_per_go = threads_per_go >> 1; + } + if (threads_per_go > 4 && nsockets == 1) + threads_per_go = threads_per_go >> 1; + } + } + if (threads_per_go == 0) + threads_per_go = 1; + fix_threads_per_go = true; + num_gos = n / threads_per_go; + if (n % threads_per_go) + num_gos++; + if (nsockets == 1 || num_gos == 1) + num_groups = 1; + else { + num_groups = num_gos / nsockets; + if (num_gos % nsockets) + num_groups++; + } + if (num_groups <= 0) + num_groups = 1; + gos_per_group = num_gos / num_groups; + if (num_gos % num_groups) + gos_per_group++; + threads_per_group = threads_per_go * gos_per_group; + } else { + num_gos = n / threads_per_go; + if (n % threads_per_go) + num_gos++; + if (num_gos == 1) + num_groups = 1; + else { + num_groups = num_gos / 2; + if (num_gos % 2) + num_groups++; + } + gos_per_group = num_gos / num_groups; + if (num_gos % num_groups) + gos_per_group++; + threads_per_group = threads_per_go * gos_per_group; + } +} + +void distributedBarrier::computeGo(size_t n) { + // Minimize num_gos + for (num_gos = 1;; num_gos++) + if (IDEAL_CONTENTION * num_gos >= n) + break; + threads_per_go = n / num_gos; + if (n % num_gos) + threads_per_go++; + while (num_gos > MAX_GOS) { + threads_per_go++; + num_gos = n / threads_per_go; + if (n % threads_per_go) + num_gos++; + } + computeVarsForN(n); +} + +// This function is to resize the barrier arrays when the new number of threads +// exceeds max_threads, which is the current size of all the arrays +void distributedBarrier::resize(size_t nthr) { + KMP_DEBUG_ASSERT(nthr > max_threads); + + // expand to requested size * 2 + max_threads = nthr * 2; + + // allocate arrays to new max threads + for (int i = 0; i < MAX_ITERS; ++i) { + if (flags[i]) + flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i], + max_threads * sizeof(flags_s)); + else + flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s)); + } + + if (go) + go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s)); + else + go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s)); + + if (iter) + iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s)); + else + iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s)); + + if (sleep) + sleep = + (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s)); + else + sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s)); +} + +// This function is to set all the go flags that threads might be waiting +// on, and when blocktime is not infinite, it should be followed by a wake-up +// call to each thread +kmp_uint64 distributedBarrier::go_release() { + kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS; + for (size_t j = 0; j < num_gos; j++) { + go[j].go.store(next_go); + } + return next_go; +} + +void distributedBarrier::go_reset() { + for (size_t j = 0; j < max_threads; ++j) { + for (size_t i = 0; i < distributedBarrier::MAX_ITERS; 
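
To see what the sizing loops in computeGo settle on, here is a standalone re-statement of that arithmetic with the kmp_barrier.h constants (IDEAL_CONTENTION = 16, MAX_GOS = 8). It is an illustration only, not the library code path, and the sample thread counts are arbitrary:

#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t IDEAL_CONTENTION = 16, MAX_GOS = 8;
  const std::size_t samples[] = {64, 200};
  for (std::size_t n : samples) {
    std::size_t num_gos = 1;
    while (IDEAL_CONTENTION * num_gos < n) // minimize the number of go flags
      ++num_gos;
    std::size_t threads_per_go = n / num_gos + (n % num_gos ? 1 : 0);
    while (num_gos > MAX_GOS) {            // but never more than MAX_GOS flags
      ++threads_per_go;
      num_gos = n / threads_per_go + (n % threads_per_go ? 1 : 0);
    }
    std::printf("n=%zu -> num_gos=%zu, threads_per_go=%zu\n", n, num_gos,
                threads_per_go);
  }
  return 0; // prints: n=64 -> 4 gos of 16 threads; n=200 -> 8 gos of 25
}
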
++i) { + flags[i][j].stillNeed = 1; + } + go[j].go.store(0); + iter[j].iter = 0; + } +} + +// This function inits/re-inits the distributed barrier for a particular number +// of threads. If a resize of arrays is needed, it calls the resize function. +void distributedBarrier::init(size_t nthr) { + size_t old_max = max_threads; + if (nthr > max_threads) { // need more space in arrays + resize(nthr); + } + + for (size_t i = 0; i < max_threads; i++) { + for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) { + flags[j][i].stillNeed = 1; + } + go[i].go.store(0); + iter[i].iter = 0; + if (i >= old_max) + sleep[i].sleep = false; + } + + // Recalculate num_gos, etc. based on new nthr + computeVarsForN(nthr); + + num_threads = nthr; + + if (team_icvs == NULL) + team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t)); +} + +// This function is used only when KMP_BLOCKTIME is not infinite. +// static +void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team, + size_t start, size_t stop, size_t inc, + size_t tid) { + KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME); + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + kmp_info_t **other_threads = team->t.t_threads; + for (size_t thr = start; thr < stop; thr += inc) { + KMP_DEBUG_ASSERT(other_threads[thr]); + int gtid = other_threads[thr]->th.th_info.ds.ds_gtid; + // Wake up worker regardless of if it appears to be sleeping or not + __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL); + } +} + +static void __kmp_dist_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather); + kmp_team_t *team; + distributedBarrier *b; + kmp_info_t **other_threads; + kmp_uint64 my_current_iter, my_next_iter; + kmp_uint32 nproc; + bool group_leader; + + team = this_thr->th.th_team; + nproc = this_thr->th.th_team_nproc; + other_threads = team->t.t_threads; + b = team->t.b; + my_current_iter = b->iter[tid].iter; + my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS; + group_leader = ((tid % b->threads_per_group) == 0); + + KA_TRACE(20, + ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = + __itt_get_timestamp(); + } +#endif + + if (group_leader) { + // Start from the thread after the group leader + size_t group_start = tid + 1; + size_t group_end = tid + b->threads_per_group; + size_t threads_pending = 0; + + if (group_end > nproc) + group_end = nproc; + do { // wait for threads in my group + threads_pending = 0; + // Check all the flags every time to avoid branch misspredict + for (size_t thr = group_start; thr < group_end; thr++) { + // Each thread uses a different cache line + threads_pending += b->flags[my_current_iter][thr].stillNeed; + } + // Execute tasks here + if (__kmp_tasking_mode != tskm_immediate_exec) { + kmp_task_team_t *task_team = this_thr->th.th_task_team; + if (task_team != NULL) { + if (TCR_SYNC_4(task_team->tt.tt_active)) { + if (KMP_TASKING_ENABLED(task_team)) { + int tasks_completed = FALSE; + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } 
else + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } + } else { + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } // if + } + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } else if (__kmp_tasking_mode != tskm_immediate_exec && + this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { + this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; + } + } while (threads_pending > 0); + + if (reduce) { // Perform reduction if needed + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + // Group leader reduces all threads in group + for (size_t thr = group_start; thr < group_end; thr++) { + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[thr]->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + + // Set flag for next iteration + b->flags[my_next_iter][tid].stillNeed = 1; + // Each thread uses a different cache line; resets stillNeed to 0 to + // indicate it has reached the barrier + b->flags[my_current_iter][tid].stillNeed = 0; + + do { // wait for all group leaders + threads_pending = 0; + for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) { + threads_pending += b->flags[my_current_iter][thr].stillNeed; + } + // Execute tasks here + if (__kmp_tasking_mode != tskm_immediate_exec) { + kmp_task_team_t *task_team = this_thr->th.th_task_team; + if (task_team != NULL) { + if (TCR_SYNC_4(task_team->tt.tt_active)) { + if (KMP_TASKING_ENABLED(task_team)) { + int tasks_completed = FALSE; + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } else + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } + } else { + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } // if + } + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } else if (__kmp_tasking_mode != tskm_immediate_exec && + this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { + this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; + } + } while (threads_pending > 0); + + if (reduce) { // Perform reduction if needed + if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + for (size_t thr = b->threads_per_group; thr < nproc; + thr += b->threads_per_group) { + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[thr]->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + } + } else { + // Set flag for next iteration + b->flags[my_next_iter][tid].stillNeed = 1; + // Each thread uses a different cache line; resets stillNeed to 0 to + // indicate it has reached the barrier + b->flags[my_current_iter][tid].stillNeed = 0; + } + + KMP_MFENCE(); + + KA_TRACE(20, + ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void __kmp_dist_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release); + kmp_team_t *team; + distributedBarrier *b; + kmp_bstate_t *thr_bar; + kmp_uint64 my_current_iter, next_go; + size_t my_go_index; + bool group_leader; + + KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n", + gtid, tid, bt)); + + thr_bar = &this_thr->th.th_bar[bt].bb; + + if (!KMP_MASTER_TID(tid)) { + // workers and non-master group leaders need to check their presence in team + do { + if (this_thr->th.th_used_in_team.load() != 1 && + 
this_thr->th.th_used_in_team.load() != 3) { + // Thread is not in use in a team. Wait on location in tid's thread + // struct. The 0 value tells anyone looking that this thread is spinning + // or sleeping until this location becomes 3 again; 3 is the transition + // state to get to 1 which is waiting on go and being in the team + kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3); + if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2, + 0) || + this_thr->th.th_used_in_team.load() == 0) { + my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); + } +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably + itt_sync_obj = + __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + } + if (this_thr->th.th_used_in_team.load() != 1 && + this_thr->th.th_used_in_team.load() != 3) // spurious wake-up? + continue; + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // At this point, the thread thinks it is in use in a team, or in + // transition to be used in a team, but it might have reached this barrier + // before it was marked unused by the team. Unused threads are awoken and + // shifted to wait on local thread struct elsewhere. It also might reach + // this point by being picked up for use by a different team. Either way, + // we need to update the tid. + tid = __kmp_tid_from_gtid(gtid); + team = this_thr->th.th_team; + KMP_DEBUG_ASSERT(tid >= 0); + KMP_DEBUG_ASSERT(team); + b = team->t.b; + my_current_iter = b->iter[tid].iter; + next_go = my_current_iter + distributedBarrier::MAX_ITERS; + my_go_index = tid / b->threads_per_go; + if (this_thr->th.th_used_in_team.load() == 3) { + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1); + } + // Check if go flag is set + if (b->go[my_go_index].go.load() != next_go) { + // Wait on go flag on team + kmp_atomic_flag_64<false, true> my_flag( + &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep)); + my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); + KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter || + b->iter[tid].iter == 0); + KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false); + } + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + // At this point, the thread's go location was set. This means the primary + // thread is safely in the barrier, and so this thread's data is + // up-to-date, but we should check again that this thread is really in + // use in the team, as it could have been woken up for the purpose of + // changing team size, or reaping threads at shutdown. + if (this_thr->th.th_used_in_team.load() == 1) + break; + } while (1); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + group_leader = ((tid % b->threads_per_group) == 0); + if (group_leader) { + // Tell all the threads in my group they can go! 
+ for (size_t go_idx = my_go_index + 1; + go_idx < my_go_index + b->gos_per_group; go_idx++) { + b->go[go_idx].go.store(next_go); + } + // Fence added so that workers can see changes to go. sfence inadequate. + KMP_MFENCE(); + } + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { // copy ICVs to final dest + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, + tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + (kmp_internal_control_t *)team->t.b->team_icvs); + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) { + // This thread is now awake and participating in the barrier; + // wake up the other threads in the group + size_t nproc = this_thr->th.th_team_nproc; + size_t group_end = tid + b->threads_per_group; + if (nproc < group_end) + group_end = nproc; + __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid); + } + } else { // Primary thread + team = this_thr->th.th_team; + b = team->t.b; + my_current_iter = b->iter[tid].iter; + next_go = my_current_iter + distributedBarrier::MAX_ITERS; +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + // primary thread has ICVs in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + // Tell all the group leaders they can go! + for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) { + b->go[go_idx].go.store(next_go); + } + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + // Wake-up the group leaders + size_t nproc = this_thr->th.th_team_nproc; + __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc, + b->threads_per_group, tid); + } + + // Tell all the threads in my group they can go! + for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) { + b->go[go_idx].go.store(next_go); + } + + // Fence added so that workers can see changes to go. sfence inadequate. 
+ KMP_MFENCE(); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + // Wake-up the other threads in my group + size_t nproc = this_thr->th.th_team_nproc; + size_t group_end = tid + b->threads_per_group; + if (nproc < group_end) + group_end = nproc; + __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid); + } + } + // Update to next iteration + KMP_ASSERT(my_current_iter == b->iter[tid].iter); + b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS; + + KA_TRACE( + 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} // Linear Barrier template <bool cancellable = false> @@ -1354,6 +1866,11 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); } else { switch (__kmp_barrier_gather_pattern[bt]) { + case bp_dist_bar: { + __kmp_dist_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { // don't set branch bits to 0; use linear KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); @@ -1467,6 +1984,12 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); } else { switch (__kmp_barrier_release_pattern[bt]) { + case bp_dist_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_dist_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, @@ -1514,8 +2037,10 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, } #endif - KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == - TRUE); + KMP_DEBUG_ASSERT( + this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE || + this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered == + TRUE); __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); __kmp_task_team_setup(this_thr, team, 0); @@ -1596,6 +2121,11 @@ void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { if (!team->t.t_serialized) { if (KMP_MASTER_GTID(gtid)) { switch (__kmp_barrier_release_pattern[bt]) { + case bp_dist_bar: { + __kmp_dist_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(NULL)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, @@ -1634,7 +2164,6 @@ void __kmp_join_barrier(int gtid) { kmp_info_t *this_thr = __kmp_threads[gtid]; kmp_team_t *team; kmp_uint nproc; - kmp_info_t *master_thread; int tid; #ifdef KMP_DEBUG int team_id; @@ -1656,9 +2185,7 @@ void __kmp_join_barrier(int gtid) { tid = __kmp_tid_from_gtid(gtid); #ifdef KMP_DEBUG team_id = team->t.t_id; -#endif /* KMP_DEBUG */ - master_thread = this_thr->th.th_team_master; -#ifdef KMP_DEBUG + kmp_info_t *master_thread = this_thr->th.th_team_master; if (master_thread != team->t.t_threads[0]) { __kmp_print_structure(); } @@ -1705,8 +2232,8 @@ void __kmp_join_barrier(int gtid) { if (__kmp_tasking_mode == tskm_extra_barrier) { __kmp_tasking_barrier(team, this_thr, gtid); - KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, - team_id, tid)); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", + gtid, team_id, tid)); } #ifdef KMP_DEBUG if (__kmp_tasking_mode != tskm_immediate_exec) { @@ -1715,8 
+2242,9 @@ void __kmp_join_barrier(int gtid) { __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state], this_thr->th.th_task_team)); - KMP_DEBUG_ASSERT(this_thr->th.th_task_team == - team->t.t_task_team[this_thr->th.th_task_state]); + if (this_thr->th.th_task_team) + KMP_DEBUG_ASSERT(this_thr->th.th_task_team == + team->t.t_task_team[this_thr->th.th_task_state]); } #endif /* KMP_DEBUG */ @@ -1742,6 +2270,11 @@ void __kmp_join_barrier(int gtid) { #endif /* USE_ITT_BUILD */ switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { + case bp_dist_bar: { + __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, @@ -1787,8 +2320,7 @@ void __kmp_join_barrier(int gtid) { team_thread->th.th_stats->setIdleFlag(); if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL) - __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), - team_thread->th.th_sleep_loc); + __kmp_null_resume_wrapper(team_thread); } #endif #if USE_ITT_BUILD @@ -1806,8 +2338,6 @@ void __kmp_join_barrier(int gtid) { kmp_uint64 cur_time = __itt_get_timestamp(); ident_t *loc = team->t.t_ident; kmp_info_t **other_threads = team->t.t_threads; - int nproc = this_thr->th.th_team_nproc; - int i; switch (__kmp_forkjoin_frames_mode) { case 1: __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, @@ -1824,7 +2354,7 @@ void __kmp_join_barrier(int gtid) { // Set arrive time to zero to be able to check it in // __kmp_invoke_task(); the same is done inside the loop below this_thr->th.th_bar_arrive_time = 0; - for (i = 1; i < nproc; ++i) { + for (kmp_uint i = 1; i < nproc; ++i) { delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); other_threads[i]->th.th_bar_arrive_time = 0; } @@ -1933,6 +2463,11 @@ void __kmp_fork_barrier(int gtid, int tid) { } // primary thread switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { + case bp_dist_bar: { + __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(NULL)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.h b/contrib/libs/cxxsupp/openmp/kmp_barrier.h new file mode 100644 index 0000000000..ac28a13217 --- /dev/null +++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.h @@ -0,0 +1,141 @@ +/* + * kmp_barrier.h + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_BARRIER_H +#define KMP_BARRIER_H + +#include "kmp.h" +#include "kmp_i18n.h" + +#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC +#include <xmmintrin.h> +#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment) +#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr) +#elif KMP_HAVE_ALIGNED_ALLOC +#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size) +#define KMP_ALIGNED_FREE(ptr) free(ptr) +#elif KMP_HAVE_POSIX_MEMALIGN +static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) { + void *ptr; + int n = posix_memalign(&ptr, alignment, size); + if (n != 0) { + if (ptr) + free(ptr); + return nullptr; + } + return ptr; +} +#define KMP_ALIGNED_FREE(ptr) free(ptr) +#elif KMP_HAVE__ALIGNED_MALLOC +#include <malloc.h> +#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment) +#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr) +#else +#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size) +#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr) +#endif + +// Use four cache lines: MLC tends to prefetch the next or previous cache line +// creating a possible fake conflict between cores, so this is the only way to +// guarantee that no such prefetch can happen. +#ifndef KMP_FOURLINE_ALIGN_CACHE +#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE) +#endif + +#define KMP_OPTIMIZE_FOR_REDUCTIONS 0 + +class distributedBarrier { + struct flags_s { + kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed; + }; + + struct go_s { + std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go; + }; + + struct iter_s { + kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter; + }; + + struct sleep_s { + std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep; + }; + + void init(size_t nthr); + void resize(size_t nthr); + void computeGo(size_t n); + void computeVarsForN(size_t n); + +public: + enum { + MAX_ITERS = 3, + MAX_GOS = 8, + IDEAL_GOS = 4, + IDEAL_CONTENTION = 16, + }; + + flags_s *flags[MAX_ITERS]; + go_s *go; + iter_s *iter; + sleep_s *sleep; + + size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier + size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure + // number of go signals each requiring one write per iteration + size_t KMP_ALIGN_CACHE num_gos; + // number of groups of gos + size_t KMP_ALIGN_CACHE num_groups; + // threads per go signal + size_t KMP_ALIGN_CACHE threads_per_go; + bool KMP_ALIGN_CACHE fix_threads_per_go; + // threads per group + size_t KMP_ALIGN_CACHE threads_per_group; + // number of go signals in a group + size_t KMP_ALIGN_CACHE gos_per_group; + void *team_icvs; + + distributedBarrier() = delete; + ~distributedBarrier() = delete; + + // Used instead of constructor to create aligned data + static distributedBarrier *allocate(int nThreads) { + distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE( + sizeof(distributedBarrier), 4 * CACHE_LINE); + if (!d) { + KMP_FATAL(MemoryAllocFailed); + } + d->num_threads = 0; + d->max_threads = 0; + for (int i = 0; i < MAX_ITERS; ++i) + d->flags[i] = NULL; + d->go = NULL; + d->iter = NULL; + d->sleep = NULL; + d->team_icvs = NULL; + d->fix_threads_per_go = false; + // calculate gos and groups ONCE on base size + d->computeGo(nThreads); + d->init(nThreads); + return d; + } + + static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); } + + void update_num_threads(size_t 
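
Each per-thread slot declared further down (flags_s, go_s, iter_s, sleep_s) is padded to four cache lines and the whole barrier object is carved out with KMP_ALIGNED_ALLOCATE. A hedged illustration of what that padding means for the layout; CACHE_LINE = 64 is an x86 assumption here, not a value read from kmp_config.h, and go_slot merely stands in for go_s:

#include <atomic>
#include <cstddef>
#include <cstdint>

constexpr std::size_t CACHE_LINE = 64;

// Stand-in for distributedBarrier::go_s with KMP_FOURLINE_ALIGN_CACHE applied.
struct alignas(4 * CACHE_LINE) go_slot {
  std::atomic<std::uint64_t> go; // one writer releases, a few threads spin here
};

// One slot owns four whole lines, so even the adjacent-line hardware prefetch
// described in the comment above can never touch a neighbouring thread's slot.
static_assert(sizeof(go_slot) == 4 * CACHE_LINE, "slot spans four lines");
static_assert(alignof(go_slot) == 4 * CACHE_LINE, "slots start on a boundary");

int main() {
  static go_slot slots[4];
  slots[0].go.store(1, std::memory_order_relaxed);
  return 0;
}
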
nthr) { init(nthr); } + + bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); } + size_t get_num_threads() { return num_threads; } + kmp_uint64 go_release(); + void go_reset(); +}; + +#endif // KMP_BARRIER_H diff --git a/contrib/libs/cxxsupp/openmp/kmp_config.h b/contrib/libs/cxxsupp/openmp/kmp_config.h index 81314ed20a..2f7a7f9320 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_config.h +++ b/contrib/libs/cxxsupp/openmp/kmp_config.h @@ -80,6 +80,16 @@ #define KMP_HAVE_ATTRIBUTE_RTM LIBOMP_HAVE_ATTRIBUTE_RTM #define LIBOMP_ARCH_AARCH64_A64FX 0 #define KMP_ARCH_AARCH64_A64FX LIBOMP_ARCH_AARCH64_A64FX +#define LIBOMP_HAVE_XMMINTRIN_H 1 +#define KMP_HAVE_XMMINTRIN_H LIBOMP_HAVE_XMMINTRIN_H +#define LIBOMP_HAVE__MM_MALLOC 1 +#define KMP_HAVE__MM_MALLOC LIBOMP_HAVE__MM_MALLOC +#define LIBOMP_HAVE_ALIGNED_ALLOC 1 +#define KMP_HAVE_ALIGNED_ALLOC LIBOMP_HAVE_ALIGNED_ALLOC +#define LIBOMP_HAVE_POSIX_MEMALIGN 1 +#define KMP_HAVE_POSIX_MEMALIGN LIBOMP_HAVE_POSIX_MEMALIGN +#define LIBOMP_HAVE__ALIGNED_MALLOC 0 +#define KMP_HAVE__ALIGNED_MALLOC LIBOMP_HAVE__ALIGNED_MALLOC // Configured cache line based on architecture #if KMP_ARCH_PPC64 @@ -119,4 +129,9 @@ # define KMP_GOMP_COMPAT #endif +// use shared memory with dynamic library (except Android, where shm_* +// functions don't exist). +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !__ANDROID__ +#define KMP_USE_SHM +#endif #endif // KMP_CONFIG_H diff --git a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp index 2a7c9a8cb2..e263558517 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp @@ -288,15 +288,7 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) { ompt_frame_t *ompt_frame; if (ompt_enabled.enabled) { kmp_info_t *master_th = __kmp_threads[gtid]; - kmp_team_t *parent_team = master_th->th.th_team; - ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info; - if (lwt) - ompt_frame = &(lwt->ompt_task_info.frame); - else { - int tid = __kmp_tid_from_gtid(gtid); - ompt_frame = &( - parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame); - } + ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame; ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } OMPT_STORE_RETURN_ADDRESS(gtid); @@ -320,6 +312,12 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 
{ ); va_end(ap); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif } #if KMP_STATS_ENABLED @@ -533,7 +531,8 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { kmp_task_team_t *task_team = this_thr->th.th_task_team; // we need to wait for the proxy tasks before finishing the thread - if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) + if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered)) __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL)); KMP_MB(); @@ -578,9 +577,6 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { __kmp_free(top); } - // if( serial_team -> t.t_serialized > 1 ) - serial_team->t.t_level--; - /* pop dispatch buffers stack */ KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); { @@ -605,6 +601,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + __kmp_pop_current_task_from_thread(this_thr); #if OMPD_SUPPORT if (ompd_state & OMPD_ENABLE_BP) ompd_bp_parallel_end(); @@ -623,8 +620,6 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { this_thr->th.th_dispatch = &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid]; - __kmp_pop_current_task_from_thread(this_thr); - KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0); this_thr->th.th_current_task->td_flags.executing = 1; @@ -645,6 +640,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { } } + serial_team->t.t_level--; if (__kmp_env_consistency_check) __kmp_pop_parallel(global_tid, NULL); #if OMPT_SUPPORT @@ -686,7 +682,7 @@ void __kmpc_flush(ident_t *loc) { if (!__kmp_cpuinfo.initialized) { __kmp_query_cpuid(&__kmp_cpuinfo); } - if (!__kmp_cpuinfo.sse2) { + if (!__kmp_cpuinfo.flags.sse2) { // CPU cannot execute SSE2 instructions. 
} else { #if KMP_COMPILER_ICC @@ -1359,7 +1355,7 @@ static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 -#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) +#define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm) #else #define KMP_CPUINFO_RTM 0 #endif @@ -4330,24 +4326,35 @@ void __kmpc_doacross_fini(ident_t *loc, int gtid) { KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid)); } -/* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */ +/* OpenMP 5.1 Memory Management routines */ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { - return __kmpc_alloc(__kmp_entry_gtid(), size, allocator); + return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator); +} + +void *omp_aligned_alloc(size_t align, size_t size, + omp_allocator_handle_t allocator) { + return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator); } void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) { - return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator); + return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator); +} + +void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { + return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator); } void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, omp_allocator_handle_t free_allocator) { - return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator, + return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator, free_allocator); } void omp_free(void *ptr, omp_allocator_handle_t allocator) { - __kmpc_free(__kmp_entry_gtid(), ptr, allocator); + ___kmpc_free(__kmp_entry_gtid(), ptr, allocator); } +/* end of OpenMP 5.1 Memory Management routines */ int __kmpc_get_target_offload(void) { if (!__kmp_init_serial) { @@ -4395,6 +4402,38 @@ void __kmpc_error(ident_t *loc, int severity, const char *message) { __kmp_str_free(&src_loc); } +// Mark begin of scope directive. +void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) { +// reserved is for extension of scope directive and not used. +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) { + kmp_team_t *team = __kmp_threads[gtid]->th.th_team; + int tid = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_scope, ompt_scope_begin, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL +} + +// Mark end of scope directive +void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) { +// reserved is for extension of scope directive and not used. +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) { + kmp_team_t *team = __kmp_threads[gtid]->th.th_team; + int tid = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_scope, ompt_scope_end, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL +} + #ifdef KMP_USE_VERSION_SYMBOLS // For GOMP compatibility there are two versions of each omp_* API. 
// One is the plain C symbol and one is the Fortran symbol with an appended diff --git a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp index 108384e1cc..f3407bf889 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp @@ -72,8 +72,8 @@ void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, bool use_hier = false) { // Pick up the nonmonotonic/monotonic bits from the scheduling type - // TODO: make nonmonotonic when static_steal is fixed - int monotonicity = SCHEDULE_MONOTONIC; + // Nonmonotonic as default for dynamic schedule when no modifier is specified + int monotonicity = SCHEDULE_NONMONOTONIC; // Let default be monotonic for executables // compiled with OpenMP* 4.5 or less compilers @@ -561,6 +561,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, _control87(_PC_64, _MCW_PC); // 0,0x30000 #endif /* value used for comparison in solver for cross-over point */ + KMP_ASSERT(tc > 0); long double target = ((long double)chunk * 2 + 1) * nproc / tc; /* crossover point--chunk indexes equal to or greater than @@ -668,6 +669,8 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, case kmp_sch_static_chunked: case kmp_sch_dynamic_chunked: dynamic_init: + if (tc == 0) + break; if (pr->u.p.parm1 <= 0) pr->u.p.parm1 = KMP_DEFAULT_CHUNK; else if (pr->u.p.parm1 > tc) @@ -1713,7 +1716,7 @@ int __kmp_dispatch_next_algorithm(int gtid, status = 0; // nothing to do, don't try atomic op break; } - KMP_DEBUG_ASSERT(init % chunk == 0); + KMP_DEBUG_ASSERT(chunk && init % chunk == 0); // compare with K*nproc*(chunk+1), K=2 by default if ((T)remaining < pr->u.p.parm2) { // use dynamic-style schedule @@ -2652,9 +2655,11 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, kmp_uint32 spins; kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; kmp_uint32 r; + kmp_uint64 time; KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); // main wait spin loop while (!f(r = TCR_4(*spin), check)) { KMP_FSYNC_SPIN_PREPARE(obj); @@ -2662,7 +2667,7 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, split. 
It causes problems with infinite recursion because of exit lock */ /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) __kmp_abort_thread(); */ - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } KMP_FSYNC_SPIN_ACQUIRED(obj); return r; @@ -2677,15 +2682,17 @@ void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, kmp_uint32 check = checker; kmp_uint32 spins; kmp_uint32 (*f)(void *, kmp_uint32) = pred; + kmp_uint64 time; KMP_FSYNC_SPIN_INIT(obj, spin); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); // main wait spin loop while (!f(spin, check)) { KMP_FSYNC_SPIN_PREPARE(obj); /* if we have waited a bit, or are noversubscribed, yield */ /* pause is in the following code */ - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } KMP_FSYNC_SPIN_ACQUIRED(obj); } diff --git a/contrib/libs/cxxsupp/openmp/kmp_dispatch.h b/contrib/libs/cxxsupp/openmp/kmp_dispatch.h index ae11361ca5..154db17461 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_dispatch.h +++ b/contrib/libs/cxxsupp/openmp/kmp_dispatch.h @@ -292,10 +292,12 @@ static UT __kmp_wait(volatile UT *spinner, UT checker, UT check = checker; kmp_uint32 spins; kmp_uint32 (*f)(UT, UT) = pred; + kmp_uint64 time; UT r; KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin)); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); // main wait spin loop while (!f(r = *spin, check)) { KMP_FSYNC_SPIN_PREPARE(obj); @@ -305,7 +307,7 @@ static UT __kmp_wait(volatile UT *spinner, UT checker, /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) __kmp_abort_thread(); */ // If oversubscribed, or have waited a bit then yield. - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } KMP_FSYNC_SPIN_ACQUIRED(obj); return r; diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h index 30c967af3d..bf9ebf9b2e 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h +++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h @@ -1446,6 +1446,120 @@ int FTN_STDCALL FTN_GET_TEAMS_THREAD_LIMIT(void) { #endif } +/// TODO: Include the `omp.h` of the current build +/* OpenMP 5.1 interop */ +typedef intptr_t omp_intptr_t; + +/* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined + * properties */ +typedef enum omp_interop_property { + omp_ipr_fr_id = -1, + omp_ipr_fr_name = -2, + omp_ipr_vendor = -3, + omp_ipr_vendor_name = -4, + omp_ipr_device_num = -5, + omp_ipr_platform = -6, + omp_ipr_device = -7, + omp_ipr_device_context = -8, + omp_ipr_targetsync = -9, + omp_ipr_first = -9 +} omp_interop_property_t; + +#define omp_interop_none 0 + +typedef enum omp_interop_rc { + omp_irc_no_value = 1, + omp_irc_success = 0, + omp_irc_empty = -1, + omp_irc_out_of_range = -2, + omp_irc_type_int = -3, + omp_irc_type_ptr = -4, + omp_irc_type_str = -5, + omp_irc_other = -6 +} omp_interop_rc_t; + +typedef enum omp_interop_fr { + omp_ifr_cuda = 1, + omp_ifr_cuda_driver = 2, + omp_ifr_opencl = 3, + omp_ifr_sycl = 4, + omp_ifr_hip = 5, + omp_ifr_level_zero = 6, + omp_ifr_last = 7 +} omp_interop_fr_t; + +typedef void *omp_interop_t; + +// libomptarget, if loaded, provides this function +int FTN_STDCALL FTN_GET_NUM_INTEROP_PROPERTIES(const omp_interop_t interop) { +#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB) + return 0; +#else + int (*fptr)(const omp_interop_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_num_interop_properties"))) + return (*fptr)(interop); + return 0; +#endif // KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || 
defined(KMP_STUB) +} + +/// TODO Convert FTN_GET_INTEROP_XXX functions into a macro like interop.cpp +// libomptarget, if loaded, provides this function +intptr_t FTN_STDCALL FTN_GET_INTEROP_INT(const omp_interop_t interop, + omp_interop_property_t property_id, + int *err) { + intptr_t (*fptr)(const omp_interop_t, omp_interop_property_t, int *); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_int"))) + return (*fptr)(interop, property_id, err); + return 0; +} + +// libomptarget, if loaded, provides this function +void *FTN_STDCALL FTN_GET_INTEROP_PTR(const omp_interop_t interop, + omp_interop_property_t property_id, + int *err) { + void *(*fptr)(const omp_interop_t, omp_interop_property_t, int *); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_ptr"))) + return (*fptr)(interop, property_id, err); + return nullptr; +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_STR(const omp_interop_t interop, + omp_interop_property_t property_id, + int *err) { + const char *(*fptr)(const omp_interop_t, omp_interop_property_t, int *); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_str"))) + return (*fptr)(interop, property_id, err); + return nullptr; +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_NAME( + const omp_interop_t interop, omp_interop_property_t property_id) { + const char *(*fptr)(const omp_interop_t, omp_interop_property_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_name"))) + return (*fptr)(interop, property_id); + return nullptr; +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_TYPE_DESC( + const omp_interop_t interop, omp_interop_property_t property_id) { + const char *(*fptr)(const omp_interop_t, omp_interop_property_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_type_desc"))) + return (*fptr)(interop, property_id); + return nullptr; +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_RC_DESC( + const omp_interop_t interop, omp_interop_property_t property_id) { + const char *(*fptr)(const omp_interop_t, omp_interop_property_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_rec_desc"))) + return (*fptr)(interop, property_id); + return nullptr; +} + // display environment variables when requested void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) { #ifndef KMP_STUB diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h index 5b9e396e3d..66e1e1ecd2 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h +++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h @@ -140,6 +140,14 @@ #define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit #define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit +#define FTN_GET_NUM_INTEROP_PROPERTIES omp_get_num_interop_properties +#define FTN_GET_INTEROP_INT omp_get_interop_int +#define FTN_GET_INTEROP_PTR omp_get_interop_ptr +#define FTN_GET_INTEROP_STR omp_get_interop_str +#define FTN_GET_INTEROP_NAME omp_get_interop_name +#define FTN_GET_INTEROP_TYPE_DESC omp_get_interop_type_desc +#define FTN_GET_INTEROP_RC_DESC omp_get_interop_rc_desc + #endif /* KMP_FTN_PLAIN */ /* ------------------------------------------------------------------------ */ @@ -268,6 +276,14 @@ #define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit_ #define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit_ +#define FTN_GET_NUM_INTEROP_PROPERTIES 
omp_get_num_interop_properties_ +#define FTN_GET_INTEROP_INT omp_get_interop_int_ +#define FTN_GET_INTEROP_PTR omp_get_interop_ptr_ +#define FTN_GET_INTEROP_STR omp_get_interop_str_ +#define FTN_GET_INTEROP_NAME omp_get_interop_name_ +#define FTN_GET_INTEROP_TYPE_DESC omp_get_interop_type_desc_ +#define FTN_GET_INTEROP_RC_DESC omp_get_interop_rc_desc_ + #endif /* KMP_FTN_APPEND */ /* ------------------------------------------------------------------------ */ @@ -394,6 +410,14 @@ #define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT #define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT +#define FTN_GET_NUM_INTEROP_PROPERTIES OMP_GET_NUM_INTEROP_PROPERTIES +#define FTN_GET_INTEROP_INT OMP_GET_INTEROP_INT +#define FTN_GET_INTEROP_PTR OMP_GET_INTEROP_PTR +#define FTN_GET_INTEROP_STR OMP_GET_INTEROP_STR +#define FTN_GET_INTEROP_NAME OMP_GET_INTEROP_NAME +#define FTN_GET_INTEROP_TYPE_DESC OMP_GET_INTEROP_TYPE_DESC +#define FTN_GET_INTEROP_RC_DESC OMP_GET_INTEROP_RC_DESC + #endif /* KMP_FTN_UPPER */ /* ------------------------------------------------------------------------ */ @@ -522,6 +546,14 @@ #define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT_ #define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT_ +#define FTN_GET_NUM_INTEROP_PROPERTIES OMP_GET_NUM_INTEROP_PROPERTIES_ +#define FTN_GET_INTEROP_INT OMP_GET_INTEROP_INT_ +#define FTN_GET_INTEROP_PTR OMP_GET_INTEROP_PTR_ +#define FTN_GET_INTEROP_STR OMP_GET_INTEROP_STR_ +#define FTN_GET_INTEROP_NAME OMP_GET_INTEROP_NAME_ +#define FTN_GET_INTEROP_TYPE_DESC OMP_GET_INTEROP_TYPE_DESC_ +#define FTN_GET_INTEROP_RC_DESC OMP_GET_INTEROP_RC_DESC_ + #endif /* KMP_FTN_UAPPEND */ /* -------------------------- GOMP API NAMES ------------------------ */ @@ -712,5 +744,6 @@ #define KMP_API_NAME_GOMP_SECTIONS2_START GOMP_sections2_start #define KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER \ GOMP_workshare_task_reduction_unregister - +#define KMP_API_NAME_GOMP_ALLOC GOMP_alloc +#define KMP_API_NAME_GOMP_FREE GOMP_free #endif /* KMP_FTN_OS_H */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_global.cpp b/contrib/libs/cxxsupp/openmp/kmp_global.cpp index b519fcf678..62bdac3c4b 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_global.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_global.cpp @@ -110,8 +110,8 @@ char const *__kmp_barrier_type_name[bs_last_barrier] = {"plain", "forkjoin" "reduction" #endif // KMP_FAST_REDUCTION_BARRIER }; -char const *__kmp_barrier_pattern_name[bp_last_bar] = {"linear", "tree", - "hyper", "hierarchical"}; +char const *__kmp_barrier_pattern_name[bp_last_bar] = { + "linear", "tree", "hyper", "hierarchical", "dist"}; int __kmp_allThreadsSpecified = 0; size_t __kmp_align_alloc = CACHE_LINE; @@ -219,6 +219,13 @@ int __kmp_mwait_enabled = FALSE; int __kmp_mwait_hints = 0; #endif +#if KMP_HAVE_UMWAIT +int __kmp_waitpkg_enabled = 0; +int __kmp_tpause_state = 0; +int __kmp_tpause_hint = 1; +int __kmp_tpause_enabled = 0; +#endif + /* map OMP 3.0 schedule types with our internal schedule types */ enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2] = { @@ -280,6 +287,7 @@ char *__kmp_cpuinfo_file = NULL; #endif /* KMP_AFFINITY_SUPPORTED */ kmp_nested_proc_bind_t __kmp_nested_proc_bind = {NULL, 0, 0}; +kmp_proc_bind_t __kmp_teams_proc_bind = proc_bind_spread; int __kmp_affinity_num_places = 0; int __kmp_display_affinity = FALSE; char *__kmp_affinity_format = NULL; @@ -424,6 +432,7 @@ kmp_int32 __kmp_use_yield_exp_set = 0; kmp_uint32 __kmp_yield_init = 
KMP_INIT_WAIT; kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT; +kmp_uint64 __kmp_pause_init = 1; // for tpause /* ------------------------------------------------------ */ /* STATE mostly syncronized with global lock */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp b/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp index 61a3199f1a..d77d4809a7 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp @@ -23,18 +23,24 @@ enum { KMP_GOMP_TASK_DEPENDS_FLAG = 8 }; +enum { + KMP_GOMP_DEPOBJ_IN = 1, + KMP_GOMP_DEPOBJ_OUT = 2, + KMP_GOMP_DEPOBJ_INOUT = 3, + KMP_GOMP_DEPOBJ_MTXINOUTSET = 4 +}; + // This class helps convert gomp dependency info into // kmp_depend_info_t structures class kmp_gomp_depends_info_t { void **depend; kmp_int32 num_deps; - size_t num_out, num_mutexinout, num_in; + size_t num_out, num_mutexinout, num_in, num_depobj; size_t offset; public: kmp_gomp_depends_info_t(void **depend) : depend(depend) { size_t ndeps = (kmp_intptr_t)depend[0]; - size_t num_doable; // GOMP taskdep structure: // if depend[0] != 0: // depend = [ ndeps | nout | &out | ... | &out | &in | ... | &in ] @@ -45,21 +51,17 @@ public: if (ndeps) { num_out = (kmp_intptr_t)depend[1]; num_in = ndeps - num_out; - num_mutexinout = 0; - num_doable = ndeps; + num_mutexinout = num_depobj = 0; offset = 2; } else { ndeps = (kmp_intptr_t)depend[1]; num_out = (kmp_intptr_t)depend[2]; num_mutexinout = (kmp_intptr_t)depend[3]; num_in = (kmp_intptr_t)depend[4]; - num_doable = num_out + num_mutexinout + num_in; + num_depobj = ndeps - num_out - num_mutexinout - num_in; + KMP_ASSERT(num_depobj <= ndeps); offset = 5; } - // TODO: Support gomp depobj - if (ndeps != num_doable) { - KMP_FATAL(GompFeatureNotSupported, "depobj"); - } num_deps = static_cast<kmp_int32>(ndeps); } kmp_int32 get_num_deps() const { return num_deps; } @@ -67,7 +69,6 @@ public: kmp_depend_info_t retval; memset(&retval, '\0', sizeof(retval)); KMP_ASSERT(index < (size_t)num_deps); - retval.base_addr = (kmp_intptr_t)depend[offset + index]; retval.len = 0; // Because inout and out are logically equivalent, // use inout and in dependency flags. GOMP does not provide a @@ -75,10 +76,37 @@ public: if (index < num_out) { retval.flags.in = 1; retval.flags.out = 1; + retval.base_addr = (kmp_intptr_t)depend[offset + index]; } else if (index >= num_out && index < (num_out + num_mutexinout)) { retval.flags.mtx = 1; - } else { + retval.base_addr = (kmp_intptr_t)depend[offset + index]; + } else if (index >= (num_out + num_mutexinout) && + index < (num_out + num_mutexinout + num_in)) { retval.flags.in = 1; + retval.base_addr = (kmp_intptr_t)depend[offset + index]; + } else { + // depobj is a two element array (size of elements are size of pointer) + // depobj[0] = base_addr + // depobj[1] = type (in, out, inout, mutexinoutset, etc.) 
+ kmp_intptr_t *depobj = (kmp_intptr_t *)depend[offset + index]; + retval.base_addr = depobj[0]; + switch (depobj[1]) { + case KMP_GOMP_DEPOBJ_IN: + retval.flags.in = 1; + break; + case KMP_GOMP_DEPOBJ_OUT: + retval.flags.out = 1; + break; + case KMP_GOMP_DEPOBJ_INOUT: + retval.flags.in = 1; + retval.flags.out = 1; + break; + case KMP_GOMP_DEPOBJ_MTXINOUTSET: + retval.flags.mtx = 1; + break; + default: + KMP_FATAL(GompFeatureNotSupported, "Unknown depobj type"); + } } return retval; } @@ -1206,7 +1234,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data, // The low-order bit is the "untied" flag if (!(gomp_flags & KMP_GOMP_TASK_UNTIED_FLAG)) { - input_flags->tiedness = 1; + input_flags->tiedness = TASK_TIED; } // The second low-order bit is the "final" flag if (gomp_flags & KMP_GOMP_TASK_FINAL_FLAG) { @@ -1494,6 +1522,13 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *), KA_TRACE(20, ("GOMP_parallel_sections: T#%d\n", gtid)); #if OMPT_SUPPORT + ompt_frame_t *task_frame; + kmp_info_t *thr; + if (ompt_enabled.enabled) { + thr = __kmp_threads[gtid]; + task_frame = &(thr->th.th_current_task->ompt_task_info.frame); + task_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } OMPT_STORE_RETURN_ADDRESS(gtid); #endif @@ -1509,9 +1544,31 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *), KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); } + +#if OMPT_SUPPORT + ompt_frame_t *child_frame; + if (ompt_enabled.enabled) { + child_frame = &(thr->th.th_current_task->ompt_task_info.frame); + child_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } +#endif + task(data); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + child_frame->exit_frame = ompt_data_none; + } +#endif + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); KA_TRACE(20, ("GOMP_parallel_sections exit: T#%d\n", gtid)); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + task_frame->enter_frame = ompt_data_none; + } +#endif } #define PARALLEL_LOOP(func, schedule, ompt_pre, ompt_post) \ @@ -1738,7 +1795,7 @@ void __GOMP_taskloop(void (*func)(void *), void *data, KMP_ASSERT(arg_align > 0); // The low-order bit is the "untied" flag if (!(gomp_flags & 1)) { - input_flags->tiedness = 1; + input_flags->tiedness = TASK_TIED; } // The second low-order bit is the "final" flag if (gomp_flags & 2) { @@ -2428,6 +2485,26 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER)( } } +// allocator construct +void *KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ALLOC)(size_t alignment, size_t size, + uintptr_t allocator) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_alloc: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return __kmp_alloc(gtid, alignment, size, (omp_allocator_handle_t)allocator); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_FREE)(void *ptr, uintptr_t allocator) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_free: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return ___kmpc_free(gtid, ptr, (omp_allocator_handle_t)allocator); +} + /* The following sections of code create aliases for the GOMP_* functions, then create versioned symbols using the assembler directive .symver. This is only pertinent for ELF .so library. 
The KMP_VERSION_SYMBOL macro is defined in @@ -2616,6 +2693,10 @@ KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_START, 50, "GOMP_5.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS2_START, 50, "GOMP_5.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER, 50, "GOMP_5.0"); + +// GOMP_5.0.1 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ALLOC, 501, "GOMP_5.0.1"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_FREE, 501, "GOMP_5.0.1"); #endif // KMP_USE_VERSION_SYMBOLS #ifdef __cplusplus diff --git a/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc b/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc index 8e3e90caae..776cca2b66 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc +++ b/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc @@ -223,6 +223,7 @@ __kmp_i18n_default_messages[] = "%1$s value \"%2$u\" will be used.", "%1$s value \"%2$s\" will be used.", "%1$s value \"%2$s\" will be used.", + "Mixing other barrier patterns with dist is prohibited. Using dist for all barrier patterns.", "%1$s maximum value \"%2$d\" will be used.", "%1$s minimum value \"%2$d\" will be used.", "Memory allocation failed.", @@ -307,6 +308,8 @@ __kmp_i18n_default_messages[] = "Allocator %1$s is not available, will use default allocator.", "%1$s: %2$s (%3$d total cores)", "%1$s: granularity setting: %2$s does not exist in topology. Using granularity=%3$s instead.", + "%1$s: hybrid core type detected: %2$d %3$s cores.", + "%1$s: %2$d with core efficiency %3$d.", "%1$s must be bound to a work-sharing or work-queuing construct with an \"ordered\" clause", "Detected end of %1$s without first executing a corresponding beginning.", "Iteration range too large in %1$s.", @@ -402,6 +405,15 @@ __kmp_i18n_default_messages[] = "KMP_HW_SUBSET ignored: %1$s, %2$s: layers are equivalent, please only specify one.", "KMP_HW_SUBSET ignored: %1$s layer should come after %2$s.", "%1$s: topology layer \"%2$s\" is equivalent to \"%3$s\".", + "%1$s: granularity=%2$s is too coarse, setting granularity=group.", + "%1$s: \"%2$s\" value is deprecated. Please use \"%3$s\" instead.", + "num_teams value must be positive, it is %1$d, using %2$d instead.", + "KMP_HW_SUBSET ignored: %1$s, %2$s: attributes are ambiguous, please only specify one.", + "KMP_HW_SUBSET ignored: %1$s: attribute specified more than once.", + "KMP_HW_SUBSET ignored: %1$s: attribute value %2$s is invalid.", + "KMP_HW_SUBSET ignored: all hardware resources would be filtered, please reduce the filter.", + "KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecutre.", + "KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecutre.", NULL }; @@ -437,6 +449,7 @@ __kmp_i18n_default_hints[] = "System error #193 is \"Bad format of EXE or DLL file\". Usually it means the file is found, but it is corrupted or a file for another architecture. 
Check whether \"%1$s\" is a file for %2$s architecture.", "System-related limit on the number of threads.", "Try setting new bounds (preferably less than or equal to %1$d) for num_teams clause.", + "Valid values are from %1$d to %2$d.", NULL }; @@ -453,8 +466,8 @@ __kmp_i18n_sections[] = { 5, __kmp_i18n_default_meta }, { 79, __kmp_i18n_default_strings }, { 6, __kmp_i18n_default_formats }, - { 286, __kmp_i18n_default_messages }, - { 28, __kmp_i18n_default_hints }, + { 298, __kmp_i18n_default_messages }, + { 29, __kmp_i18n_default_hints }, { 0, NULL } }; diff --git a/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc b/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc index 7fec5e6223..a66f8117c2 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc +++ b/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc @@ -217,6 +217,7 @@ enum kmp_i18n_id { kmp_i18n_msg_Using_uint_Value, kmp_i18n_msg_Using_uint64_Value, kmp_i18n_msg_Using_str_Value, + kmp_i18n_msg_BarrierPatternOverride, kmp_i18n_msg_MaxValueUsing, kmp_i18n_msg_MinValueUsing, kmp_i18n_msg_MemoryAllocFailed, @@ -301,6 +302,8 @@ enum kmp_i18n_id { kmp_i18n_msg_OmpNoAllocator, kmp_i18n_msg_TopologyGeneric, kmp_i18n_msg_AffGranularityBad, + kmp_i18n_msg_TopologyHybrid, + kmp_i18n_msg_TopologyHybridCoreEff, kmp_i18n_msg_CnsBoundToWorksharing, kmp_i18n_msg_CnsDetectedEnd, kmp_i18n_msg_CnsIterationRangeTooLarge, @@ -396,6 +399,15 @@ enum kmp_i18n_id { kmp_i18n_msg_AffHWSubsetEqvLayers, kmp_i18n_msg_AffHWSubsetOutOfOrder, kmp_i18n_msg_AffEqualTopologyTypes, + kmp_i18n_msg_AffGranTooCoarseProcGroup, + kmp_i18n_msg_StgDeprecatedValue, + kmp_i18n_msg_NumTeamsNotPositive, + kmp_i18n_msg_AffHWSubsetIncompat, + kmp_i18n_msg_AffHWSubsetAttrRepeat, + kmp_i18n_msg_AffHWSubsetAttrInvalid, + kmp_i18n_msg_AffHWSubsetAllFiltered, + kmp_i18n_msg_AffHWSubsetAttrsNonHybrid, + kmp_i18n_msg_AffHWSubsetIgnoringAttr, kmp_i18n_msg_last, // Set #5, hints. 
@@ -428,6 +440,7 @@ enum kmp_i18n_id { kmp_i18n_hnt_BadExeFormat, kmp_i18n_hnt_SystemLimitOnThreads, kmp_i18n_hnt_SetNewBound, + kmp_i18n_hnt_ValidValuesRange, kmp_i18n_hnt_last, kmp_i18n_xxx_lastest diff --git a/contrib/libs/cxxsupp/openmp/kmp_itt.cpp b/contrib/libs/cxxsupp/openmp/kmp_itt.cpp index a76c639625..f99b264da6 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_itt.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_itt.cpp @@ -24,12 +24,9 @@ #error #include "ittnotify_config.h" __itt_global __kmp_ittapi_clean_global; extern __itt_global __kmp_itt__ittapi_global; -kmp_int32 __kmp_barrier_domain_count; -kmp_int32 __kmp_region_domain_count; -__itt_domain *__kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS]; -__itt_domain *__kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS]; -__itt_domain *__kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS]; -kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS]; + +kmp_itthash_t __kmp_itt_barrier_domains = {{0}, 0}; +kmp_itthash_t __kmp_itt_region_domains = {{0}, 0}; __itt_domain *metadata_domain = NULL; __itt_string_handle *string_handle_imbl = NULL; __itt_string_handle *string_handle_loop = NULL; diff --git a/contrib/libs/cxxsupp/openmp/kmp_itt.h b/contrib/libs/cxxsupp/openmp/kmp_itt.h index 75a24540d4..c640e83b71 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_itt.h +++ b/contrib/libs/cxxsupp/openmp/kmp_itt.h @@ -278,15 +278,21 @@ __kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller); } /* if */ \ } while (0) -const int KMP_MAX_FRAME_DOMAINS = - 512; // Maximum number of frame domains to use (maps to +// Maximum number of frame domains to use (maps to // different OpenMP regions in the user source code). -extern kmp_int32 __kmp_barrier_domain_count; -extern kmp_int32 __kmp_region_domain_count; -extern __itt_domain *__kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS]; -extern __itt_domain *__kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS]; -extern __itt_domain *__kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS]; -extern kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS]; +const int KMP_MAX_FRAME_DOMAINS = 997; +typedef struct kmp_itthash_entry { + ident_t *loc; + int team_size; + __itt_domain *d; + struct kmp_itthash_entry *next_in_bucket; +} kmp_itthash_entry_t; +typedef struct kmp_itthash { + kmp_itthash_entry_t *buckets[KMP_MAX_FRAME_DOMAINS]; + int count; // just a heuristic to limit number of entries +} kmp_itthash_t; +extern kmp_itthash_t __kmp_itt_region_domains; +extern kmp_itthash_t __kmp_itt_barrier_domains; extern __itt_domain *metadata_domain; extern __itt_string_handle *string_handle_imbl; extern __itt_string_handle *string_handle_loop; diff --git a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp index 59726f2b9f..fff7305b57 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp @@ -96,12 +96,19 @@ __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) { } kmp_uint32 spins; + kmp_uint64 time; KMP_FSYNC_PREPARE(lck); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); kmp_backoff_t backoff = __kmp_spin_backoff_params; do { +#if !KMP_HAVE_UMWAIT __kmp_spin_backoff(&backoff); - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); +#else + if (!__kmp_tpause_enabled) + __kmp_spin_backoff(&backoff); +#endif + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free || !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)); KMP_FSYNC_ACQUIRED(lck); @@ -1344,14 +1351,15 @@ static int 
__kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck, } int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { - kmp_info_t *this_thr; volatile kmp_int32 *head_id_p = &lck->lk.head_id; volatile kmp_int32 *tail_id_p = &lck->lk.tail_id; KA_TRACE(1000, ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid)); KMP_DEBUG_ASSERT(gtid >= 0); - this_thr = __kmp_thread_from_gtid(gtid); +#if KMP_DEBUG || DEBUG_QUEUING_LOCKS + kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid); +#endif KMP_DEBUG_ASSERT(this_thr != NULL); #ifdef DEBUG_QUEUING_LOCKS TRACE_LOCK(gtid + 1, "rel ent"); @@ -2226,10 +2234,12 @@ __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) { // The current implementation of KMP_WAIT doesn't allow for mask // and poll to be re-read every spin iteration. kmp_uint32 spins; + kmp_uint64 time; KMP_FSYNC_PREPARE(lck); KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); while (polls[ticket & mask] < ticket) { // atomic load - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); // Re-read the mask and the poll pointer from the lock structure. // // Make certain that "mask" is read before "polls" !!! @@ -2658,9 +2668,17 @@ void __kmp_spin_backoff(kmp_backoff_t *boff) { kmp_uint32 i; for (i = boff->step; i > 0; i--) { kmp_uint64 goal = __kmp_tsc() + boff->min_tick; - do { - KMP_CPU_PAUSE(); - } while (before(__kmp_tsc(), goal)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_tpause(0, boff->min_tick); + } else { +#endif + do { + KMP_CPU_PAUSE(); + } while (before(__kmp_tsc(), goal)); +#if KMP_HAVE_UMWAIT + } +#endif } boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1); } @@ -3103,7 +3121,7 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock, kmp_int32 gtid, kmp_indirect_locktag_t tag) { kmp_indirect_lock_t *lck; - kmp_lock_index_t idx; + kmp_lock_index_t idx, table_idx; __kmp_acquire_lock(&__kmp_global_lock, gtid); @@ -3116,26 +3134,41 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock, KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n", lck)); } else { - idx = __kmp_i_lock_table.next; - // Check capacity and double the size if it is full - if (idx == __kmp_i_lock_table.size) { - // Double up the space for block pointers - int row = __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; - kmp_indirect_lock_t **new_table = (kmp_indirect_lock_t **)__kmp_allocate( - 2 * row * sizeof(kmp_indirect_lock_t *)); - KMP_MEMCPY(new_table, __kmp_i_lock_table.table, - row * sizeof(kmp_indirect_lock_t *)); - kmp_indirect_lock_t **old_table = __kmp_i_lock_table.table; - __kmp_i_lock_table.table = new_table; - __kmp_free(old_table); - // Allocate new objects in the new blocks - for (int i = row; i < 2 * row; ++i) - *(__kmp_i_lock_table.table + i) = (kmp_indirect_lock_t *)__kmp_allocate( - KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t)); - __kmp_i_lock_table.size = 2 * idx; + kmp_uint32 row, col; + kmp_indirect_lock_table_t *lock_table = &__kmp_i_lock_table; + idx = 0; + // Find location in list of lock tables to put new lock + while (1) { + table_idx = lock_table->next; // index within this table + idx += lock_table->next; // global index within list of tables + if (table_idx < lock_table->nrow_ptrs * KMP_I_LOCK_CHUNK) { + row = table_idx / KMP_I_LOCK_CHUNK; + col = table_idx % KMP_I_LOCK_CHUNK; + // Allocate a new row of locks if necessary + if (!lock_table->table[row]) { + lock_table->table[row] = (kmp_indirect_lock_t *)__kmp_allocate( + 
sizeof(kmp_indirect_lock_t) * KMP_I_LOCK_CHUNK); + } + break; + } + // Allocate a new lock table if necessary with double the capacity + if (!lock_table->next_table) { + kmp_indirect_lock_table_t *next_table = + (kmp_indirect_lock_table_t *)__kmp_allocate( + sizeof(kmp_indirect_lock_table_t)); + next_table->table = (kmp_indirect_lock_t **)__kmp_allocate( + sizeof(kmp_indirect_lock_t *) * 2 * lock_table->nrow_ptrs); + next_table->nrow_ptrs = 2 * lock_table->nrow_ptrs; + next_table->next = 0; + next_table->next_table = nullptr; + lock_table->next_table = next_table; + } + lock_table = lock_table->next_table; + KMP_ASSERT(lock_table); } - __kmp_i_lock_table.next++; - lck = KMP_GET_I_LOCK(idx); + lock_table->next++; + + lck = &lock_table->table[row][col]; // Allocate a new base lock object lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]); KA_TRACE(20, @@ -3166,10 +3199,7 @@ __kmp_lookup_indirect_lock(void **user_lock, const char *func) { } if (OMP_LOCK_T_SIZE < sizeof(void *)) { kmp_lock_index_t idx = KMP_EXTRACT_I_INDEX(user_lock); - if (idx >= __kmp_i_lock_table.size) { - KMP_FATAL(LockIsUninitialized, func); - } - lck = KMP_GET_I_LOCK(idx); + lck = __kmp_get_i_lock(idx); } else { lck = *((kmp_indirect_lock_t **)user_lock); } @@ -3179,7 +3209,7 @@ __kmp_lookup_indirect_lock(void **user_lock, const char *func) { return lck; } else { if (OMP_LOCK_T_SIZE < sizeof(void *)) { - return KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(user_lock)); + return __kmp_get_i_lock(KMP_EXTRACT_I_INDEX(user_lock)); } else { return *((kmp_indirect_lock_t **)user_lock); } @@ -3189,13 +3219,13 @@ __kmp_lookup_indirect_lock(void **user_lock, const char *func) { static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock, kmp_dyna_lockseq_t seq) { #if KMP_USE_ADAPTIVE_LOCKS - if (seq == lockseq_adaptive && !__kmp_cpuinfo.rtm) { + if (seq == lockseq_adaptive && !__kmp_cpuinfo.flags.rtm) { KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive"); seq = lockseq_queuing; } #endif #if KMP_USE_TSX - if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.rtm) { + if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.flags.rtm) { seq = lockseq_queuing; } #endif @@ -3322,12 +3352,13 @@ void __kmp_init_dynamic_user_locks() { return; // Initialize lock index table - __kmp_i_lock_table.size = KMP_I_LOCK_CHUNK; - __kmp_i_lock_table.table = - (kmp_indirect_lock_t **)__kmp_allocate(sizeof(kmp_indirect_lock_t *)); + __kmp_i_lock_table.nrow_ptrs = KMP_I_LOCK_TABLE_INIT_NROW_PTRS; + __kmp_i_lock_table.table = (kmp_indirect_lock_t **)__kmp_allocate( + sizeof(kmp_indirect_lock_t *) * KMP_I_LOCK_TABLE_INIT_NROW_PTRS); *(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate( KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t)); __kmp_i_lock_table.next = 0; + __kmp_i_lock_table.next_table = nullptr; // Indirect lock size __kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t); @@ -3392,7 +3423,6 @@ void __kmp_init_dynamic_user_locks() { // Clean up the lock table. void __kmp_cleanup_indirect_user_locks() { - kmp_lock_index_t i; int k; // Clean up locks in the pools first (they were already destroyed before going @@ -3410,22 +3440,29 @@ void __kmp_cleanup_indirect_user_locks() { __kmp_indirect_lock_pool[k] = NULL; } // Clean up the remaining undestroyed locks. - for (i = 0; i < __kmp_i_lock_table.next; i++) { - kmp_indirect_lock_t *l = KMP_GET_I_LOCK(i); - if (l->lock != NULL) { - // Locks not destroyed explicitly need to be destroyed here. 
- KMP_I_LOCK_FUNC(l, destroy)(l->lock); - KA_TRACE( - 20, - ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p from table\n", - l)); - __kmp_free(l->lock); + kmp_indirect_lock_table_t *ptr = &__kmp_i_lock_table; + while (ptr) { + for (kmp_uint32 row = 0; row < ptr->nrow_ptrs; ++row) { + if (!ptr->table[row]) + continue; + for (kmp_uint32 col = 0; col < KMP_I_LOCK_CHUNK; ++col) { + kmp_indirect_lock_t *l = &ptr->table[row][col]; + if (l->lock) { + // Locks not destroyed explicitly need to be destroyed here. + KMP_I_LOCK_FUNC(l, destroy)(l->lock); + KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p " + "from table\n", + l)); + __kmp_free(l->lock); + } + } + __kmp_free(ptr->table[row]); } + kmp_indirect_lock_table_t *next_table = ptr->next_table; + if (ptr != &__kmp_i_lock_table) + __kmp_free(ptr); + ptr = next_table; } - // Free the table - for (i = 0; i < __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; i++) - __kmp_free(__kmp_i_lock_table.table[i]); - __kmp_free(__kmp_i_lock_table.table); __kmp_init_user_locks = FALSE; } diff --git a/contrib/libs/cxxsupp/openmp/kmp_lock.h b/contrib/libs/cxxsupp/openmp/kmp_lock.h index 4f6ad6414e..a19f4ca323 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_lock.h +++ b/contrib/libs/cxxsupp/openmp/kmp_lock.h @@ -651,12 +651,15 @@ extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck, if (lck->tas.lk.poll != 0 || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \ kmp_uint32 spins; \ + kmp_uint64 time; \ KMP_FSYNC_PREPARE(lck); \ KMP_INIT_YIELD(spins); \ + KMP_INIT_BACKOFF(time); \ do { \ - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \ - } while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq( \ - &lck->tas.lk.poll, 0, gtid + 1)); \ + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \ + } while ( \ + lck->tas.lk.poll != 0 || \ + !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \ } \ KMP_FSYNC_ACQUIRED(lck); \ } else { \ @@ -758,10 +761,12 @@ extern int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck, if ((lck->tas.lk.poll != 0) || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \ kmp_uint32 spins; \ + kmp_uint64 time; \ KMP_FSYNC_PREPARE(lck); \ KMP_INIT_YIELD(spins); \ + KMP_INIT_BACKOFF(time); \ do { \ - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \ + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \ } while ( \ (lck->tas.lk.poll != 0) || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \ @@ -1217,22 +1222,41 @@ extern kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])( ? __kmp_indirect_get_flags[(lck)->type]((lck)->lock) \ : NULL) -#define KMP_I_LOCK_CHUNK \ - 1024 // number of kmp_indirect_lock_t objects to be allocated together +// number of kmp_indirect_lock_t objects to be allocated together +#define KMP_I_LOCK_CHUNK 1024 +// Keep at a power of 2 since it is used in multiplication & division +KMP_BUILD_ASSERT(KMP_I_LOCK_CHUNK % 2 == 0); +// number of row entries in the initial lock table +#define KMP_I_LOCK_TABLE_INIT_NROW_PTRS 8 // Lock table for indirect locks. 
typedef struct kmp_indirect_lock_table { kmp_indirect_lock_t **table; // blocks of indirect locks allocated - kmp_lock_index_t size; // size of the indirect lock table + kmp_uint32 nrow_ptrs; // number *table pointer entries in table kmp_lock_index_t next; // index to the next lock to be allocated + struct kmp_indirect_lock_table *next_table; } kmp_indirect_lock_table_t; extern kmp_indirect_lock_table_t __kmp_i_lock_table; // Returns the indirect lock associated with the given index. -#define KMP_GET_I_LOCK(index) \ - (*(__kmp_i_lock_table.table + (index) / KMP_I_LOCK_CHUNK) + \ - (index) % KMP_I_LOCK_CHUNK) +// Returns nullptr if no lock at given index +static inline kmp_indirect_lock_t *__kmp_get_i_lock(kmp_lock_index_t idx) { + kmp_indirect_lock_table_t *lock_table = &__kmp_i_lock_table; + while (lock_table) { + kmp_lock_index_t max_locks = lock_table->nrow_ptrs * KMP_I_LOCK_CHUNK; + if (idx < max_locks) { + kmp_lock_index_t row = idx / KMP_I_LOCK_CHUNK; + kmp_lock_index_t col = idx % KMP_I_LOCK_CHUNK; + if (!lock_table->table[row] || idx >= lock_table->next) + break; + return &lock_table->table[row][col]; + } + idx -= max_locks; + lock_table = lock_table->next_table; + } + return nullptr; +} // Number of locks in a lock block, which is fixed to "1" now. // TODO: No lock block implementation now. If we do support, we need to manage @@ -1241,8 +1265,9 @@ extern int __kmp_num_locks_in_block; // Fast lock table lookup without consistency checking #define KMP_LOOKUP_I_LOCK(l) \ - ((OMP_LOCK_T_SIZE < sizeof(void *)) ? KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(l)) \ - : *((kmp_indirect_lock_t **)(l))) + ((OMP_LOCK_T_SIZE < sizeof(void *)) \ + ? __kmp_get_i_lock(KMP_EXTRACT_I_INDEX(l)) \ + : *((kmp_indirect_lock_t **)(l))) // Used once in kmp_error.cpp extern kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p, kmp_uint32); diff --git a/contrib/libs/cxxsupp/openmp/kmp_os.h b/contrib/libs/cxxsupp/openmp/kmp_os.h index 4437cf2518..d71e9aecb3 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_os.h +++ b/contrib/libs/cxxsupp/openmp/kmp_os.h @@ -1025,6 +1025,30 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v); #define KMP_MB() /* nothing to do */ #endif +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if KMP_COMPILER_ICC +#define KMP_MFENCE_() _mm_mfence() +#define KMP_SFENCE_() _mm_sfence() +#elif KMP_COMPILER_MSVC +#define KMP_MFENCE_() MemoryBarrier() +#define KMP_SFENCE_() MemoryBarrier() +#else +#define KMP_MFENCE_() __sync_synchronize() +#define KMP_SFENCE_() __sync_synchronize() +#endif +#define KMP_MFENCE() \ + if (UNLIKELY(!__kmp_cpuinfo.initialized)) { \ + __kmp_query_cpuid(&__kmp_cpuinfo); \ + } \ + if (__kmp_cpuinfo.flags.sse2) { \ + KMP_MFENCE_(); \ + } +#define KMP_SFENCE() KMP_SFENCE_() +#else +#define KMP_MFENCE() KMP_MB() +#define KMP_SFENCE() KMP_MB() +#endif + #ifndef KMP_IMB #define KMP_IMB() /* nothing to do */ #endif diff --git a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp index fe931bb157..34f8a01743 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp @@ -107,6 +107,10 @@ static int __kmp_unregister_root_other_thread(int gtid); static void __kmp_reap_thread(kmp_info_t *thread, int is_root); kmp_info_t *__kmp_thread_pool_insert_pt = NULL; +void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, + int new_nthreads); +void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); + /* Calculate the identifier of the current thread */ /* fast (and somewhat 
portable) way to get unique identifier of executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ @@ -910,7 +914,8 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, assured that there are enough threads available, because we checked on that earlier within critical section forkjoin */ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, - kmp_info_t *master_th, int master_gtid) { + kmp_info_t *master_th, int master_gtid, + int fork_teams_workers) { int i; int use_hot_team; @@ -999,7 +1004,12 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, } #if KMP_AFFINITY_SUPPORTED - __kmp_partition_places(team); + // Do not partition the places list for teams construct workers who + // haven't actually been forked to do real work yet. This partitioning + // will take place in the parallel region nested within the teams construct. + if (!fork_teams_workers) { + __kmp_partition_places(team); + } #endif } @@ -1204,7 +1214,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { this_thr->th.th_team = serial_team; serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; - KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, + KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, this_thr->th.th_current_task)); KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); this_thr->th.th_current_task->td_flags.executing = 0; @@ -1563,15 +1573,24 @@ int __kmp_fork_call(ident_t *loc, int gtid, /* Change number of threads in the team if requested */ if (master_set_numthreads) { // The parallel has num_threads clause - if (master_set_numthreads < master_th->th.th_teams_size.nth) { + if (master_set_numthreads <= master_th->th.th_teams_size.nth) { // AC: only can reduce number of threads dynamically, can't increase kmp_info_t **other_threads = parent_team->t.t_threads; + // NOTE: if using distributed barrier, we need to run this code block + // even when the team size appears not to have changed from the max. + int old_proc = master_th->th.th_teams_size.nth; + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == + bp_dist_bar) { + __kmp_resize_dist_barrier(parent_team, old_proc, + master_set_numthreads); + __kmp_add_threads_to_team(parent_team, master_set_numthreads); + } parent_team->t.t_nproc = master_set_numthreads; for (i = 0; i < master_set_numthreads; ++i) { other_threads[i]->th.th_team_nproc = master_set_numthreads; } - // Keep extra threads hot in the team for possible next parallels } + // Keep extra threads hot in the team for possible next parallels master_th->th.th_set_nproc = 0; } @@ -1584,6 +1603,41 @@ int __kmp_fork_call(ident_t *loc, int gtid, } #endif + // Figure out the proc_bind policy for the nested parallel within teams + kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; + // proc_bind_default means don't update + kmp_proc_bind_t proc_bind_icv = proc_bind_default; + if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { + proc_bind = proc_bind_false; + } else { + // No proc_bind clause specified; use current proc-bind-var + if (proc_bind == proc_bind_default) { + proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; + } + /* else: The proc_bind policy was specified explicitly on parallel + clause. + This overrides proc-bind-var for this parallel region, but does not + change proc-bind-var. */ + // Figure the value of proc-bind-var for the child threads. 
+ if ((level + 1 < __kmp_nested_proc_bind.used) && + (__kmp_nested_proc_bind.bind_types[level + 1] != + master_th->th.th_current_task->td_icvs.proc_bind)) { + proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; + } + } + KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); + // Need to change the bind-var ICV to correct value for each implicit task + if (proc_bind_icv != proc_bind_default && + master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { + kmp_info_t **other_threads = parent_team->t.t_threads; + for (i = 0; i < master_th->th.th_team_nproc; ++i) { + other_threads[i]->th.th_current_task->td_icvs.proc_bind = + proc_bind_icv; + } + } + // Reset for next parallel region + master_th->th.th_set_proc_bind = proc_bind_default; + #if USE_ITT_BUILD && USE_ITT_NOTIFY if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || KMP_ITT_DEBUG) && @@ -1600,6 +1654,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); } #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ +#if KMP_AFFINITY_SUPPORTED + __kmp_partition_places(parent_team); +#endif KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " "master_th=%p, gtid=%d\n", @@ -1635,6 +1692,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, } #endif + // Need this to happen before we determine the number of threads, not while + // we are allocating the team + //__kmp_push_current_task_to_thread(master_th, parent_team, 0); int enter_teams = 0; if (parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels) { @@ -1642,13 +1702,10 @@ int __kmp_fork_call(ident_t *loc, int gtid, } else { enter_teams = ((ap == NULL && active_level == 0) || (ap && teams_level > 0 && teams_level == level)); - nthreads = - master_set_numthreads - ? master_set_numthreads - : get__nproc_2( - parent_team, - master_tid); // TODO: get nproc directly from current task - + nthreads = master_set_numthreads + ? master_set_numthreads + // TODO: get nproc directly from current task + : get__nproc_2(parent_team, master_tid); // Check if we need to take forkjoin lock? (no need for serialized // parallel out of teams construct). This code moved here from // __kmp_reserve_threads() to speedup nested serialized parallels. @@ -1940,16 +1997,21 @@ int __kmp_fork_call(ident_t *loc, int gtid, // Figure out the proc_bind_policy for the new team. kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; - kmp_proc_bind_t proc_bind_icv = - proc_bind_default; // proc_bind_default means don't update + // proc_bind_default means don't update + kmp_proc_bind_t proc_bind_icv = proc_bind_default; if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { proc_bind = proc_bind_false; } else { + // No proc_bind clause specified; use current proc-bind-var for this + // parallel region if (proc_bind == proc_bind_default) { - // No proc_bind clause specified; use current proc-bind-var for this - // parallel region proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; } + // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND + if (master_th->th.th_teams_microtask && + microtask == (microtask_t)__kmp_teams_master) { + proc_bind = __kmp_teams_proc_bind; + } /* else: The proc_bind policy was specified explicitly on parallel clause. This overrides proc-bind-var for this parallel region, but does not change proc-bind-var. 
*/ @@ -1957,7 +2019,11 @@ int __kmp_fork_call(ident_t *loc, int gtid, if ((level + 1 < __kmp_nested_proc_bind.used) && (__kmp_nested_proc_bind.bind_types[level + 1] != master_th->th.th_current_task->td_icvs.proc_bind)) { - proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; + // Do not modify the proc bind icv for the two teams construct forks + // They just let the proc bind icv pass through + if (!master_th->th.th_teams_microtask || + !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) + proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; } } @@ -1983,6 +2049,8 @@ int __kmp_fork_call(ident_t *loc, int gtid, #endif proc_bind, &new_icvs, argc USE_NESTED_HOT_ARG(master_th)); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) + copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); } else { /* allocate a new parallel team */ KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); @@ -1993,6 +2061,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, proc_bind, &master_th->th.th_current_task->td_icvs, argc USE_NESTED_HOT_ARG(master_th)); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) + copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, + &master_th->th.th_current_task->td_icvs); } KF_TRACE( 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); @@ -2124,7 +2195,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong root->r.r_active = TRUE; - __kmp_fork_team_threads(root, team, master_th, gtid); + __kmp_fork_team_threads(root, team, master_th, gtid, !ap); __kmp_setup_icv_copy(team, nthreads, &master_th->th.th_current_task->td_icvs, loc); @@ -2359,6 +2430,12 @@ void __kmp_join_call(ident_t *loc, int gtid parent_team->t.t_stack_id = NULL; } #endif + + if (team->t.t_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + team->t.b->update_num_threads(team->t.t_nproc); + __kmp_add_threads_to_team(team, team->t.t_nproc); + } } KMP_MB(); @@ -2387,6 +2464,14 @@ void __kmp_join_call(ident_t *loc, int gtid } // active_level == 1 #endif /* USE_ITT_BUILD */ +#if KMP_AFFINITY_SUPPORTED + if (!exit_teams) { + // Restore master thread's partition. + master_th->th.th_first_place = team->t.t_first_place; + master_th->th.th_last_place = team->t.t_last_place; + } +#endif // KMP_AFFINITY_SUPPORTED + if (master_th->th.th_teams_microtask && !exit_teams && team->t.t_pkfn != (microtask_t)__kmp_teams_master && team->t.t_level == master_th->th.th_teams_level + 1) { @@ -2494,11 +2579,6 @@ void __kmp_join_call(ident_t *loc, int gtid master_th, team)); __kmp_pop_current_task_from_thread(master_th); -#if KMP_AFFINITY_SUPPORTED - // Restore master thread's partition. - master_th->th.th_first_place = team->t.t_first_place; - master_th->th.th_last_place = team->t.t_last_place; -#endif // KMP_AFFINITY_SUPPORTED master_th->th.th_def_allocator = team->t.t_def_allocator; #if OMPD_SUPPORT @@ -2646,6 +2726,9 @@ void __kmp_set_num_threads(int new_nth, int gtid) { __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); + } // Release the extra threads we don't need any more. 
for (f = new_nth; f < hot_team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); @@ -2665,6 +2748,11 @@ void __kmp_set_num_threads(int new_nth, int gtid) { } #endif + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + hot_team->t.b->update_num_threads(new_nth); + __kmp_add_threads_to_team(hot_team, new_nth); + } + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); // Update the t_nproc field in the threads that are still active. @@ -4018,7 +4106,8 @@ void __kmp_unregister_root_current_thread(int gtid) { kmp_task_team_t *task_team = thread->th.th_task_team; // we need to wait for the proxy tasks before finishing the thread - if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { + if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered)) { #if OMPT_SUPPORT // the runtime is shutting down so we won't report any events thread->th.ompt_thread_info.state = ompt_state_undefined; @@ -4112,7 +4201,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, this_thr->th.th_team_nproc = team->t.t_nproc; this_thr->th.th_team_master = master; this_thr->th.th_team_serialized = team->t.t_serialized; - TCW_PTR(this_thr->th.th_sleep_loc, NULL); KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); @@ -4281,6 +4369,12 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, new_thr->th.th_task_state_top = 0; new_thr->th.th_task_state_stack_sz = 4; + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Make sure pool thread has transitioned to waiting on own thread struct + KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); + // Thread activated in __kmp_allocate_team when increasing team size + } + #ifdef KMP_ADJUST_BLOCKTIME /* Adjust blocktime back to zero if necessary */ /* Middle initialization might not have occurred yet */ @@ -4448,6 +4542,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, balign[b].bb.use_oncore_barrier = 0; } + TCW_PTR(new_thr->th.th_sleep_loc, NULL); + new_thr->th.th_sleep_loc_type = flag_unset; + new_thr->th.th_spin_here = FALSE; new_thr->th.th_next_waiting = 0; #if KMP_OS_UNIX @@ -4976,6 +5073,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, kmp_team_t *team; int use_hot_team = !root->r.r_active; int level = 0; + int do_place_partition = 1; KA_TRACE(20, ("__kmp_allocate_team: called\n")); KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); @@ -4997,6 +5095,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, ++level; // not increment if #teams==1, or for outer fork of the teams; // increment otherwise } + // Do not perform the place partition if inner fork of the teams + // Wait until nested parallel region encountered inside teams construct + if ((master->th.th_teams_size.nteams == 1 && + master->th.th_teams_level >= team->t.t_level) || + (team->t.t_pkfn == (microtask_t)__kmp_teams_master)) + do_place_partition = 0; } hot_teams = master->th.th_hot_teams; if (level < __kmp_hot_teams_max_level && hot_teams && @@ -5027,6 +5131,17 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } #endif + if (team->t.t_nproc != new_nproc && + __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Distributed barrier may need a resize + int old_nthr = team->t.t_nproc; + __kmp_resize_dist_barrier(team, old_nthr, new_nproc); + } + + // If not doing the place partition, then reset the team's proc bind + // to indicate that 
partitioning of all threads still needs to take place + if (do_place_partition == 0) + team->t.t_proc_bind = proc_bind_default; // Has the number of threads changed? /* Let's assume the most common case is that the number of threads is unchanged, and put that case first. */ @@ -5056,16 +5171,20 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, if ((team->t.t_size_changed == 0) && (team->t.t_proc_bind == new_proc_bind)) { if (new_proc_bind == proc_bind_spread) { - __kmp_partition_places( - team, 1); // add flag to update only master for spread + if (do_place_partition) { + // add flag to update only master for spread + __kmp_partition_places(team, 1); + } } KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " "proc_bind = %d, partition = [%d,%d]\n", team->t.t_id, new_proc_bind, team->t.t_first_place, team->t.t_last_place)); } else { - KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); - __kmp_partition_places(team); + if (do_place_partition) { + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); + __kmp_partition_places(team); + } } #else KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); @@ -5076,6 +5195,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, new_nproc)); team->t.t_size_changed = 1; + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Barrier size already reduced earlier in this function + // Activate team threads via th_used_in_team + __kmp_add_threads_to_team(team, new_nproc); + } #if KMP_NESTED_HOT_TEAMS if (__kmp_hot_teams_mode == 0) { // AC: saved number of threads should correspond to team's value in this @@ -5137,10 +5261,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } #endif - KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); + if (do_place_partition) { + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #if KMP_AFFINITY_SUPPORTED - __kmp_partition_places(team); + __kmp_partition_places(team); #endif + } } else { // team->t.t_nproc < new_nproc #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *old_mask; @@ -5152,7 +5278,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, KA_TRACE(20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc)); - + int old_nproc = team->t.t_nproc; // save old value and use to update only team->t.t_size_changed = 1; #if KMP_NESTED_HOT_TEAMS @@ -5179,10 +5305,9 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); team->t.t_nproc = new_nproc; // just get reserved threads involved } else { - // we may have some threads in reserve, but not enough - team->t.t_nproc = - hot_teams[level] - .hot_team_nth; // get reserved threads involved if any + // We may have some threads in reserve, but not enough; + // get reserved threads involved if any. + team->t.t_nproc = hot_teams[level].hot_team_nth; hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size #endif // KMP_NESTED_HOT_TEAMS if (team->t.t_max_nproc < new_nproc) { @@ -5237,8 +5362,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #if KMP_NESTED_HOT_TEAMS } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth #endif // KMP_NESTED_HOT_TEAMS + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Barrier size already increased earlier in this function + // Activate team threads via th_used_in_team + __kmp_add_threads_to_team(team, new_nproc); + } /* make sure everyone is syncronized */ - int old_nproc = team->t.t_nproc; // save old value and use to update only // new threads below __kmp_initialize_team(team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident); @@ -5273,10 +5402,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } #endif - KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); + if (do_place_partition) { + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #if KMP_AFFINITY_SUPPORTED - __kmp_partition_places(team); + __kmp_partition_places(team); #endif + } } // Check changes in number of threads kmp_info_t *master = team->t.t_threads[0]; @@ -5342,6 +5473,13 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, /* take this team from the team pool */ __kmp_team_pool = team->t.t_next_pool; + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (!team->t.b) { // Allocate barrier structure + team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); + } + } + /* setup the team for fresh use */ __kmp_initialize_team(team, new_nproc, new_icvs, NULL); @@ -5397,6 +5535,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, /* and set it up */ team->t.t_max_nproc = max_nproc; + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Allocate barrier structure + team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); + } + /* NOTE well, for some reason allocating one big buffer and dividing it up seems to really hurt performance a lot on the P4, so, let's not use this */ __kmp_allocate_team_arrays(team, max_nproc); @@ -5469,7 +5613,6 @@ void __kmp_free_team(kmp_root_t *root, int use_hot_team = team == root->r.r_hot_team; #if KMP_NESTED_HOT_TEAMS int level; - kmp_hot_team_ptr_t *hot_teams; if (master) { level = team->t.t_active_level - 1; if (master->th.th_teams_microtask) { // in teams construct? 
@@ -5483,7 +5626,9 @@ void __kmp_free_team(kmp_root_t *root, // team_of_workers before the parallel } // team->t.t_level will be increased inside parallel } - hot_teams = master->th.th_hot_teams; +#if KMP_DEBUG + kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; +#endif if (level < __kmp_hot_teams_max_level) { KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); use_hot_team = 1; @@ -5553,10 +5698,43 @@ void __kmp_free_team(kmp_root_t *root, /* free the worker threads */ for (f = 1; f < team->t.t_nproc; ++f) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), + 1, 2); + } __kmp_free_thread(team->t.t_threads[f]); + } + + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (team->t.b) { + // wake up thread at old location + team->t.b->go_release(); + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + for (f = 1; f < team->t.t_nproc; ++f) { + if (team->t.b->sleep[f].sleep) { + __kmp_atomic_resume_64( + team->t.t_threads[f]->th.th_info.ds.ds_gtid, + (kmp_atomic_flag_64<> *)NULL); + } + } + } + // Wait for threads to be removed from team + for (int f = 1; f < team->t.t_nproc; ++f) { + while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) + KMP_CPU_PAUSE(); + } + } + } + + for (f = 1; f < team->t.t_nproc; ++f) { team->t.t_threads[f] = NULL; } + if (team->t.t_max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + distributedBarrier::deallocate(team->t.b); + team->t.b = NULL; + } /* put the team back in the team pool */ /* TODO limit size of team pool, call reap_team if pool too large */ team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); @@ -5955,11 +6133,18 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid)); - /* Need release fence here to prevent seg faults for tree forkjoin barrier - * (GEH) */ - kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, - thread); - __kmp_release_64(&flag); + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + while ( + !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) + KMP_CPU_PAUSE(); + __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL); + } else { + /* Need release fence here to prevent seg faults for tree forkjoin + barrier (GEH) */ + kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, + thread); + __kmp_release_64(&flag); + } } // Terminate OS thread. 
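The __kmp_free_team and __kmp_reap_thread hunks above, together with __kmp_resize_dist_barrier and __kmp_add_threads_to_team later in this diff, coordinate team membership for the distributed barrier through the th_used_in_team counter. The patch works with the raw integers 0..3; the enum and helper below are only an illustrative sketch of the transitions visible in those hunks (the symbolic names are invented here, and the real code additionally wakes sleeping workers when blocktime is finite):

#include <atomic>

// Sketch: symbolic names for the th_used_in_team values implied by the
// compare-and-swap transitions in this diff.
enum kmp_used_in_team_state {
  KMP_UIT_OUT = 0,     // thread is not part of the team
  KMP_UIT_IN = 1,      // thread is an active member of the team
  KMP_UIT_LEAVING = 2, // __kmp_free_team / __kmp_resize_dist_barrier asked it to leave
  KMP_UIT_JOINING = 3  // __kmp_add_threads_to_team (or __kmp_reap_thread) asked it to transition
};

// Sketch of the master-side handshake: publish the requested transition, then
// spin until the worker stores the final state, mirroring the wait loops above.
static void kmp_uit_request(std::atomic<int> &used_in_team, int from, int to,
                            int final_state) {
  int expected = from;
  used_in_team.compare_exchange_strong(expected, to); // e.g. 1 -> 2, or 0 -> 3
  while (used_in_team.load() != final_state) {
    // the runtime pauses here (KMP_CPU_PAUSE) and resumes sleepers if needed
  }
}

All of these paths are guarded by checks against bp_dist_bar, so they only run when the distributed barrier pattern is selected (see the barrier-pattern handling added to kmp_settings.cpp further down in this diff).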
@@ -6054,6 +6239,31 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { } // __kmp_reap_thread +static void __kmp_itthash_clean(kmp_info_t *th) { +#if USE_ITT_NOTIFY + if (__kmp_itt_region_domains.count > 0) { + for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { + kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i]; + while (bucket) { + kmp_itthash_entry_t *next = bucket->next_in_bucket; + __kmp_thread_free(th, bucket); + bucket = next; + } + } + } + if (__kmp_itt_barrier_domains.count > 0) { + for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { + kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i]; + while (bucket) { + kmp_itthash_entry_t *next = bucket->next_in_bucket; + __kmp_thread_free(th, bucket); + bucket = next; + } + } + } +#endif +} + static void __kmp_internal_end(void) { int i; @@ -6240,6 +6450,7 @@ void __kmp_internal_end_library(int gtid_req) { gtid)); return; } else { + __kmp_itthash_clean(__kmp_threads[gtid]); KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); @@ -6486,7 +6697,7 @@ void __kmp_register_library_startup(void) { char *value = NULL; // Actual value of the environment variable. -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) char *shm_name = __kmp_str_format("/%s", name); int shm_preexist = 0; char *data1; @@ -6591,7 +6802,7 @@ void __kmp_register_library_startup(void) { } break; case 2: { // Neighbor is dead. -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) // close shared memory. shm_unlink(shm_name); // this removes file in /dev/shm #else @@ -6605,7 +6816,7 @@ void __kmp_register_library_startup(void) { } } KMP_INTERNAL_FREE((void *)value); -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) KMP_INTERNAL_FREE((void *)shm_name); #endif } // while @@ -6618,7 +6829,7 @@ void __kmp_unregister_library(void) { char *name = __kmp_reg_status_name(); char *value = NULL; -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) char *shm_name = __kmp_str_format("/%s", name); int fd1 = shm_open(shm_name, O_RDONLY, 0666); if (fd1 == -1) { @@ -6639,14 +6850,14 @@ void __kmp_unregister_library(void) { KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { // Ok, this is our variable. Delete it. 
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) shm_unlink(shm_name); // this removes file in /dev/shm #else __kmp_env_unset(name); #endif } -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library +#if defined(KMP_USE_SHM) KMP_INTERNAL_FREE(shm_name); #endif @@ -6684,7 +6895,9 @@ static void __kmp_check_mic_type() { static void __kmp_user_level_mwait_init() { struct kmp_cpuid buf; __kmp_x86_cpuid(7, 0, &buf); - __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; + __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1); + __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait; + __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0); KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", __kmp_umwait_enabled)); } @@ -6844,8 +7057,8 @@ static void __kmp_do_serial_initialize(void) { #if KMP_FAST_REDUCTION_BARRIER #define kmp_reduction_barrier_gather_bb ((int)1) #define kmp_reduction_barrier_release_bb ((int)1) -#define kmp_reduction_barrier_gather_pat bp_hyper_bar -#define kmp_reduction_barrier_release_pat bp_hyper_bar +#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt +#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt #endif // KMP_FAST_REDUCTION_BARRIER for (i = bs_plain_barrier; i < bs_last_barrier; i++) { __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; @@ -7500,6 +7713,11 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, num_threads = 1; } } else { + if (num_threads < 0) { + __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1), + __kmp_msg_null); + num_threads = 1; + } // This thread will be the primary thread of the league primary threads // Store new thread limit; old limit is saved in th_cg_roots list thr->th.th_current_task->td_icvs.thread_limit = num_threads; @@ -7531,9 +7749,13 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, int num_threads) { kmp_info_t *thr = __kmp_threads[gtid]; - KMP_DEBUG_ASSERT(num_teams >= 0); - KMP_DEBUG_ASSERT(num_threads >= 0); - + if (num_teams < 0) { + // OpenMP specification requires requested values to be positive, + // but people can send us any value, so we'd better check + __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1), + __kmp_msg_null); + num_teams = 1; + } if (num_teams == 0) { if (__kmp_nteams > 0) { num_teams = __kmp_nteams; @@ -7590,7 +7812,7 @@ void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams num_teams = num_teams_ub; } else { // num_teams_lb <= num_teams <= num_teams_ub - if (num_threads == 0) { + if (num_threads <= 0) { if (num_teams_ub > __kmp_teams_max_nth) { num_teams = num_teams_lb; } else { @@ -8702,6 +8924,96 @@ void __kmp_omp_display_env(int verbose) { __kmp_release_bootstrap_lock(&__kmp_initz_lock); } +// The team size is changing, so distributed barrier must be modified +void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, + int new_nthreads) { + KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == + bp_dist_bar); + kmp_info_t **other_threads = team->t.t_threads; + + // We want all the workers to stop waiting on the barrier while we adjust the + // size of the team. 
+ for (int f = 1; f < old_nthreads; ++f) { + KMP_DEBUG_ASSERT(other_threads[f] != NULL); + // Ignore threads that are already inactive or not present in the team + if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { + // teams construct causes thread_limit to get passed in, and some of + // those could be inactive; just ignore them + continue; + } + // If thread is transitioning still to in_use state, wait for it + if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { + while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) + KMP_CPU_PAUSE(); + } + // The thread should be in_use now + KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); + // Transition to unused state + team->t.t_threads[f]->th.th_used_in_team.store(2); + KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); + } + // Release all the workers + kmp_uint64 new_value; // new value for go + new_value = team->t.b->go_release(); + + KMP_MFENCE(); + + // Workers should see transition status 2 and move to 0; but may need to be + // woken up first + size_t my_go_index; + int count = old_nthreads - 1; + while (count > 0) { + count = old_nthreads - 1; + for (int f = 1; f < old_nthreads; ++f) { + my_go_index = f / team->t.b->threads_per_go; + if (other_threads[f]->th.th_used_in_team.load() != 0) { + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers + kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( + void *, other_threads[f]->th.th_sleep_loc); + __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag); + } + } else { + KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0); + count--; + } + } + } + // Now update the barrier size + team->t.b->update_num_threads(new_nthreads); + team->t.b->go_reset(); +} + +void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) { + // Add the threads back to the team + KMP_DEBUG_ASSERT(team); + // Threads were paused and pointed at th_used_in_team temporarily during a + // resize of the team. We're going to set th_used_in_team to 3 to indicate to + // the thread that it should transition itself back into the team. Then, if + // blocktime isn't infinite, the thread could be sleeping, so we send a resume + // to wake it up. + for (int f = 1; f < new_nthreads; ++f) { + KMP_DEBUG_ASSERT(team->t.t_threads[f]); + KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0, + 3); + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads + __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid, + (kmp_flag_32<false, false> *)NULL); + } + } + // The threads should be transitioning to the team; when they are done, they + // should have set th_used_in_team to 1. This loop forces master to wait until + // all threads have moved into the team and are waiting in the barrier. 
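For orientation, the callers shown earlier in this diff pair these two helpers around a hot-team size change; the lines below are only a sketch of that call order as it appears in the __kmp_set_num_threads hunk, not additional runtime code:

// Hot team resized under the distributed barrier, roughly:
//
//   __kmp_resize_dist_barrier(hot_team, old_nth, new_nth);
//     // park the current workers (th_used_in_team 1 -> 2 -> 0), wake any
//     // sleepers, then size the barrier for new_nth and reset the go flag
//   ... release surplus workers or allocate new ones, adjust t_nproc ...
//   hot_team->t.b->update_num_threads(new_nth);
//   __kmp_add_threads_to_team(hot_team, new_nth);
//     // re-admit workers (th_used_in_team 0 -> 3 -> 1) and wait for them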
+ int count = new_nthreads - 1; + while (count > 0) { + count = new_nthreads - 1; + for (int f = 1; f < new_nthreads; ++f) { + if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { + count--; + } + } + } +} + // Globals and functions for hidden helper task kmp_info_t **__kmp_hidden_helper_threads; kmp_info_t *__kmp_hidden_helper_main_thread; diff --git a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp index 0b0973c766..112502fdce 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp @@ -164,7 +164,12 @@ int __kmp_convert_to_milliseconds(char const *data) { return (INT_MAX); value = (double)0.0; mult = '\0'; +#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT + // On Windows, each %c parameter needs additional size parameter for sscanf_s + nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, 1, &extra, 1); +#else nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, &extra); +#endif if (nvalues < 1) return (-1); if (nvalues == 1) @@ -297,8 +302,8 @@ void __kmp_check_stksize(size_t *val) { // if system stack size is too big then limit the size for worker threads if (*val > KMP_DEFAULT_STKSIZE * 16) // just a heuristics... *val = KMP_DEFAULT_STKSIZE * 16; - if (*val < KMP_MIN_STKSIZE) - *val = KMP_MIN_STKSIZE; + if (*val < __kmp_sys_min_stksize) + *val = __kmp_sys_min_stksize; if (*val > KMP_MAX_STKSIZE) *val = KMP_MAX_STKSIZE; // dead code currently, but may work in future #if KMP_OS_DARWIN @@ -426,6 +431,7 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, int *out_range, char *out_routine, char *out_file, int *out_lb, int *out_ub) { + const char *par_range_value; size_t len = KMP_STRLEN(value) + 1; par_range_to_print = (char *)KMP_INTERNAL_MALLOC(len + 1); KMP_STRNCPY_S(par_range_to_print, len + 1, value, len + 1); @@ -434,11 +440,14 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, __kmp_par_range_ub = INT_MAX; for (;;) { unsigned int len; - if (*value == '\0') { + if (!value || *value == '\0') { break; } if (!__kmp_strcasecmp_with_sentinel("routine", value, '=')) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; len = __kmp_readstr_with_sentinel(out_routine, value, KMP_PAR_RANGE_ROUTINE_LEN - 1, ','); if (len == 0) { @@ -451,7 +460,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, continue; } if (!__kmp_strcasecmp_with_sentinel("filename", value, '=')) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; len = __kmp_readstr_with_sentinel(out_file, value, KMP_PAR_RANGE_FILENAME_LEN - 1, ','); if (len == 0) { @@ -465,7 +477,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, } if ((!__kmp_strcasecmp_with_sentinel("range", value, '=')) || (!__kmp_strcasecmp_with_sentinel("incl_range", value, '='))) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) { goto par_range_error; } @@ -477,7 +492,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, continue; } if (!__kmp_strcasecmp_with_sentinel("excl_range", value, '=')) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if 
(!par_range_value) + goto par_range_error; + value = par_range_value; if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) { goto par_range_error; } @@ -1684,6 +1702,8 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value, const char *var; /* ---------- Barrier method control ------------ */ + static int dist_req = 0, non_dist_req = 0; + static bool warn = 1; for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { var = __kmp_barrier_pattern_env_name[i]; @@ -1695,6 +1715,11 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value, for (j = bp_linear_bar; j < bp_last_bar; j++) { if (__kmp_match_with_sentinel(__kmp_barrier_pattern_name[j], value, 1, ',')) { + if (j == bp_dist_bar) { + dist_req++; + } else { + non_dist_req++; + } __kmp_barrier_gather_pattern[i] = (kmp_bar_pat_e)j; break; } @@ -1709,6 +1734,11 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value, if (comma != NULL) { for (j = bp_linear_bar; j < bp_last_bar; j++) { if (__kmp_str_match(__kmp_barrier_pattern_name[j], 1, comma + 1)) { + if (j == bp_dist_bar) { + dist_req++; + } else { + non_dist_req++; + } __kmp_barrier_release_pattern[i] = (kmp_bar_pat_e)j; break; } @@ -1723,6 +1753,20 @@ static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value, } } } + if (dist_req != 0) { + // set all barriers to dist + if ((non_dist_req != 0) && warn) { + KMP_INFORM(BarrierPatternOverride, name, + __kmp_barrier_pattern_name[bp_dist_bar]); + warn = 0; + } + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + if (__kmp_barrier_release_pattern[i] != bp_dist_bar) + __kmp_barrier_release_pattern[i] = bp_dist_bar; + if (__kmp_barrier_gather_pattern[i] != bp_dist_bar) + __kmp_barrier_gather_pattern[i] = bp_dist_bar; + } + } } // __kmp_stg_parse_barrier_pattern static void __kmp_stg_print_barrier_pattern(kmp_str_buf_t *buffer, @@ -1739,7 +1783,7 @@ static void __kmp_stg_print_barrier_pattern(kmp_str_buf_t *buffer, __kmp_str_buf_print(buffer, " %s='", __kmp_barrier_pattern_env_name[i]); } - KMP_DEBUG_ASSERT(j < bs_last_barrier && k < bs_last_barrier); + KMP_DEBUG_ASSERT(j < bp_last_bar && k < bp_last_bar); __kmp_str_buf_print(buffer, "%s,%s'\n", __kmp_barrier_pattern_name[j], __kmp_barrier_pattern_name[k]); } @@ -3092,6 +3136,7 @@ static void __kmp_stg_parse_topology_method(char const *name, char const *value, } #if KMP_GROUP_AFFINITY else if (__kmp_str_match("group", 1, value)) { + KMP_WARNING(StgDeprecatedValue, name, value, "all"); __kmp_affinity_top_method = affinity_top_method_group; } #endif /* KMP_GROUP_AFFINITY */ @@ -3155,6 +3200,47 @@ static void __kmp_stg_print_topology_method(kmp_str_buf_t *buffer, } } // __kmp_stg_print_topology_method +// KMP_TEAMS_PROC_BIND +struct kmp_proc_bind_info_t { + const char *name; + kmp_proc_bind_t proc_bind; +}; +static kmp_proc_bind_info_t proc_bind_table[] = { + {"spread", proc_bind_spread}, + {"true", proc_bind_spread}, + {"close", proc_bind_close}, + // teams-bind = false means "replicate the primary thread's affinity" + {"false", proc_bind_primary}, + {"primary", proc_bind_primary}}; +static void __kmp_stg_parse_teams_proc_bind(char const *name, char const *value, + void *data) { + int valid; + const char *end; + valid = 0; + for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]); + ++i) { + if (__kmp_match_str(proc_bind_table[i].name, value, &end)) { + __kmp_teams_proc_bind = proc_bind_table[i].proc_bind; + valid = 1; + break; + } + } + if (!valid) { + 
KMP_WARNING(StgInvalidValue, name, value); + } +} +static void __kmp_stg_print_teams_proc_bind(kmp_str_buf_t *buffer, + char const *name, void *data) { + const char *value = KMP_I18N_STR(NotDefined); + for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]); + ++i) { + if (__kmp_teams_proc_bind == proc_bind_table[i].proc_bind) { + value = proc_bind_table[i].name; + break; + } + } + __kmp_stg_print_str(buffer, name, value); +} #endif /* KMP_AFFINITY_SUPPORTED */ // OMP_PROC_BIND / bind-var is functional on all 4.0 builds, including OS X* @@ -4415,7 +4501,7 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value, } #if KMP_USE_ADAPTIVE_LOCKS else if (__kmp_str_match("adaptive", 1, value)) { - if (__kmp_cpuinfo.rtm) { // ??? Is cpuinfo available here? + if (__kmp_cpuinfo.flags.rtm) { // ??? Is cpuinfo available here? __kmp_user_lock_kind = lk_adaptive; KMP_STORE_LOCK_SEQ(adaptive); } else { @@ -4427,7 +4513,7 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value, #endif // KMP_USE_ADAPTIVE_LOCKS #if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX else if (__kmp_str_match("rtm_queuing", 1, value)) { - if (__kmp_cpuinfo.rtm) { + if (__kmp_cpuinfo.flags.rtm) { __kmp_user_lock_kind = lk_rtm_queuing; KMP_STORE_LOCK_SEQ(rtm_queuing); } else { @@ -4436,7 +4522,7 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value, KMP_STORE_LOCK_SEQ(queuing); } } else if (__kmp_str_match("rtm_spin", 1, value)) { - if (__kmp_cpuinfo.rtm) { + if (__kmp_cpuinfo.flags.rtm) { __kmp_user_lock_kind = lk_rtm_spin; KMP_STORE_LOCK_SEQ(rtm_spin); } else { @@ -4875,28 +4961,85 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value, // Check each component for (int i = 0; i < level; ++i) { - int offset = 0; - int num = atoi(components[i]); // each component should start with a number - if (num <= 0) { - goto err; // only positive integers are valid for count - } - if ((pos = strchr(components[i], '@'))) { - offset = atoi(pos + 1); // save offset - *pos = '\0'; // cut the offset from the component - } - pos = components[i] + strspn(components[i], digits); - if (pos == components[i]) { - goto err; - } - // detect the component type - kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos); - if (type == KMP_HW_UNKNOWN) { - goto err; - } - if (__kmp_hw_subset->specified(type)) { - goto err; + int core_level = 0; + char *core_components[MAX_T_LEVEL]; + // Split possible core components by '&' delimiter + pos = components[i]; + core_components[core_level++] = pos; + while ((pos = strchr(pos, '&'))) { + if (core_level >= MAX_T_LEVEL) + goto err; // too many different core types + *pos = '\0'; // modify input and avoid more copying + core_components[core_level++] = ++pos; // expect something after '&' + } + + for (int j = 0; j < core_level; ++j) { + char *offset_ptr; + char *attr_ptr; + int offset = 0; + kmp_hw_attr_t attr; + int num; + // components may begin with an optional count of the number of resources + if (isdigit(*core_components[j])) { + num = atoi(core_components[j]); + if (num <= 0) { + goto err; // only positive integers are valid for count + } + pos = core_components[j] + strspn(core_components[j], digits); + } else if (*core_components[j] == '*') { + num = kmp_hw_subset_t::USE_ALL; + pos = core_components[j] + 1; + } else { + num = kmp_hw_subset_t::USE_ALL; + pos = core_components[j]; + } + + offset_ptr = strchr(core_components[j], '@'); + attr_ptr = strchr(core_components[j], ':'); + + if (offset_ptr) { + offset = 
atoi(offset_ptr + 1); // save offset + *offset_ptr = '\0'; // cut the offset from the component + } + if (attr_ptr) { + attr.clear(); + // save the attribute +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_str_match("intel_core", -1, attr_ptr + 1)) { + attr.set_core_type(KMP_HW_CORE_TYPE_CORE); + } else if (__kmp_str_match("intel_atom", -1, attr_ptr + 1)) { + attr.set_core_type(KMP_HW_CORE_TYPE_ATOM); + } +#endif + if (__kmp_str_match("eff", 3, attr_ptr + 1)) { + const char *number = attr_ptr + 1; + // skip the eff[iciency] token + while (isalpha(*number)) + number++; + if (!isdigit(*number)) { + goto err; + } + int efficiency = atoi(number); + attr.set_core_eff(efficiency); + } else { + goto err; + } + *attr_ptr = '\0'; // cut the attribute from the component + } + // detect the component type + kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos); + if (type == KMP_HW_UNKNOWN) { + goto err; + } + // Only the core type can have attributes + if (attr && type != KMP_HW_CORE) + goto err; + // Must allow core be specified more than once + if (type != KMP_HW_CORE && __kmp_hw_subset->specified(type)) { + goto err; + } + __kmp_hw_subset->push_back(num, type, offset, attr); } - __kmp_hw_subset->push_back(num, type, offset); } return; err: @@ -4908,6 +5051,21 @@ err: return; } +static inline const char * +__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "intel_atom"; + case KMP_HW_CORE_TYPE_CORE: + return "intel_core"; +#endif + } + return "unknown"; +} + static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, void *data) { kmp_str_buf_t buf; @@ -4923,10 +5081,20 @@ static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, depth = __kmp_hw_subset->get_depth(); for (int i = 0; i < depth; ++i) { const auto &item = __kmp_hw_subset->at(i); - __kmp_str_buf_print(&buf, "%s%d%s", (i > 0 ? "," : ""), item.num, - __kmp_hw_get_keyword(item.type)); - if (item.offset) - __kmp_str_buf_print(&buf, "@%d", item.offset); + if (i > 0) + __kmp_str_buf_print(&buf, "%c", ','); + for (int j = 0; j < item.num_attrs; ++j) { + __kmp_str_buf_print(&buf, "%s%d%s", (j > 0 ? 
"&" : ""), item.num[j], + __kmp_hw_get_keyword(item.type)); + if (item.attr[j].is_core_type_valid()) + __kmp_str_buf_print( + &buf, ":%s", + __kmp_hw_get_core_type_keyword(item.attr[j].get_core_type())); + if (item.attr[j].is_core_eff_valid()) + __kmp_str_buf_print(&buf, ":eff%d", item.attr[j].get_core_eff()); + if (item.offset[j]) + __kmp_str_buf_print(&buf, "@%d", item.offset[j]); + } } __kmp_str_buf_print(buffer, "%s'\n", buf.str); __kmp_str_buf_free(&buf); @@ -5003,6 +5171,27 @@ static void __kmp_stg_print_mwait_hints(kmp_str_buf_t *buffer, char const *name, #endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +#if KMP_HAVE_UMWAIT +// ----------------------------------------------------------------------------- +// KMP_TPAUSE +// 0 = don't use TPAUSE, 1 = use C0.1 state, 2 = use C0.2 state + +static void __kmp_stg_parse_tpause(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_tpause_state); + if (__kmp_tpause_state != 0) { + // The actual hint passed to tpause is: 0 for C0.2 and 1 for C0.1 + if (__kmp_tpause_state == 2) // use C0.2 + __kmp_tpause_hint = 0; // default was set to 1 for C0.1 + } +} // __kmp_stg_parse_tpause + +static void __kmp_stg_print_tpause(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_tpause_state); +} // __kmp_stg_print_tpause +#endif // KMP_HAVE_UMWAIT + // ----------------------------------------------------------------------------- // OMP_DISPLAY_ENV @@ -5260,6 +5449,8 @@ static kmp_setting_t __kmp_stg_table[] = { #endif /* KMP_GOMP_COMPAT */ {"OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind, NULL, 0, 0}, + {"KMP_TEAMS_PROC_BIND", __kmp_stg_parse_teams_proc_bind, + __kmp_stg_print_teams_proc_bind, NULL, 0, 0}, {"OMP_PLACES", __kmp_stg_parse_places, __kmp_stg_print_places, NULL, 0, 0}, {"KMP_TOPOLOGY_METHOD", __kmp_stg_parse_topology_method, __kmp_stg_print_topology_method, NULL, 0, 0}, @@ -5366,6 +5557,10 @@ static kmp_setting_t __kmp_stg_table[] = { {"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints, __kmp_stg_print_mwait_hints, NULL, 0, 0}, #endif + +#if KMP_HAVE_UMWAIT + {"KMP_TPAUSE", __kmp_stg_parse_tpause, __kmp_stg_print_tpause, NULL, 0, 0}, +#endif {"", NULL, NULL, NULL, 0, 0}}; // settings static int const __kmp_stg_count = @@ -5942,65 +6137,27 @@ void __kmp_env_initialize(char const *string) { // Handle the Win 64 group affinity stuff if there are multiple // processor groups, or if the user requested it, and OMP 4.0 // affinity is not in effect. - if (((__kmp_num_proc_groups > 1) && - (__kmp_affinity_type == affinity_default) && - (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default)) || - (__kmp_affinity_top_method == affinity_top_method_group)) { + if (__kmp_num_proc_groups > 1 && + __kmp_affinity_type == affinity_default && + __kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { + // Do not respect the initial processor affinity mask if it is assigned + // exactly one Windows Processor Group since this is interpreted as the + // default OS assignment. Not respecting the mask allows the runtime to + // use all the logical processors in all groups. if (__kmp_affinity_respect_mask == affinity_respect_mask_default && exactly_one_group) { __kmp_affinity_respect_mask = FALSE; } + // Use compact affinity with anticipation of pinning to at least the + // group granularity since threads can only be bound to one group. 
if (__kmp_affinity_type == affinity_default) { __kmp_affinity_type = affinity_compact; __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; } - if (__kmp_affinity_top_method == affinity_top_method_default) { - if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { - __kmp_affinity_top_method = affinity_top_method_group; - __kmp_affinity_gran = KMP_HW_PROC_GROUP; - } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) { - __kmp_affinity_top_method = affinity_top_method_group; - } else { - __kmp_affinity_top_method = affinity_top_method_all; - } - } else if (__kmp_affinity_top_method == affinity_top_method_group) { - if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { - __kmp_affinity_gran = KMP_HW_PROC_GROUP; - } else if ((__kmp_affinity_gran != KMP_HW_PROC_GROUP) && - (__kmp_affinity_gran != KMP_HW_THREAD)) { - const char *str = __kmp_hw_get_keyword(__kmp_affinity_gran); - KMP_WARNING(AffGranTopGroup, var, str); - __kmp_affinity_gran = KMP_HW_THREAD; - } - } else { - if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { - __kmp_affinity_gran = KMP_HW_CORE; - } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) { - const char *str = NULL; - switch (__kmp_affinity_type) { - case affinity_physical: - str = "physical"; - break; - case affinity_logical: - str = "logical"; - break; - case affinity_compact: - str = "compact"; - break; - case affinity_scatter: - str = "scatter"; - break; - case affinity_explicit: - str = "explicit"; - break; - // No MIC on windows, so no affinity_balanced case - default: - KMP_DEBUG_ASSERT(0); - } - KMP_WARNING(AffGranGroupType, var, str); - __kmp_affinity_gran = KMP_HW_CORE; - } - } + if (__kmp_affinity_top_method == affinity_top_method_default) + __kmp_affinity_top_method = affinity_top_method_all; + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) + __kmp_affinity_gran = KMP_HW_PROC_GROUP; } else #endif /* KMP_GROUP_AFFINITY */ diff --git a/contrib/libs/cxxsupp/openmp/kmp_stats.h b/contrib/libs/cxxsupp/openmp/kmp_stats.h index 78bbb9068a..0e3ea3b9cf 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_stats.h +++ b/contrib/libs/cxxsupp/openmp/kmp_stats.h @@ -246,6 +246,8 @@ enum stats_state_e { // KMP_tree_release -- time in __kmp_tree_barrier_release // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather // KMP_hyper_release -- time in __kmp_hyper_barrier_release +// KMP_dist_gather -- time in __kmp_dist_barrier_gather +// KMP_dist_release -- time in __kmp_dist_barrier_release // clang-format off #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ macro(KMP_fork_call, 0, arg) \ @@ -255,6 +257,8 @@ enum stats_state_e { macro(KMP_hier_release, 0, arg) \ macro(KMP_hyper_gather, 0, arg) \ macro(KMP_hyper_release, 0, arg) \ + macro(KMP_dist_gather, 0, arg) \ + macro(KMP_dist_release, 0, arg) \ macro(KMP_linear_gather, 0, arg) \ macro(KMP_linear_release, 0, arg) \ macro(KMP_tree_gather, 0, arg) \ diff --git a/contrib/libs/cxxsupp/openmp/kmp_str.cpp b/contrib/libs/cxxsupp/openmp/kmp_str.cpp index ffce2b88ab..e64f989fbc 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_str.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_str.cpp @@ -515,6 +515,31 @@ int __kmp_str_match(char const *target, int len, char const *data) { return ((len > 0) ? i >= len : (!target[i] && (len || !data[i]))); } // __kmp_str_match +// If data contains all of target, returns true, otherwise returns false. 
+// len should be the length of target +bool __kmp_str_contains(char const *target, int len, char const *data) { + int i = 0, j = 0, start = 0; + if (target == NULL || data == NULL) { + return FALSE; + } + while (target[i]) { + if (!data[j]) + return FALSE; + if (TOLOWER(target[i]) != TOLOWER(data[j])) { + j = start + 1; + start = j; + i = 0; + } else { + if (i == 0) + start = j; + j++; + i++; + } + } + + return i == len; +} // __kmp_str_contains + int __kmp_str_match_false(char const *data) { int result = __kmp_str_match("false", 1, data) || __kmp_str_match("off", 2, data) || diff --git a/contrib/libs/cxxsupp/openmp/kmp_str.h b/contrib/libs/cxxsupp/openmp/kmp_str.h index ff6179908e..855b5df55d 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_str.h +++ b/contrib/libs/cxxsupp/openmp/kmp_str.h @@ -106,6 +106,7 @@ int __kmp_str_eqf(char const *lhs, char const *rhs); char *__kmp_str_format(char const *format, ...); void __kmp_str_free(char **str); int __kmp_str_match(char const *target, int len, char const *data); +bool __kmp_str_contains(char const *target, int len, char const *data); int __kmp_str_match_false(char const *data); int __kmp_str_match_true(char const *data); void __kmp_str_replace(char *str, char search_for, char replace_with); diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp index dd3e7688d3..501830eaa7 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp @@ -86,6 +86,7 @@ static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread, h->buckets = (kmp_dephash_entry **)(h + 1); h->generation = gen; h->nconflicts = 0; + h->last_all = current_dephash->last_all; // make sure buckets are properly initialized for (size_t i = 0; i < new_size; i++) { @@ -142,6 +143,7 @@ static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread, h->nelements = 0; h->nconflicts = 0; h->buckets = (kmp_dephash_entry **)(h + 1); + h->last_all = NULL; for (size_t i = 0; i < h_size; i++) h->buckets[i] = 0; @@ -174,7 +176,10 @@ static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread, thread, sizeof(kmp_dephash_entry_t)); #endif entry->addr = addr; - entry->last_out = NULL; + if (!h->last_all) // no predecessor task with omp_all_memory dependence + entry->last_out = NULL; + else // else link the omp_all_memory depnode to the new entry + entry->last_out = __kmp_node_ref(h->last_all); entry->last_set = NULL; entry->prev_set = NULL; entry->last_flag = 0; @@ -290,6 +295,63 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, return npredecessors; } +static inline kmp_int32 +__kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h, + bool dep_barrier, kmp_task_t *task) { + KA_TRACE(30, ("__kmp_process_dep_all: T#%d processing dep_all, " + "dep_barrier = %d\n", + gtid, dep_barrier)); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_int32 npredecessors = 0; + + // process previous omp_all_memory node if any + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, h->last_all); + __kmp_node_deref(thread, h->last_all); + if (!dep_barrier) { + h->last_all = __kmp_node_ref(node); + } else { + // if this is a sync point in the serial sequence, then the previous + // outputs are guaranteed to be completed after the execution of this + // task so the previous output nodes can be cleared. 
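A dependence-synchronizing construct is what sets dep_barrier here; a taskwait with a depend clause is one way user code reaches this path (that mapping is an assumption based on how the flag is used elsewhere in the runtime, not something this hunk shows). A minimal sketch, assuming an OpenMP 5.0 compiler:

#include <cstdio>

void produce_and_consume() {
  int x = 0;
#pragma omp parallel
#pragma omp single
  {
#pragma omp task depend(out: x) shared(x)
    { x = 42; }
    // Dependence barrier: after this point every earlier sibling task writing
    // x (including one carrying an omp_all_memory dependence) has completed,
    // which is why the runtime can drop its recorded output nodes here.
#pragma omp taskwait depend(in: x)
    std::printf("%d\n", x); // safe: the producer task has finished
  }
}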
+ h->last_all = NULL; + } + + // process all regular dependences + for (size_t i = 0; i < h->size; i++) { + kmp_dephash_entry_t *info = h->buckets[i]; + if (!info) // skip empty slots in dephash + continue; + for (; info; info = info->next_in_bucket) { + // for each entry the omp_all_memory works as OUT dependence + kmp_depnode_t *last_out = info->last_out; + kmp_depnode_list_t *last_set = info->last_set; + kmp_depnode_list_t *prev_set = info->prev_set; + if (last_set) { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_set); + __kmp_depnode_list_free(thread, last_set); + __kmp_depnode_list_free(thread, prev_set); + info->last_set = NULL; + info->prev_set = NULL; + info->last_flag = 0; // no sets in this dephash entry + } else { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_out); + } + __kmp_node_deref(thread, last_out); + if (!dep_barrier) { + info->last_out = __kmp_node_ref(node); + } else { + info->last_out = NULL; + } + } + } + KA_TRACE(30, ("__kmp_process_dep_all: T#%d found %d predecessors\n", gtid, + npredecessors)); + return npredecessors; +} + template <bool filter> static inline kmp_int32 __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash, @@ -417,7 +479,7 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) { - int i, n_mtxs = 0; + int i, n_mtxs = 0, dep_all = 0; #if KMP_DEBUG kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); #endif @@ -429,7 +491,8 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, // Filter deps in dep_list // TODO: Different algorithm for large dep_list ( > 10 ? ) for (i = 0; i < ndeps; i++) { - if (dep_list[i].base_addr != 0) { + if (dep_list[i].base_addr != 0 && + dep_list[i].base_addr != (kmp_intptr_t)KMP_SIZE_T_MAX) { KMP_DEBUG_ASSERT( dep_list[i].flag == KMP_DEP_IN || dep_list[i].flag == KMP_DEP_OUT || dep_list[i].flag == KMP_DEP_INOUT || @@ -451,6 +514,13 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, dep_list[i].flag = KMP_DEP_OUT; // downgrade mutexinoutset to inout } } + } else if (dep_list[i].flag == KMP_DEP_ALL || + dep_list[i].base_addr == (kmp_intptr_t)KMP_SIZE_T_MAX) { + // omp_all_memory dependence can be marked by compiler by either + // (addr=0 && flag=0x80) (flag KMP_DEP_ALL), or (addr=-1). 
+ // omp_all_memory overrides all other dependences if any + dep_all = 1; + break; } } @@ -464,10 +534,14 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, // the end int npredecessors; - npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, ndeps, - dep_list, task); - npredecessors += __kmp_process_deps<false>( - gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task); + if (!dep_all) { // regular dependences + npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, + ndeps, dep_list, task); + npredecessors += __kmp_process_deps<false>( + gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task); + } else { // omp_all_memory dependence + npredecessors = __kmp_process_dep_all(gtid, node, *hash, dep_barrier, task); + } node->dn.task = task; KMP_MB(); @@ -755,8 +829,10 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, bool ignore = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final; - ignore = ignore && thread->th.th_task_team != NULL && - thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE; + ignore = + ignore && thread->th.th_task_team != NULL && + thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE && + thread->th.th_task_team->tt.tt_hidden_helper_task_encountered == FALSE; ignore = ignore || current_task->td_dephash == NULL; if (ignore) { diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h index 73abf07018..99f182bbd0 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h +++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h @@ -73,6 +73,8 @@ static inline void __kmp_dephash_free_entries(kmp_info_t *thread, h->buckets[i] = 0; } } + __kmp_node_deref(thread, h->last_all); + h->last_all = NULL; } static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) { @@ -144,9 +146,10 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) { // encountering thread's queue; otherwise, it can be pushed to its own // queue. if (!next_taskdata->td_flags.hidden_helper) { - __kmpc_give_task( - successor->dn.task, - __kmp_tid_from_gtid(next_taskdata->encountering_gtid)); + kmp_int32 encountering_gtid = + next_taskdata->td_alloc_thread->th.th_info.ds.ds_gtid; + kmp_int32 encountering_tid = __kmp_tid_from_gtid(encountering_gtid); + __kmpc_give_task(successor->dn.task, encountering_tid); } else { __kmp_omp_task(gtid, successor->dn.task, false); } diff --git a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp index 55e9c30763..e445438524 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp @@ -324,10 +324,16 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); - // We don't need to map to shadow gtid if it is already hidden helper thread - if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) { - gtid = KMP_GTID_TO_SHADOW_GTID(gtid); - thread = __kmp_threads[gtid]; + // If we encounter a hidden helper task, and the current thread is not a + // hidden helper thread, we have to give the task to any hidden helper thread + // starting from its shadow one. 
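For context on the kmp_taskdeps.cpp additions above: __kmp_process_dep_all and the KMP_DEP_ALL / base_addr == -1 checks implement the OpenMP 5.1 omp_all_memory reserved locator. A hedged user-level sketch of what drives that path (illustrative code, not part of this patch):

void all_memory_fence_example() {
#pragma omp parallel
#pragma omp single
  {
    int a = 0, b = 0;
#pragma omp task depend(out: a) shared(a)
    { a = 1; }
#pragma omp task depend(out: b) shared(b)
    { b = 2; }
    // Ordered after every prior sibling task with a dependence, and before
    // any later one, regardless of the addresses they listed.
#pragma omp task depend(inout: omp_all_memory) shared(a, b)
    { a += b; }
#pragma omp taskwait
  }
}

Per the comment above, a compiler may encode the omp_all_memory item either as a dependence with flag KMP_DEP_ALL and a null address, or with base_addr == -1; either form makes __kmp_check_deps take the __kmp_process_dep_all path.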
+ if (UNLIKELY(taskdata->td_flags.hidden_helper && + !KMP_HIDDEN_HELPER_THREAD(gtid))) { + kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid); + __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid)); + // Signal the hidden helper threads. + __kmp_hidden_helper_worker_thread_signal(); + return TASK_SUCCESSFULLY_PUSHED; } kmp_task_team_t *task_team = thread->th.th_task_team; @@ -434,16 +440,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); - auto hidden_helper = taskdata->td_flags.hidden_helper; - __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); - // Signal one worker thread to execute the task - if (UNLIKELY(hidden_helper)) { - // Wake hidden helper threads up if they're sleeping - __kmp_hidden_helper_worker_thread_signal(); - } - return TASK_SUCCESSFULLY_PUSHED; } @@ -809,6 +807,24 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid, gtid, taskdata, children)); } +// Only need to keep track of child task counts if any of the following: +// 1. team parallel and tasking not serialized; +// 2. it is a proxy or detachable or hidden helper task +// 3. the children counter of its parent task is greater than 0. +// The reason for the 3rd one is for serialized team that found detached task, +// hidden helper task, T. In this case, the execution of T is still deferred, +// and it is also possible that a regular task depends on T. In this case, if we +// don't track the children, task synchronization will be broken. +static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) { + kmp_tasking_flags_t flags = taskdata->td_flags; + bool ret = !(flags.team_serial || flags.tasking_ser); + ret = ret || flags.proxy == TASK_PROXY || + flags.detachable == TASK_DETACHABLE || flags.hidden_helper; + ret = ret || + KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0; + return ret; +} + // __kmp_task_finish: bookkeeping to do when a task finishes execution // // gtid: global thread ID for calling thread @@ -825,8 +841,9 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, kmp_info_t *thread = __kmp_threads[gtid]; kmp_task_team_t *task_team = thread->th.th_task_team; // might be NULL for serial teams... +#if KMP_DEBUG kmp_int32 children = 0; - +#endif KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming " "task %p\n", gtid, taskdata, resumed_task)); @@ -934,16 +951,15 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, if (ompt) __ompt_task_finish(task, resumed_task, ompt_task_complete); #endif - - // Only need to keep track of count if team parallel and tasking not - // serialized, or task is detachable and event has already been fulfilled - if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || - taskdata->td_flags.detachable == TASK_DETACHABLE || - taskdata->td_flags.hidden_helper) { + // TODO: What would be the balance between the conditions in the function + // and an atomic operation? 
+ if (__kmp_track_children_task(taskdata)) { __kmp_release_deps(gtid, taskdata); // Predecrement simulated by "- 1" calculation - children = - KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; +#if KMP_DEBUG + children = -1 + +#endif + KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); KMP_DEBUG_ASSERT(children >= 0); if (taskdata->td_taskgroup) KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); @@ -1189,7 +1205,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task; kmp_taskdata_t *taskdata; kmp_info_t *thread = __kmp_threads[gtid]; - kmp_info_t *encountering_thread = thread; kmp_team_t *team = thread->th.th_team; kmp_taskdata_t *parent_task = thread->th.th_current_task; size_t shareds_offset; @@ -1201,15 +1216,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, if (__kmp_enable_hidden_helper) { if (!TCR_4(__kmp_init_hidden_helper)) __kmp_hidden_helper_initialize(); - - // For a hidden helper task encountered by a regular thread, we will push - // the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper - // thread. - if (!KMP_HIDDEN_HELPER_THREAD(gtid)) { - thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)]; - // We don't change the parent-child relation for hidden helper task as - // we need that to do per-task-region synchronization. - } } else { // If the hidden helper task is not enabled, reset the flag to FALSE. flags->hidden_helper = FALSE; @@ -1232,8 +1238,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, // Untied task encountered causes the TSC algorithm to check entire deque of // the victim thread. If no untied task encountered, then checking the head // of the deque should be enough. - KMP_CHECK_UPDATE( - encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1); + KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1); } // Detachable tasks are not proxy tasks yet but could be in the future. Doing @@ -1247,32 +1252,30 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, } /* are we running in a sequential parallel or tskm_immediate_exec... 
we need tasking support enabled */ - if ((encountering_thread->th.th_task_team) == NULL) { + if ((thread->th.th_task_team) == NULL) { /* This should only happen if the team is serialized setup a task team and propagate it to the thread */ KMP_DEBUG_ASSERT(team->t.t_serialized); KA_TRACE(30, ("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid)); - __kmp_task_team_setup( - encountering_thread, team, - 1); // 1 indicates setup the current team regardless of nthreads - encountering_thread->th.th_task_team = - team->t.t_task_team[encountering_thread->th.th_task_state]; + // 1 indicates setup the current team regardless of nthreads + __kmp_task_team_setup(thread, team, 1); + thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state]; } - kmp_task_team_t *task_team = encountering_thread->th.th_task_team; + kmp_task_team_t *task_team = thread->th.th_task_team; /* tasking must be enabled now as the task might not be pushed */ if (!KMP_TASKING_ENABLED(task_team)) { KA_TRACE( 30, ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid)); - __kmp_enable_tasking(task_team, encountering_thread); - kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid; + __kmp_enable_tasking(task_team, thread); + kmp_int32 tid = thread->th.th_info.ds.ds_tid; kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; // No lock needed since only owner can allocate if (thread_data->td.td_deque == NULL) { - __kmp_alloc_task_deque(encountering_thread, thread_data); + __kmp_alloc_task_deque(thread, thread_data); } } @@ -1297,11 +1300,11 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, // Avoid double allocation here by combining shareds with taskdata #if USE_FAST_MEMORY - taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( - encountering_thread, shareds_offset + sizeof_shareds); + taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset + + sizeof_shareds); #else /* ! USE_FAST_MEMORY */ - taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( - encountering_thread, shareds_offset + sizeof_shareds); + taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset + + sizeof_shareds); #endif /* USE_FAST_MEMORY */ task = KMP_TASKDATA_TO_TASK(taskdata); @@ -1328,7 +1331,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, taskdata->td_task_id = KMP_GEN_TASK_ID(); taskdata->td_team = thread->th.th_team; - taskdata->td_alloc_thread = encountering_thread; + taskdata->td_alloc_thread = thread; taskdata->td_parent = parent_task; taskdata->td_level = parent_task->td_level + 1; // increment nesting level KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); @@ -1342,10 +1345,16 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs); taskdata->td_flags = *flags; - taskdata->encountering_gtid = gtid; taskdata->td_task_team = thread->th.th_task_team; taskdata->td_size_alloc = shareds_offset + sizeof_shareds; taskdata->td_flags.tasktype = TASK_EXPLICIT; + // If it is hidden helper task, we need to set the team and task team + // correspondingly. 
+ if (flags->hidden_helper) { + kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)]; + taskdata->td_team = shadow_thread->th.th_team; + taskdata->td_task_team = shadow_thread->th.th_task_team; + } // GEH - TODO: fix this to copy parent task's value of tasking_ser flag taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); @@ -1382,11 +1391,9 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, if (UNLIKELY(ompt_enabled.enabled)) __ompt_task_init(taskdata, gtid); #endif - // Only need to keep track of child task counts if team parallel and tasking - // not serialized or if it is a proxy or detachable or hidden helper task - if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE || - flags->hidden_helper || - !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { + // TODO: What would be the balance between the conditions in the function and + // an atomic operation? + if (__kmp_track_children_task(taskdata)) { KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); if (parent_task->td_taskgroup) KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); @@ -1438,11 +1445,12 @@ kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id) { - if (__kmp_enable_hidden_helper) { - auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags); + auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags); + // target task is untied defined in the specification + input_flags.tiedness = TASK_UNTIED; + + if (__kmp_enable_hidden_helper) input_flags.hidden_helper = TRUE; - input_flags.tiedness = TASK_UNTIED; - } return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t, sizeof_shareds, task_entry); @@ -1613,13 +1621,15 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task) #endif + if (task->routine != NULL) { #ifdef KMP_GOMP_COMPAT - if (taskdata->td_flags.native) { - ((void (*)(void *))(*(task->routine)))(task->shareds); - } else + if (taskdata->td_flags.native) { + ((void (*)(void *))(*(task->routine)))(task->shareds); + } else #endif /* KMP_GOMP_COMPAT */ - { - (*(task->routine))(gtid, task); + { + (*(task->routine))(gtid, task); + } } KMP_POP_PARTITIONED_TIMER(); @@ -2833,15 +2843,14 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, // We need to un-mark this victim as a finished victim. This must be done // before releasing the lock, or else other threads (starting with the // primary thread victim) might be prematurely released from the barrier!!! - kmp_int32 count; - - count = KMP_ATOMIC_INC(unfinished_threads); - +#if KMP_DEBUG + kmp_int32 count = +#endif + KMP_ATOMIC_INC(unfinished_threads); KA_TRACE( 20, ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n", gtid, count + 1, task_team)); - *thread_finished = FALSE; } TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1); @@ -2948,8 +2957,7 @@ static inline int __kmp_execute_tasks_template( (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != NULL)) { asleep = 1; - __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), - other_thread->th.th_sleep_loc); + __kmp_null_resume_wrapper(other_thread); // A sleeping thread should not have any tasks on it's queue. 
// There is a slight possibility that it resumes, steals a task // from another thread, which spawns more tasks, all in the time @@ -3034,9 +3042,10 @@ static inline int __kmp_execute_tasks_template( // done. This decrement might be to the spin location, and result in the // termination condition being satisfied. if (!*thread_finished) { - kmp_int32 count; - - count = KMP_ATOMIC_DEC(unfinished_threads) - 1; +#if KMP_DEBUG + kmp_int32 count = -1 + +#endif + KMP_ATOMIC_DEC(unfinished_threads); KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec " "unfinished_threads to %d task_team=%p\n", gtid, count, task_team)); @@ -3065,6 +3074,18 @@ static inline int __kmp_execute_tasks_template( return FALSE; } + // Check the flag again to see if it has already done in case to be trapped + // into infinite loop when a if0 task depends on a hidden helper task + // outside any parallel region. Detached tasks are not impacted in this case + // because the only thread executing this function has to execute the proxy + // task so it is in another code path that has the same check. + if (flag == NULL || (!final_spin && flag->done_check())) { + KA_TRACE(15, + ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", + gtid)); + return TRUE; + } + // We could be getting tasks from target constructs; if this is the only // thread, keep trying to execute tasks from own queue if (nthreads == 1 && @@ -3098,6 +3119,16 @@ int __kmp_execute_tasks_64( thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } +template <bool C, bool S> +int __kmp_atomic_execute_tasks_64( + kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag, + int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_execute_tasks_template( + thread, gtid, flag, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +} + int __kmp_execute_tasks_oncore( kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -3124,6 +3155,14 @@ template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32, int *USE_ITT_BUILD_ARG(void *), kmp_int32); +template int __kmp_atomic_execute_tasks_64<false, true>( + kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + +template int __kmp_atomic_execute_tasks_64<true, false>( + kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the // next barrier so they can assist in executing enqueued tasks. // First thread in allocates the task team atomically. @@ -3162,7 +3201,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, // tasks and execute them. In extra barrier mode, tasks do not sleep // at the separate tasking barrier, so this isn't a problem. 
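[Editor's note, illustrative aside] The hunks above add an atomic 64-bit counterpart, __kmp_atomic_execute_tasks_64<C, S>, and explicitly instantiate only the <false, true> and <true, false> combinations the runtime actually uses, so the template definition can stay in the .cpp file. A minimal, self-contained sketch of that explicit-instantiation pattern follows; every name in it is illustrative and not part of the runtime:

    // demo_flags.cpp -- illustrative sketch of the explicit-instantiation
    // pattern used for the <Cancellable, Sleepable> flag templates above.
    #include <atomic>
    #include <cstdint>

    constexpr std::uint64_t kSleepBit = 1ULL << 63; // illustrative sleep bit

    template <bool Cancellable = false, bool Sleepable = true> class demo_flag {
      std::atomic<std::uint64_t> *loc;
      std::uint64_t checker;

    public:
      demo_flag(std::atomic<std::uint64_t> *p, std::uint64_t c)
          : loc(p), checker(c) {}
      bool done_check() const {
        // Sleepable flags keep extra state in a high bit, so mask it off
        // before comparing against the release value.
        std::uint64_t v = loc->load(std::memory_order_acquire);
        return Sleepable ? (v & ~kSleepBit) == checker : v == checker;
      }
    };

    // The definition can live in a .cpp file ...
    template <bool C, bool S> int demo_execute_tasks(demo_flag<C, S> *flag) {
      return flag->done_check() ? 1 : 0;
    }

    // ... as long as the variants callers need are instantiated explicitly.
    template int demo_execute_tasks<false, true>(demo_flag<false, true> *);
    template int demo_execute_tasks<true, false>(demo_flag<true, false> *);

Only these two instantiations end up in the object file, which mirrors why the diff lists the same <false, true>/<true, false> pair for the execute, mwait, suspend and resume templates.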
for (i = 0; i < nthreads; i++) { - volatile void *sleep_loc; + void *sleep_loc; kmp_info_t *thread = threads_data[i].td.td_thr; if (i == this_thr->th.th_info.ds.ds_tid) { @@ -3179,7 +3218,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n", __kmp_gtid_from_thread(this_thr), __kmp_gtid_from_thread(thread))); - __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); + __kmp_null_resume_wrapper(thread); } else { KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n", __kmp_gtid_from_thread(this_thr), @@ -3451,6 +3490,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, TCW_4(task_team->tt.tt_found_tasks, FALSE); TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); task_team->tt.tt_nproc = nthreads = team->t.t_nproc; KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); @@ -3512,9 +3552,11 @@ void __kmp_reap_task_teams(void) { void __kmp_wait_to_unref_task_teams(void) { kmp_info_t *thread; kmp_uint32 spins; + kmp_uint64 time; int done; KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); for (;;) { done = TRUE; @@ -3547,7 +3589,7 @@ void __kmp_wait_to_unref_task_teams(void) { __kmp_gtid_from_thread(thread))); if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { - volatile void *sleep_loc; + void *sleep_loc; // If the thread is sleeping, awaken it. if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != NULL) { @@ -3555,7 +3597,7 @@ void __kmp_wait_to_unref_task_teams(void) { 10, ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); - __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); + __kmp_null_resume_wrapper(thread); } } } @@ -3564,7 +3606,7 @@ void __kmp_wait_to_unref_task_teams(void) { } // If oversubscribed or have waited a bit, yield. 
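[Editor's note, illustrative aside] The __kmp_wait_to_unref_task_teams hunk above adds a kmp_uint64 time next to the existing spin counter and seeds it with KMP_INIT_BACKOFF; the yield call directly below gains that extra argument. The runtime's macros are not reproduced in this diff, but the general shape of a spin loop that backs off on either an iteration budget or an elapsed-time budget looks roughly like this sketch (constants and names are illustrative):

    // spin_backoff_sketch.cpp -- illustrative only.
    #include <chrono>
    #include <thread>

    template <class Pred> void spin_wait(Pred done) {
      using clock = std::chrono::steady_clock;
      const auto start = clock::now();
      int spins = 0;
      while (!done()) {
        // Once either the spin budget or the time budget is used up,
        // give up the CPU instead of burning it.
        if (++spins > 4096 ||
            clock::now() - start > std::chrono::microseconds(100)) {
          std::this_thread::yield();
          spins = 0;
        }
      }
    }
    // usage: spin_wait([&] { return work_finished.load(); });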
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); } } @@ -3613,6 +3655,7 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); TCW_4(task_team->tt.tt_found_tasks, FALSE); TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team->t.t_nproc); TCW_4(task_team->tt.tt_active, TRUE); @@ -3705,8 +3748,10 @@ void __kmp_task_team_wait( "setting active to false, setting local and team's pointer to NULL\n", __kmp_gtid_from_thread(this_thr), task_team)); KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || - task_team->tt.tt_found_proxy_tasks == TRUE); + task_team->tt.tt_found_proxy_tasks == TRUE || + task_team->tt.tt_hidden_helper_task_encountered == TRUE); TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); TCW_SYNC_4(task_team->tt.tt_active, FALSE); KMP_MB(); @@ -3869,11 +3914,12 @@ static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) { } static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) { +#if KMP_DEBUG kmp_int32 children = 0; - // Predecrement simulated by "- 1" calculation - children = - KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; + children = -1 + +#endif + KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); KMP_DEBUG_ASSERT(children >= 0); // Remove the imaginary children @@ -3936,7 +3982,7 @@ void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) { // This should be similar to start_k = __kmp_get_random( thread ) % nthreads // but we cannot use __kmp_get_random here - kmp_int32 start_k = start; + kmp_int32 start_k = start % nthreads; kmp_int32 pass = 1; kmp_int32 k = start_k; diff --git a/contrib/libs/cxxsupp/openmp/kmp_utility.cpp b/contrib/libs/cxxsupp/openmp/kmp_utility.cpp index 6531536f5d..9465f720e0 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_utility.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_utility.cpp @@ -135,7 +135,7 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) { p->initialized = 1; - p->sse2 = 1; // Assume SSE2 by default. + p->flags.sse2 = 1; // Assume SSE2 by default. __kmp_x86_cpuid(0, 0, &buf); @@ -175,7 +175,7 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) { data[i] = (t & 0xff); } - p->sse2 = (buf.edx >> 26) & 1; + p->flags.sse2 = (buf.edx >> 26) & 1; #ifdef KMP_DEBUG @@ -253,15 +253,21 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) { i, buf.eax, buf.ebx, buf.ecx, buf.edx)); } #endif -#if KMP_USE_ADAPTIVE_LOCKS - p->rtm = 0; + p->flags.rtm = 0; + p->flags.hybrid = 0; if (max_arg > 7) { /* RTM bit CPUID.07:EBX, bit 11 */ + /* HYRBID bit CPUID.07:EDX, bit 15 */ __kmp_x86_cpuid(7, 0, &buf); - p->rtm = (buf.ebx >> 11) & 1; - KA_TRACE(trace_level, (" RTM")); + p->flags.rtm = (buf.ebx >> 11) & 1; + p->flags.hybrid = (buf.edx >> 15) & 1; + if (p->flags.rtm) { + KA_TRACE(trace_level, (" RTM")); + } + if (p->flags.hybrid) { + KA_TRACE(trace_level, (" HYBRID")); + } } -#endif } { // Parse CPU brand string for frequency, saving the string for later. 
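[Editor's note, illustrative aside] The kmp_utility.cpp hunk above moves the SSE2/RTM bits into the new flags bitfield and additionally samples the hybrid-CPU bit from CPUID leaf 07H (EBX bit 11 for RTM, EDX bit 15 for hybrid). A standalone sketch of the same query, assuming a GCC/Clang toolchain on x86/x86_64 that provides <cpuid.h> with __get_cpuid_count; this is not the runtime's own helper:

    // hybrid_check.cpp -- illustrative only.
    #include <cpuid.h>
    #include <cstdio>

    int main() {
      unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
      // Leaf 0 reports the highest supported standard leaf.
      if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx) || eax < 7) {
        std::puts("CPUID leaf 07H not available");
        return 0;
      }
      __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
      int rtm = (ebx >> 11) & 1;    // RTM: CPUID.07H:EBX[11]
      int hybrid = (edx >> 15) & 1; // Hybrid: CPUID.07H:EDX[15]
      std::printf("rtm=%d hybrid=%d\n", rtm, hybrid);
      return 0;
    }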
diff --git a/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp b/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp
index cabb5722f4..d41ddf231e 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_wait_release.cpp
@@ -33,6 +33,10 @@ template <bool C, bool S>
void __kmp_mwait_64(int th_gtid, kmp_flag_64<C, S> *flag) {
  __kmp_mwait_template(th_gtid, flag);
}
+template <bool C, bool S>
+void __kmp_atomic_mwait_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag) {
+  __kmp_mwait_template(th_gtid, flag);
+}
void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) {
  __kmp_mwait_template(th_gtid, flag);
}
@@ -40,4 +44,8 @@ void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) {
template void __kmp_mwait_32<false, false>(int, kmp_flag_32<false, false> *);
template void __kmp_mwait_64<false, true>(int, kmp_flag_64<false, true> *);
template void __kmp_mwait_64<true, false>(int, kmp_flag_64<true, false> *);
+template void
+__kmp_atomic_mwait_64<false, true>(int, kmp_atomic_flag_64<false, true> *);
+template void
+__kmp_atomic_mwait_64<true, false>(int, kmp_atomic_flag_64<true, false> *);
#endif
diff --git a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h
index d528ce9f18..b32cb15de1 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h
@@ -33,96 +33,288 @@ higher level operations such as barriers and fork/join.
@{
*/
-/*!
- * The flag_type describes the storage used for the flag.
- */
-enum flag_type {
-  flag32, /**< 32 bit flags */
-  flag64, /**< 64 bit flags */
-  flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */
-};
-
struct flag_properties {
  unsigned int type : 16;
  unsigned int reserved : 16;
};
-/*!
- * Base class for wait/release volatile flag - */ -template <typename P> class kmp_flag_native { - volatile P *loc; - flag_properties t; +template <enum flag_type FlagType> struct flag_traits {}; + +template <> struct flag_traits<flag32> { + typedef kmp_uint32 flag_t; + static const flag_type t = flag32; + static inline flag_t tcr(flag_t f) { return TCR_4(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR32(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND32(f, v); + } +}; + +template <> struct flag_traits<atomic_flag64> { + typedef kmp_uint64 flag_t; + static const flag_type t = atomic_flag64; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR64(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND64(f, v); + } +}; + +template <> struct flag_traits<flag64> { + typedef kmp_uint64 flag_t; + static const flag_type t = flag64; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR64(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND64(f, v); + } +}; + +template <> struct flag_traits<flag_oncore> { + typedef kmp_uint64 flag_t; + static const flag_type t = flag_oncore; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR64(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND64(f, v); + } +}; + +/*! Base class for all flags */ +template <flag_type FlagType> class kmp_flag { +protected: + flag_properties t; /**< "Type" of the flag in loc */ + kmp_info_t *waiting_threads[1]; /**< Threads sleeping on this thread. */ + kmp_uint32 num_waiting_threads; /**< #threads sleeping on this thread. */ + std::atomic<bool> *sleepLoc; public: - typedef P flag_t; - kmp_flag_native(volatile P *p, flag_type ft) - : loc(p), t({(short unsigned int)ft, 0U}) {} - volatile P *get() { return loc; } - void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); } - void set(volatile P *new_loc) { loc = new_loc; } + typedef flag_traits<FlagType> traits_type; + kmp_flag() : t({FlagType, 0U}), num_waiting_threads(0), sleepLoc(nullptr) {} + kmp_flag(int nwaiters) + : t({FlagType, 0U}), num_waiting_threads(nwaiters), sleepLoc(nullptr) {} + kmp_flag(std::atomic<bool> *sloc) + : t({FlagType, 0U}), num_waiting_threads(0), sleepLoc(sloc) {} + /*! @result the flag_type */ flag_type get_type() { return (flag_type)(t.type); } - P load() { return *loc; } - void store(P val) { *loc = val; } + + /*! 
param i in index into waiting_threads + * @result the thread that is waiting at index i */ + kmp_info_t *get_waiter(kmp_uint32 i) { + KMP_DEBUG_ASSERT(i < num_waiting_threads); + return waiting_threads[i]; + } + /*! @result num_waiting_threads */ + kmp_uint32 get_num_waiters() { return num_waiting_threads; } + /*! @param thr in the thread which is now waiting + * Insert a waiting thread at index 0. */ + void set_waiter(kmp_info_t *thr) { + waiting_threads[0] = thr; + num_waiting_threads = 1; + } + enum barrier_type get_bt() { return bs_last_barrier; } }; -/*! - * Base class for wait/release atomic flag - */ -template <typename P> class kmp_flag { - std::atomic<P> - *loc; /**< Pointer to the flag storage that is modified by another thread - */ - flag_properties t; /**< "Type" of the flag in loc */ +/*! Base class for wait/release volatile flag */ +template <typename PtrType, flag_type FlagType, bool Sleepable> +class kmp_flag_native : public kmp_flag<FlagType> { +protected: + volatile PtrType *loc; + PtrType checker; /**< When flag==checker, it has been released. */ + typedef flag_traits<FlagType> traits_type; + public: - typedef P flag_t; - kmp_flag(std::atomic<P> *p, flag_type ft) - : loc(p), t({(short unsigned int)ft, 0U}) {} - /*! - * @result the pointer to the actual flag - */ - std::atomic<P> *get() { return loc; } - /*! - * @result void* pointer to the actual flag - */ + typedef PtrType flag_t; + kmp_flag_native(volatile PtrType *p) : kmp_flag<FlagType>(), loc(p) {} + kmp_flag_native(volatile PtrType *p, kmp_info_t *thr) + : kmp_flag<FlagType>(1), loc(p) { + this->waiting_threads[0] = thr; + } + kmp_flag_native(volatile PtrType *p, PtrType c) + : kmp_flag<FlagType>(), loc(p), checker(c) {} + kmp_flag_native(volatile PtrType *p, PtrType c, std::atomic<bool> *sloc) + : kmp_flag<FlagType>(sloc), loc(p), checker(c) {} + virtual ~kmp_flag_native() {} + void *operator new(size_t size) { return __kmp_allocate(size); } + void operator delete(void *p) { __kmp_free(p); } + volatile PtrType *get() { return loc; } + void *get_void_p() { return RCAST(void *, CCAST(PtrType *, loc)); } + void set(volatile PtrType *new_loc) { loc = new_loc; } + PtrType load() { return *loc; } + void store(PtrType val) { *loc = val; } + /*! @result true if the flag object has been released. */ + virtual bool done_check() { + if (Sleepable && !(this->sleepLoc)) + return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) == + checker; + else + return traits_type::tcr(*(this->get())) == checker; + } + /*! @param old_loc in old value of flag + * @result true if the flag's old value indicates it was released. */ + virtual bool done_check_val(PtrType old_loc) { return old_loc == checker; } + /*! @result true if the flag object is not yet released. + * Used in __kmp_wait_template like: + * @code + * while (flag.notdone_check()) { pause(); } + * @endcode */ + virtual bool notdone_check() { + return traits_type::tcr(*(this->get())) != checker; + } + /*! @result Actual flag value before release was applied. + * Trigger all waiting threads to run by modifying flag to release state. */ + void internal_release() { + (void)traits_type::test_then_add4((volatile PtrType *)this->get()); + } + /*! @result Actual flag value before sleep bit(s) set. + * Notes that there is at least one thread sleeping on the flag by setting + * sleep bit(s). 
*/ + PtrType set_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(true); + return *(this->get()); + } + return traits_type::test_then_or((volatile PtrType *)this->get(), + KMP_BARRIER_SLEEP_STATE); + } + /*! @result Actual flag value before sleep bit(s) cleared. + * Notes that there are no longer threads sleeping on the flag by clearing + * sleep bit(s). */ + void unset_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(false); + return; + } + traits_type::test_then_and((volatile PtrType *)this->get(), + ~KMP_BARRIER_SLEEP_STATE); + } + /*! @param old_loc in old value of flag + * Test if there are threads sleeping on the flag's old value in old_loc. */ + bool is_sleeping_val(PtrType old_loc) { + if (this->sleepLoc) + return this->sleepLoc->load(); + return old_loc & KMP_BARRIER_SLEEP_STATE; + } + /*! Test whether there are threads sleeping on the flag. */ + bool is_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(*(this->get())); + } + bool is_any_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(*(this->get())); + } + kmp_uint8 *get_stolen() { return NULL; } +}; + +/*! Base class for wait/release atomic flag */ +template <typename PtrType, flag_type FlagType, bool Sleepable> +class kmp_flag_atomic : public kmp_flag<FlagType> { +protected: + std::atomic<PtrType> *loc; /**< Pointer to flag location to wait on */ + PtrType checker; /**< Flag == checker means it has been released. */ +public: + typedef flag_traits<FlagType> traits_type; + typedef PtrType flag_t; + kmp_flag_atomic(std::atomic<PtrType> *p) : kmp_flag<FlagType>(), loc(p) {} + kmp_flag_atomic(std::atomic<PtrType> *p, kmp_info_t *thr) + : kmp_flag<FlagType>(1), loc(p) { + this->waiting_threads[0] = thr; + } + kmp_flag_atomic(std::atomic<PtrType> *p, PtrType c) + : kmp_flag<FlagType>(), loc(p), checker(c) {} + kmp_flag_atomic(std::atomic<PtrType> *p, PtrType c, std::atomic<bool> *sloc) + : kmp_flag<FlagType>(sloc), loc(p), checker(c) {} + /*! @result the pointer to the actual flag */ + std::atomic<PtrType> *get() { return loc; } + /*! @result void* pointer to the actual flag */ void *get_void_p() { return RCAST(void *, loc); } - /*! - * @param new_loc in set loc to point at new_loc - */ - void set(std::atomic<P> *new_loc) { loc = new_loc; } - /*! - * @result the flag_type - */ - flag_type get_type() { return (flag_type)(t.type); } - /*! - * @result flag value - */ - P load() { return loc->load(std::memory_order_acquire); } - /*! - * @param val the new flag value to be stored - */ - void store(P val) { loc->store(val, std::memory_order_release); } - // Derived classes must provide the following: - /* - kmp_info_t * get_waiter(kmp_uint32 i); - kmp_uint32 get_num_waiters(); - bool done_check(); - bool done_check_val(P old_loc); - bool notdone_check(); - P internal_release(); - void suspend(int th_gtid); - void mwait(int th_gtid); - void resume(int th_gtid); - P set_sleeping(); - P unset_sleeping(); - bool is_sleeping(); - bool is_any_sleeping(); - bool is_sleeping_val(P old_loc); - int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, - int *thread_finished - USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 - is_constrained); - */ + /*! @param new_loc in set loc to point at new_loc */ + void set(std::atomic<PtrType> *new_loc) { loc = new_loc; } + /*! @result flag value */ + PtrType load() { return loc->load(std::memory_order_acquire); } + /*! 
@param val the new flag value to be stored */ + void store(PtrType val) { loc->store(val, std::memory_order_release); } + /*! @result true if the flag object has been released. */ + bool done_check() { + if (Sleepable && !(this->sleepLoc)) + return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker; + else + return this->load() == checker; + } + /*! @param old_loc in old value of flag + * @result true if the flag's old value indicates it was released. */ + bool done_check_val(PtrType old_loc) { return old_loc == checker; } + /*! @result true if the flag object is not yet released. + * Used in __kmp_wait_template like: + * @code + * while (flag.notdone_check()) { pause(); } + * @endcode */ + bool notdone_check() { return this->load() != checker; } + /*! @result Actual flag value before release was applied. + * Trigger all waiting threads to run by modifying flag to release state. */ + void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); } + /*! @result Actual flag value before sleep bit(s) set. + * Notes that there is at least one thread sleeping on the flag by setting + * sleep bit(s). */ + PtrType set_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(true); + return *(this->get()); + } + return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE); + } + /*! @result Actual flag value before sleep bit(s) cleared. + * Notes that there are no longer threads sleeping on the flag by clearing + * sleep bit(s). */ + void unset_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(false); + return; + } + KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE); + } + /*! @param old_loc in old value of flag + * Test whether there are threads sleeping on flag's old value in old_loc. */ + bool is_sleeping_val(PtrType old_loc) { + if (this->sleepLoc) + return this->sleepLoc->load(); + return old_loc & KMP_BARRIER_SLEEP_STATE; + } + /*! Test whether there are threads sleeping on the flag. */ + bool is_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(this->load()); + } + bool is_any_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(this->load()); + } + kmp_uint8 *get_stolen() { return NULL; } }; #if OMPT_SUPPORT @@ -185,6 +377,7 @@ __kmp_wait_template(kmp_info_t *this_thr, #else kmp_uint32 hibernate; #endif + kmp_uint64 time; KMP_FSYNC_SPIN_INIT(spin, NULL); if (flag->done_check()) { @@ -264,8 +457,9 @@ final_spin=FALSE) ompt_entry_state = this_thr->th.ompt_thread_info.state; if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit || KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) { - ompt_lw_taskteam_t *team = - this_thr->th.th_team->t.ompt_serialized_team_info; + ompt_lw_taskteam_t *team = NULL; + if (this_thr->th.th_team) + team = this_thr->th.th_team->t.ompt_serialized_team_info; if (team) { tId = &(team->ompt_task_info.task_data); } else { @@ -283,6 +477,7 @@ final_spin=FALSE) #endif KMP_INIT_YIELD(spins); // Setup for waiting + KMP_INIT_BACKOFF(time); if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || __kmp_pause_status == kmp_soft_paused) { @@ -340,11 +535,11 @@ final_spin=FALSE) disabled (KMP_TASKING=0). 
*/ if (task_team != NULL) { if (TCR_SYNC_4(task_team->tt.tt_active)) { - if (KMP_TASKING_ENABLED(task_team)) + if (KMP_TASKING_ENABLED(task_team)) { flag->execute_tasks( this_thr, th_gtid, final_spin, &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); - else + } else this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; } else { KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)); @@ -370,7 +565,7 @@ final_spin=FALSE) // If we are oversubscribed, or have waited a bit (and // KMP_LIBRARY=throughput), then yield - KMP_YIELD_OVERSUB_ELSE_SPIN(spins); + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); #if KMP_STATS_ENABLED // Check if thread has been signalled to idle state @@ -557,6 +752,7 @@ static inline void __kmp_mwait_template(int th_gtid, C *flag) { else { // if flag changes here, wake-up happens immediately TCW_PTR(th->th.th_sleep_loc, (void *)flag); + th->th.th_sleep_loc_type = flag->get_type(); __kmp_unlock_suspend_mx(th); KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid)); #if KMP_HAVE_UMWAIT @@ -574,6 +770,7 @@ static inline void __kmp_mwait_template(int th_gtid, C *flag) { if (flag->is_sleeping()) flag->unset_sleeping(); TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; } // Mark thread as active again th->th.th_active = TRUE; @@ -624,251 +821,15 @@ template <class C> static inline void __kmp_release_template(C *flag) { } } -template <typename FlagType> struct flag_traits {}; - -template <> struct flag_traits<kmp_uint32> { - typedef kmp_uint32 flag_t; - static const flag_type t = flag32; - static inline flag_t tcr(flag_t f) { return TCR_4(f); } - static inline flag_t test_then_add4(volatile flag_t *f) { - return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f)); - } - static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_OR32(f, v); - } - static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_AND32(f, v); - } -}; - -template <> struct flag_traits<kmp_uint64> { - typedef kmp_uint64 flag_t; - static const flag_type t = flag64; - static inline flag_t tcr(flag_t f) { return TCR_8(f); } - static inline flag_t test_then_add4(volatile flag_t *f) { - return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); - } - static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_OR64(f, v); - } - static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_AND64(f, v); - } -}; - -// Basic flag that does not use C11 Atomics -template <typename FlagType, bool Sleepable> -class kmp_basic_flag_native : public kmp_flag_native<FlagType> { - typedef flag_traits<FlagType> traits_type; - FlagType checker; /**< Value to compare flag to to check if flag has been - released. */ - kmp_info_t - *waiting_threads[1]; /**< Array of threads sleeping on this thread. */ - kmp_uint32 - num_waiting_threads; /**< Number of threads sleeping on this thread. */ -public: - kmp_basic_flag_native(volatile FlagType *p) - : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(0) {} - kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr) - : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(1) { - waiting_threads[0] = thr; - } - kmp_basic_flag_native(volatile FlagType *p, FlagType c) - : kmp_flag_native<FlagType>(p, traits_type::t), checker(c), - num_waiting_threads(0) {} - /*! 
- * param i in index into waiting_threads - * @result the thread that is waiting at index i - */ - kmp_info_t *get_waiter(kmp_uint32 i) { - KMP_DEBUG_ASSERT(i < num_waiting_threads); - return waiting_threads[i]; - } - /*! - * @result num_waiting_threads - */ - kmp_uint32 get_num_waiters() { return num_waiting_threads; } - /*! - * @param thr in the thread which is now waiting - * - * Insert a waiting thread at index 0. - */ - void set_waiter(kmp_info_t *thr) { - waiting_threads[0] = thr; - num_waiting_threads = 1; - } - /*! - * @result true if the flag object has been released. - */ - bool done_check() { - if (Sleepable) - return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) == - checker; - else - return traits_type::tcr(*(this->get())) == checker; - } - /*! - * @param old_loc in old value of flag - * @result true if the flag's old value indicates it was released. - */ - bool done_check_val(FlagType old_loc) { return old_loc == checker; } - /*! - * @result true if the flag object is not yet released. - * Used in __kmp_wait_template like: - * @code - * while (flag.notdone_check()) { pause(); } - * @endcode - */ - bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; } - /*! - * @result Actual flag value before release was applied. - * Trigger all waiting threads to run by modifying flag to release state. - */ - void internal_release() { - (void)traits_type::test_then_add4((volatile FlagType *)this->get()); - } - /*! - * @result Actual flag value before sleep bit(s) set. - * Notes that there is at least one thread sleeping on the flag by setting - * sleep bit(s). - */ - FlagType set_sleeping() { - return traits_type::test_then_or((volatile FlagType *)this->get(), - KMP_BARRIER_SLEEP_STATE); - } - /*! - * @result Actual flag value before sleep bit(s) cleared. - * Notes that there are no longer threads sleeping on the flag by clearing - * sleep bit(s). - */ - FlagType unset_sleeping() { - return traits_type::test_then_and((volatile FlagType *)this->get(), - ~KMP_BARRIER_SLEEP_STATE); - } - /*! - * @param old_loc in old value of flag - * Test whether there are threads sleeping on the flag's old value in old_loc. - */ - bool is_sleeping_val(FlagType old_loc) { - return old_loc & KMP_BARRIER_SLEEP_STATE; - } - /*! - * Test whether there are threads sleeping on the flag. - */ - bool is_sleeping() { return is_sleeping_val(*(this->get())); } - bool is_any_sleeping() { return is_sleeping_val(*(this->get())); } - kmp_uint8 *get_stolen() { return NULL; } - enum barrier_type get_bt() { return bs_last_barrier; } -}; - -template <typename FlagType, bool Sleepable> -class kmp_basic_flag : public kmp_flag<FlagType> { - typedef flag_traits<FlagType> traits_type; - FlagType checker; /**< Value to compare flag to to check if flag has been - released. */ - kmp_info_t - *waiting_threads[1]; /**< Array of threads sleeping on this thread. */ - kmp_uint32 - num_waiting_threads; /**< Number of threads sleeping on this thread. */ -public: - kmp_basic_flag(std::atomic<FlagType> *p) - : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {} - kmp_basic_flag(std::atomic<FlagType> *p, kmp_info_t *thr) - : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) { - waiting_threads[0] = thr; - } - kmp_basic_flag(std::atomic<FlagType> *p, FlagType c) - : kmp_flag<FlagType>(p, traits_type::t), checker(c), - num_waiting_threads(0) {} - /*! 
- * param i in index into waiting_threads - * @result the thread that is waiting at index i - */ - kmp_info_t *get_waiter(kmp_uint32 i) { - KMP_DEBUG_ASSERT(i < num_waiting_threads); - return waiting_threads[i]; - } - /*! - * @result num_waiting_threads - */ - kmp_uint32 get_num_waiters() { return num_waiting_threads; } - /*! - * @param thr in the thread which is now waiting - * - * Insert a waiting thread at index 0. - */ - void set_waiter(kmp_info_t *thr) { - waiting_threads[0] = thr; - num_waiting_threads = 1; - } - /*! - * @result true if the flag object has been released. - */ - bool done_check() { - if (Sleepable) - return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker; - else - return this->load() == checker; - } - /*! - * @param old_loc in old value of flag - * @result true if the flag's old value indicates it was released. - */ - bool done_check_val(FlagType old_loc) { return old_loc == checker; } - /*! - * @result true if the flag object is not yet released. - * Used in __kmp_wait_template like: - * @code - * while (flag.notdone_check()) { pause(); } - * @endcode - */ - bool notdone_check() { return this->load() != checker; } - /*! - * @result Actual flag value before release was applied. - * Trigger all waiting threads to run by modifying flag to release state. - */ - void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); } - /*! - * @result Actual flag value before sleep bit(s) set. - * Notes that there is at least one thread sleeping on the flag by setting - * sleep bit(s). - */ - FlagType set_sleeping() { - return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE); - } - /*! - * @result Actual flag value before sleep bit(s) cleared. - * Notes that there are no longer threads sleeping on the flag by clearing - * sleep bit(s). - */ - FlagType unset_sleeping() { - return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE); - } - /*! - * @param old_loc in old value of flag - * Test whether there are threads sleeping on the flag's old value in old_loc. - */ - bool is_sleeping_val(FlagType old_loc) { - return old_loc & KMP_BARRIER_SLEEP_STATE; - } - /*! - * Test whether there are threads sleeping on the flag. 
- */ - bool is_sleeping() { return is_sleeping_val(this->load()); } - bool is_any_sleeping() { return is_sleeping_val(this->load()); } - kmp_uint8 *get_stolen() { return NULL; } - enum barrier_type get_bt() { return bs_last_barrier; } -}; - template <bool Cancellable, bool Sleepable> -class kmp_flag_32 : public kmp_basic_flag<kmp_uint32, Sleepable> { +class kmp_flag_32 : public kmp_flag_atomic<kmp_uint32, flag32, Sleepable> { public: kmp_flag_32(std::atomic<kmp_uint32> *p) - : kmp_basic_flag<kmp_uint32, Sleepable>(p) {} + : kmp_flag_atomic<kmp_uint32, flag32, Sleepable>(p) {} kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_info_t *thr) - : kmp_basic_flag<kmp_uint32, Sleepable>(p, thr) {} + : kmp_flag_atomic<kmp_uint32, flag32, Sleepable>(p, thr) {} kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_uint32 c) - : kmp_basic_flag<kmp_uint32, Sleepable>(p, c) {} + : kmp_flag_atomic<kmp_uint32, flag32, Sleepable>(p, c) {} void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); } #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); } @@ -895,14 +856,16 @@ public: }; template <bool Cancellable, bool Sleepable> -class kmp_flag_64 : public kmp_basic_flag_native<kmp_uint64, Sleepable> { +class kmp_flag_64 : public kmp_flag_native<kmp_uint64, flag64, Sleepable> { public: kmp_flag_64(volatile kmp_uint64 *p) - : kmp_basic_flag_native<kmp_uint64, Sleepable>(p) {} + : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr) - : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, thr) {} + : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p, thr) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c) - : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, c) {} + : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p, c) {} + kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c, std::atomic<bool> *loc) + : kmp_flag_native<kmp_uint64, flag64, Sleepable>(p, c, loc) {} void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); } #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); } @@ -928,20 +891,52 @@ public: flag_type get_ptr_type() { return flag64; } }; +template <bool Cancellable, bool Sleepable> +class kmp_atomic_flag_64 + : public kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable> { +public: + kmp_atomic_flag_64(std::atomic<kmp_uint64> *p) + : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p) {} + kmp_atomic_flag_64(std::atomic<kmp_uint64> *p, kmp_info_t *thr) + : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p, thr) {} + kmp_atomic_flag_64(std::atomic<kmp_uint64> *p, kmp_uint64 c) + : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p, c) {} + kmp_atomic_flag_64(std::atomic<kmp_uint64> *p, kmp_uint64 c, + std::atomic<bool> *loc) + : kmp_flag_atomic<kmp_uint64, atomic_flag64, Sleepable>(p, c, loc) {} + void suspend(int th_gtid) { __kmp_atomic_suspend_64(th_gtid, this); } + void mwait(int th_gtid) { __kmp_atomic_mwait_64(th_gtid, this); } + void resume(int th_gtid) { __kmp_atomic_resume_64(th_gtid, this); } + int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_atomic_execute_tasks_64( + this_thr, gtid, this, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); + } + bool wait(kmp_info_t *this_thr, + int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + if (final_spin) + return __kmp_wait_template<kmp_atomic_flag_64, 
TRUE, Cancellable, + Sleepable>( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + else + return __kmp_wait_template<kmp_atomic_flag_64, FALSE, Cancellable, + Sleepable>( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + } + void release() { __kmp_release_template(this); } + flag_type get_ptr_type() { return atomic_flag64; } +}; + // Hierarchical 64-bit on-core barrier instantiation -class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> { - kmp_uint64 checker; - kmp_info_t *waiting_threads[1]; - kmp_uint32 num_waiting_threads; - kmp_uint32 - offset; /**< Portion of flag that is of interest for an operation. */ +class kmp_flag_oncore : public kmp_flag_native<kmp_uint64, flag_oncore, false> { + kmp_uint32 offset; /**< Portion of flag of interest for an operation. */ bool flag_switch; /**< Indicates a switch in flag location. */ enum barrier_type bt; /**< Barrier type. */ - kmp_info_t *this_thr; /**< Thread that may be redirected to different flag - location. */ + kmp_info_t *this_thr; /**< Thread to redirect to different flag location. */ #if USE_ITT_BUILD - void * - itt_sync_obj; /**< ITT object that must be passed to new flag location. */ + void *itt_sync_obj; /**< ITT object to pass to new flag location. */ #endif unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) { return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[offset]; @@ -949,31 +944,26 @@ class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> { public: kmp_flag_oncore(volatile kmp_uint64 *p) - : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0), - flag_switch(false) {} + : kmp_flag_native<kmp_uint64, flag_oncore, false>(p), flag_switch(false) { + } kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx) - : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0), - offset(idx), flag_switch(false) {} + : kmp_flag_native<kmp_uint64, flag_oncore, false>(p), offset(idx), + flag_switch(false), + bt(bs_last_barrier) USE_ITT_BUILD_ARG(itt_sync_obj(nullptr)) {} kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx, enum barrier_type bar_t, kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt)) - : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(c), - num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t), + : kmp_flag_native<kmp_uint64, flag_oncore, false>(p, c), offset(idx), + flag_switch(false), bt(bar_t), this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {} - kmp_info_t *get_waiter(kmp_uint32 i) { - KMP_DEBUG_ASSERT(i < num_waiting_threads); - return waiting_threads[i]; - } - kmp_uint32 get_num_waiters() { return num_waiting_threads; } - void set_waiter(kmp_info_t *thr) { - waiting_threads[0] = thr; - num_waiting_threads = 1; - } - bool done_check_val(kmp_uint64 old_loc) { + virtual ~kmp_flag_oncore() override {} + void *operator new(size_t size) { return __kmp_allocate(size); } + void operator delete(void *p) { __kmp_free(p); } + bool done_check_val(kmp_uint64 old_loc) override { return byteref(&old_loc, offset) == checker; } - bool done_check() { return done_check_val(*get()); } - bool notdone_check() { + bool done_check() override { return done_check_val(*get()); } + bool notdone_check() override { // Calculate flag_switch if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG) flag_switch = true; @@ -997,17 +987,6 @@ public: KMP_TEST_THEN_OR64(get(), mask); } } - kmp_uint64 set_sleeping() { - return KMP_TEST_THEN_OR64(get(), KMP_BARRIER_SLEEP_STATE); - } - kmp_uint64 unset_sleeping() { - return KMP_TEST_THEN_AND64(get(), 
~KMP_BARRIER_SLEEP_STATE); - } - bool is_sleeping_val(kmp_uint64 old_loc) { - return old_loc & KMP_BARRIER_SLEEP_STATE; - } - bool is_sleeping() { return is_sleeping_val(*get()); } - bool is_any_sleeping() { return is_sleeping_val(*get()); } void wait(kmp_info_t *this_thr, int final_spin) { if (final_spin) __kmp_wait_template<kmp_flag_oncore, TRUE>( @@ -1038,27 +1017,39 @@ public: thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); #endif } - kmp_uint8 *get_stolen() { return NULL; } enum barrier_type get_bt() { return bt; } flag_type get_ptr_type() { return flag_oncore; } }; -// Used to wake up threads, volatile void* flag is usually the th_sleep_loc -// associated with int gtid. -static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) { +static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) { + int gtid = __kmp_gtid_from_thread(thr); + void *flag = CCAST(void *, thr->th.th_sleep_loc); + flag_type type = thr->th.th_sleep_loc_type; if (!flag) return; - - switch (RCAST(kmp_flag_64<> *, CCAST(void *, flag))->get_type()) { + // Attempt to wake up a thread: examine its type and call appropriate template + switch (type) { case flag32: - __kmp_resume_32(gtid, (kmp_flag_32<> *)NULL); + __kmp_resume_32(gtid, RCAST(kmp_flag_32<> *, flag)); break; case flag64: - __kmp_resume_64(gtid, (kmp_flag_64<> *)NULL); + __kmp_resume_64(gtid, RCAST(kmp_flag_64<> *, flag)); + break; + case atomic_flag64: + __kmp_atomic_resume_64(gtid, RCAST(kmp_atomic_flag_64<> *, flag)); break; case flag_oncore: - __kmp_resume_oncore(gtid, (kmp_flag_oncore *)NULL); + __kmp_resume_oncore(gtid, RCAST(kmp_flag_oncore *, flag)); + break; +#ifdef KMP_DEBUG + case flag_unset: + KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d is unset\n", type)); break; + default: + KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d does not match any " + "known flag type\n", + type)); +#endif } } diff --git a/contrib/libs/cxxsupp/openmp/omp.h b/contrib/libs/cxxsupp/openmp/omp.h index cb2fe49599..2ddf4f630b 100644 --- a/contrib/libs/cxxsupp/openmp/omp.h +++ b/contrib/libs/cxxsupp/openmp/omp.h @@ -437,14 +437,23 @@ extern omp_allocator_handle_t __KAI_KMPC_CONVENTION omp_get_default_allocator(void); # ifdef __cplusplus extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a = omp_null_allocator); - extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, + omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, + omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_aligned_calloc(size_t align, size_t nmemb, size_t size, + omp_allocator_handle_t a = omp_null_allocator); extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator = omp_null_allocator, omp_allocator_handle_t free_allocator = omp_null_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator); # else extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, + omp_allocator_handle_t a); extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_aligned_calloc(size_t align, size_t 
nmemb, size_t size, + omp_allocator_handle_t a); extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, omp_allocator_handle_t free_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a); diff --git a/contrib/libs/cxxsupp/openmp/ompt-general.cpp b/contrib/libs/cxxsupp/openmp/ompt-general.cpp index 3d8ef041f7..c1468c0c32 100644 --- a/contrib/libs/cxxsupp/openmp/ompt-general.cpp +++ b/contrib/libs/cxxsupp/openmp/ompt-general.cpp @@ -295,9 +295,16 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) { OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success. \n"); OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... ", fname); + dlerror(); // Clear any existing error start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool"); if (!start_tool) { - OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", dlerror()); + char *error = dlerror(); + if (error != NULL) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", error); + } else { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", + "ompt_start_tool = NULL"); + } } else #elif KMP_OS_WINDOWS OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname); diff --git a/contrib/libs/cxxsupp/openmp/ompt-specific.cpp b/contrib/libs/cxxsupp/openmp/ompt-specific.cpp index 1ad0e17ed4..c28b9bd1a6 100644 --- a/contrib/libs/cxxsupp/openmp/ompt-specific.cpp +++ b/contrib/libs/cxxsupp/openmp/ompt-specific.cpp @@ -283,10 +283,6 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, link_lwt->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr); *OMPT_CUR_TEAM_INFO(thr) = tmp_team; - ompt_task_info_t tmp_task = lwt->ompt_task_info; - link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); - *OMPT_CUR_TASK_INFO(thr) = tmp_task; - // link the taskteam into the list of taskteams: ompt_lw_taskteam_t *my_parent = thr->th.th_team->t.ompt_serialized_team_info; @@ -297,6 +293,10 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, ompd_bp_parallel_begin(); } #endif + + ompt_task_info_t tmp_task = lwt->ompt_task_info; + link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); + *OMPT_CUR_TASK_INFO(thr) = tmp_task; } else { // this is the first serialized team, so we just store the values in the // team and drop the taskteam-object @@ -313,6 +313,9 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, void __ompt_lw_taskteam_unlink(kmp_info_t *thr) { ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info; if (lwtask) { + ompt_task_info_t tmp_task = lwtask->ompt_task_info; + lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); + *OMPT_CUR_TASK_INFO(thr) = tmp_task; #if OMPD_SUPPORT if (ompd_state & OMPD_ENABLE_BP) { ompd_bp_parallel_end(); @@ -324,10 +327,6 @@ void __ompt_lw_taskteam_unlink(kmp_info_t *thr) { lwtask->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr); *OMPT_CUR_TEAM_INFO(thr) = tmp_team; - ompt_task_info_t tmp_task = lwtask->ompt_task_info; - lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); - *OMPT_CUR_TASK_INFO(thr) = tmp_task; - if (lwtask->heap) { __kmp_free(lwtask); lwtask = NULL; @@ -365,13 +364,9 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type, if (team == NULL) return 0; ompt_lw_taskteam_t *lwt = NULL, - *next_lwt = LWT_FROM_TEAM(taskdata->td_team), - *prev_lwt = NULL; + *next_lwt = LWT_FROM_TEAM(taskdata->td_team); while (ancestor_level > 0) { - // needed for thread_num - prev_team = team; - prev_lwt = lwt; // next lightweight team (if any) if (lwt) lwt = lwt->parent; @@ -390,6 +385,7 
@@ int __ompt_get_task_info_internal(int ancestor_level, int *type, taskdata = taskdata->td_parent; if (team == NULL) return 0; + prev_team = team; team = team->t.t_parent; if (taskdata) { next_lwt = LWT_FROM_TEAM(taskdata->td_team); @@ -431,9 +427,18 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type, if (thread_num) { if (level == 0) *thread_num = __kmp_get_tid(); - else if (prev_lwt) + else if (lwt) *thread_num = 0; - else + else if (!prev_team) { + // The innermost parallel region contains at least one explicit task. + // The task at level > 0 is either an implicit task that + // corresponds to the mentioned region or one of the explicit tasks + // nested inside the same region. Note that the task isn't the + // innermost explicit tasks (because of condition level > 0). + // Since the task at this level still belongs to the innermost parallel + // region, thread_num is determined the same way as for level==0. + *thread_num = __kmp_get_tid(); + } else *thread_num = prev_team->t.t_master_tid; // *thread_num = team->t.t_master_tid; } diff --git a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp index 42ad1d56f9..5cd6ad6a03 100644 --- a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp +++ b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp @@ -1051,6 +1051,8 @@ void __kmp_reap_worker(kmp_info_t *th) { "exit_val = %p\n", th->th.th_info.ds.ds_gtid, exit_val)); } +#else + (void)status; // unused variable #endif /* KMP_DEBUG */ KA_TRACE(10, ("__kmp_reap_worker: done reaping T#%d\n", @@ -1232,7 +1234,7 @@ static void __kmp_atfork_child(void) { // affinity in the parent kmp_set_thread_affinity_mask_initial(); #endif - // Set default not to bind threads tightly in the child (we’re expecting + // Set default not to bind threads tightly in the child (we're expecting // over-subscription after the fork and this can improve things for // scripting languages that use OpenMP inside process-parallel code). __kmp_affinity_type = affinity_none; @@ -1407,9 +1409,13 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { /* TODO: shouldn't this use release semantics to ensure that __kmp_suspend_initialize_thread gets called first? 
*/ old_spin = flag->set_sleeping(); + TCW_PTR(th->th.th_sleep_loc, (void *)flag); + th->th.th_sleep_loc_type = flag->get_type(); if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) { flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; __kmp_unlock_suspend_mx(th); return; } @@ -1417,8 +1423,10 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { " was %x\n", th_gtid, flag->get(), flag->load(), old_spin)); - if (flag->done_check_val(old_spin)) { - old_spin = flag->unset_sleeping(); + if (flag->done_check_val(old_spin) || flag->done_check()) { + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; KF_TRACE(5, ("__kmp_suspend_template: T#%d false alarm, reset sleep bit " "for spin(%p)\n", th_gtid, flag->get())); @@ -1427,7 +1435,6 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { "with low probability" return when the condition variable has not been signaled or broadcast */ int deactivated = FALSE; - TCW_PTR(th->th.th_sleep_loc, (void *)flag); while (flag->is_sleeping()) { #ifdef DEBUG_SUSPEND @@ -1449,6 +1456,9 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { deactivated = TRUE; } + KMP_DEBUG_ASSERT(th->th.th_sleep_loc); + KMP_DEBUG_ASSERT(flag->get_type() == th->th.th_sleep_loc_type); + #if USE_SUSPEND_TIMEOUT struct timespec now; struct timeval tval; @@ -1478,6 +1488,18 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { if ((status != 0) && (status != EINTR) && (status != ETIMEDOUT)) { KMP_SYSFAIL("pthread_cond_wait", status); } + + KMP_DEBUG_ASSERT(flag->get_type() == flag->get_ptr_type()); + + if (!flag->is_sleeping() && + ((status == EINTR) || (status == ETIMEDOUT))) { + // if interrupt or timeout, and thread is no longer sleeping, we need to + // make sure sleep_loc gets reset; however, this shouldn't be needed if + // we woke up with resume + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + } #ifdef KMP_DEBUG if (status == ETIMEDOUT) { if (flag->is_sleeping()) { @@ -1487,6 +1509,8 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { KF_TRACE(2, ("__kmp_suspend_template: T#%d timeout wakeup, sleep bit " "not set!\n", th_gtid)); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; } } else if (flag->is_sleeping()) { KF_TRACE(100, @@ -1504,6 +1528,13 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { } } } + // We may have had the loop variable set before entering the loop body; + // so we need to reset sleep_loc. 
+ TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + + KMP_DEBUG_ASSERT(!flag->is_sleeping()); + KMP_DEBUG_ASSERT(!th->th.th_sleep_loc); #ifdef DEBUG_SUSPEND { char buffer[128]; @@ -1525,6 +1556,10 @@ template <bool C, bool S> void __kmp_suspend_64(int th_gtid, kmp_flag_64<C, S> *flag) { __kmp_suspend_template(th_gtid, flag); } +template <bool C, bool S> +void __kmp_atomic_suspend_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag) { + __kmp_suspend_template(th_gtid, flag); +} void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_suspend_template(th_gtid, flag); } @@ -1532,6 +1567,10 @@ void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { template void __kmp_suspend_32<false, false>(int, kmp_flag_32<false, false> *); template void __kmp_suspend_64<false, true>(int, kmp_flag_64<false, true> *); template void __kmp_suspend_64<true, false>(int, kmp_flag_64<true, false> *); +template void +__kmp_atomic_suspend_64<false, true>(int, kmp_atomic_flag_64<false, true> *); +template void +__kmp_atomic_suspend_64<true, false>(int, kmp_atomic_flag_64<true, false> *); /* This routine signals the thread specified by target_gtid to wake up after setting the sleep bit indicated by the flag argument to FALSE. @@ -1554,36 +1593,50 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { __kmp_lock_suspend_mx(th); - if (!flag) { // coming from __kmp_null_resume_wrapper + if (!flag || flag != th->th.th_sleep_loc) { + // coming from __kmp_null_resume_wrapper, or thread is now sleeping on a + // different location; wake up at new location flag = (C *)CCAST(void *, th->th.th_sleep_loc); } // First, check if the flag is null or its type has changed. If so, someone // else woke it up. - if (!flag || flag->get_type() != flag->get_ptr_type()) { // get_ptr_type - // simply shows what flag was cast to + if (!flag) { // Thread doesn't appear to be sleeping on anything KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag(%p)\n", - gtid, target_gtid, NULL)); + gtid, target_gtid, (void *)NULL)); __kmp_unlock_suspend_mx(th); return; + } else if (flag->get_type() != th->th.th_sleep_loc_type) { + // Flag type does not appear to match this function template; possibly the + // thread is sleeping on something else. Try null resume again. 
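[Editor's note, illustrative aside] The suspend/resume hunks above and below always publish th_sleep_loc together with th_sleep_loc_type before the thread blocks, clear both on every wake-up path, and let the resumer dispatch on the recorded type (falling back to __kmp_null_resume_wrapper on a mismatch). A minimal sketch of that "record what I am sleeping on" idea with standard C++ primitives; the types and names here are illustrative, not the runtime's:

    // sleep_loc_sketch.cpp -- illustrative only.
    #include <condition_variable>
    #include <mutex>

    enum class flag_kind { unset, demo64 };

    struct demo_flag64 {
      bool released = false;
      flag_kind kind() const { return flag_kind::demo64; }
      bool done() const { return released; }
    };

    struct worker {
      std::mutex mx;                 // stands in for the suspend mutex
      std::condition_variable cv;
      void *sleep_loc = nullptr;     // what this thread is sleeping on
      flag_kind sleep_loc_type = flag_kind::unset;
    };

    void suspend_on(worker &w, demo_flag64 &flag) {
      std::unique_lock<std::mutex> lk(w.mx);
      // Publish both the location and its type before blocking, so a waker
      // can tell which resume path applies.
      w.sleep_loc = &flag;
      w.sleep_loc_type = flag.kind();
      w.cv.wait(lk, [&] { return flag.done(); });
      // Clear the bookkeeping on every exit path.
      w.sleep_loc = nullptr;
      w.sleep_loc_type = flag_kind::unset;
    }

    void resume(worker &w) {
      std::lock_guard<std::mutex> lk(w.mx);
      // Dispatch on the recorded type; if nothing is recorded there is no
      // one to wake and the call is effectively a no-op.
      if (w.sleep_loc_type == flag_kind::demo64)
        static_cast<demo_flag64 *>(w.sleep_loc)->released = true;
      w.cv.notify_one();
    }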
+ KF_TRACE( + 5, + ("__kmp_resume_template: T#%d retrying, thread T#%d Mismatch flag(%p), " + "spin(%p) type=%d ptr_type=%d\n", + gtid, target_gtid, flag, flag->get(), flag->get_type(), + th->th.th_sleep_loc_type)); + __kmp_unlock_suspend_mx(th); + __kmp_null_resume_wrapper(th); + return; } else { // if multiple threads are sleeping, flag should be internally // referring to a specific thread here - typename C::flag_t old_spin = flag->unset_sleeping(); - if (!flag->is_sleeping_val(old_spin)) { + if (!flag->is_sleeping()) { KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " - "awake: flag(%p): " - "%u => %u\n", - gtid, target_gtid, flag->get(), old_spin, flag->load())); + "awake: flag(%p): %u\n", + gtid, target_gtid, flag->get(), (unsigned int)flag->load())); __kmp_unlock_suspend_mx(th); return; } - KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset " - "sleep bit for flag's loc(%p): " - "%u => %u\n", - gtid, target_gtid, flag->get(), old_spin, flag->load())); } + KMP_DEBUG_ASSERT(flag); + flag->unset_sleeping(); TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + + KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset " + "sleep bit for flag's loc(%p): %u\n", + gtid, target_gtid, flag->get(), (unsigned int)flag->load())); #ifdef DEBUG_SUSPEND { @@ -1609,12 +1662,19 @@ template <bool C, bool S> void __kmp_resume_64(int target_gtid, kmp_flag_64<C, S> *flag) { __kmp_resume_template(target_gtid, flag); } +template <bool C, bool S> +void __kmp_atomic_resume_64(int target_gtid, kmp_atomic_flag_64<C, S> *flag) { + __kmp_resume_template(target_gtid, flag); +} void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { __kmp_resume_template(target_gtid, flag); } template void __kmp_resume_32<false, true>(int, kmp_flag_32<false, true> *); +template void __kmp_resume_32<false, false>(int, kmp_flag_32<false, false> *); template void __kmp_resume_64<false, true>(int, kmp_flag_64<false, true> *); +template void +__kmp_atomic_resume_64<false, true>(int, kmp_atomic_flag_64<false, true> *); #if KMP_USE_MONITOR void __kmp_resume_monitor() { @@ -1741,8 +1801,12 @@ static int __kmp_get_xproc(void) { int r = 0; -#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD || KMP_OS_HURD +#if KMP_OS_LINUX + + __kmp_type_convert(sysconf(_SC_NPROCESSORS_CONF), &(r)); + +#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || \ + KMP_OS_HURD __kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r)); |
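[Editor's note, illustrative aside] The final z_Linux_util.cpp hunk switches the Linux processor-count query to _SC_NPROCESSORS_CONF while the other listed targets keep _SC_NPROCESSORS_ONLN. A small standalone comparison of the two POSIX sysconf queries, with error handling simplified:

    // nproc_sketch.cpp -- illustrative only.
    #include <unistd.h>
    #include <cstdio>

    int main() {
      long conf = sysconf(_SC_NPROCESSORS_CONF); // processors configured
      long onln = sysconf(_SC_NPROCESSORS_ONLN); // processors currently online
      if (conf < 0 || onln < 0) {
        std::perror("sysconf");
        return 1;
      }
      std::printf("configured=%ld online=%ld\n", conf, onln);
      return 0;
    }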