author      mikhnenko <[email protected]>   2025-07-15 20:05:43 +0300
committer   mikhnenko <[email protected]>   2025-07-15 20:52:16 +0300
commit      a40bd4f45bbc18fd95b1596e655b8942ceb2cf4b (patch)
tree        bce599ca02c778c277198de6d131d37db71997d0  /contrib/libs/cxxsupp/openmp
parent      728e0eaef4dc1f1152d2c3a4cc1bbdf597f3ef3d (diff)
Update contrib/libs/cxxsupp/openmp to 20.1.7
commit_hash:722dd5fe79203d22ad4a0be288ac0caeb6b3dd68
Diffstat (limited to 'contrib/libs/cxxsupp/openmp')
-rw-r--r--  contrib/libs/cxxsupp/openmp/.yandex_meta/__init__.py | 4
-rw-r--r--  contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report | 12
-rw-r--r--  contrib/libs/cxxsupp/openmp/.yandex_meta/override.nix | 9
-rw-r--r--  contrib/libs/cxxsupp/openmp/README.rst | 40
-rw-r--r--  contrib/libs/cxxsupp/openmp/exports_so.txt | 3
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp.h | 536
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_affinity.cpp | 2109
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_affinity.h | 286
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_alloc.cpp | 158
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_atomic.cpp | 8
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_atomic.h | 6
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_barrier.cpp | 79
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_barrier.h | 5
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_collapse.cpp | 1781
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_collapse.h | 247
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_config-linux.h | 8
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_csupport.cpp | 211
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp | 232
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_dispatch.h | 14
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_environment.cpp | 4
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h | 100
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_ftn_os.h | 2
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_global.cpp | 57
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp | 12
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc | 9
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc | 5
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_io.cpp | 19
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_lock.cpp | 8
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_lock.h | 26
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_os.h | 81
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_platform.h | 78
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_runtime.cpp | 1848
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_safe_c_api.h | 6
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_sched.cpp | 32
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_settings.cpp | 820
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_settings.h | 1
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_stats.h | 5
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_str.cpp | 23
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_str.h | 1
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp | 203
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_taskdeps.h | 25
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_tasking.cpp | 771
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_threadprivate.cpp | 12
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_utility.cpp | 85
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_utils.h | 55
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_version.cpp | 2
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_wait_release.h | 32
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_wrapper_getpid.h | 9
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_wrapper_malloc.h | 2
-rw-r--r--  contrib/libs/cxxsupp/openmp/omp-tools.h | 91
-rw-r--r--  contrib/libs/cxxsupp/openmp/omp.h | 19
-rw-r--r--  contrib/libs/cxxsupp/openmp/ompt-event-specific.h | 19
-rw-r--r--  contrib/libs/cxxsupp/openmp/ompt-general.cpp | 92
-rw-r--r--  contrib/libs/cxxsupp/openmp/ompt-internal.h | 5
-rw-r--r--  contrib/libs/cxxsupp/openmp/ompt-specific.cpp | 55
-rw-r--r--  contrib/libs/cxxsupp/openmp/ompt-specific.h | 35
-rw-r--r--  contrib/libs/cxxsupp/openmp/patches/fix_stdlib_resolving.patch | 18
-rw-r--r--  contrib/libs/cxxsupp/openmp/patches/remove_generation_date.sh | 7
-rw-r--r--  contrib/libs/cxxsupp/openmp/ya.make | 5
-rw-r--r--  contrib/libs/cxxsupp/openmp/z_Linux_asm.S | 789
-rw-r--r--  contrib/libs/cxxsupp/openmp/z_Linux_util.cpp | 571
61 files changed, 9200 insertions, 2587 deletions
diff --git a/contrib/libs/cxxsupp/openmp/.yandex_meta/__init__.py b/contrib/libs/cxxsupp/openmp/.yandex_meta/__init__.py
index 0fd8f88c8bb..398338cb3da 100644
--- a/contrib/libs/cxxsupp/openmp/.yandex_meta/__init__.py
+++ b/contrib/libs/cxxsupp/openmp/.yandex_meta/__init__.py
@@ -55,7 +55,7 @@ llvm_openmp = CMakeNinjaNixProject(
arcdir="contrib/libs/cxxsupp/openmp",
nixattr="llvmPackages_13.openmp",
install_subdir="runtime/src",
- ignore_commands=["perl"],
+ ignore_commands=["python3.10"],
flags=[
"-DOPENMP_ENABLE_LIBOMPTARGET=OFF",
"-DOPENMP_ENABLE_LIBOMP_PROFILING=OFF",
@@ -73,8 +73,10 @@ llvm_openmp = CMakeNinjaNixProject(
"kmp_stats_timing.h",
"kmp_stub.h",
"legacy/ittnotify.h",
+ "libperfstat.h",
"llvm/Support/TimeProfiler.h",
"ompd-specific.h",
+ "procfs.h",
],
platform_dispatchers=["kmp_config.h"],
post_build=post_build,
diff --git a/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report b/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report
index 756e43c4164..60c73f2edad 100644
--- a/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report
+++ b/contrib/libs/cxxsupp/openmp/.yandex_meta/devtools.licenses.report
@@ -128,6 +128,8 @@ BELONGS ya.make
kmp_barrier.cpp [7:8]
kmp_barrier.h [7:8]
kmp_cancel.cpp [4:5]
+ kmp_collapse.cpp [7:8]
+ kmp_collapse.h [7:8]
kmp_config-linux.h [6:7]
kmp_csupport.cpp [7:8]
kmp_debug.cpp [7:8]
@@ -167,6 +169,7 @@ BELONGS ya.make
kmp_tasking.cpp [7:8]
kmp_threadprivate.cpp [7:8]
kmp_utility.cpp [7:8]
+ kmp_utils.h [7:8]
kmp_version.cpp [7:8]
kmp_version.h [7:8]
kmp_wait_release.cpp [7:8]
@@ -196,6 +199,8 @@ BELONGS ya.make
kmp_barrier.cpp [7:8]
kmp_barrier.h [7:8]
kmp_cancel.cpp [4:5]
+ kmp_collapse.cpp [7:8]
+ kmp_collapse.h [7:8]
kmp_config-linux.h [6:7]
kmp_csupport.cpp [7:8]
kmp_debug.cpp [7:8]
@@ -235,6 +240,7 @@ BELONGS ya.make
kmp_tasking.cpp [7:8]
kmp_threadprivate.cpp [7:8]
kmp_utility.cpp [7:8]
+ kmp_utils.h [7:8]
kmp_version.cpp [7:8]
kmp_version.h [7:8]
kmp_wait_release.cpp [7:8]
@@ -330,6 +336,8 @@ BELONGS ya.make
kmp_barrier.cpp [9:9]
kmp_barrier.h [9:9]
kmp_cancel.cpp [6:6]
+ kmp_collapse.cpp [9:9]
+ kmp_collapse.h [9:9]
kmp_config-linux.h [8:8]
kmp_csupport.cpp [9:9]
kmp_debug.cpp [9:9]
@@ -369,6 +377,7 @@ BELONGS ya.make
kmp_tasking.cpp [9:9]
kmp_threadprivate.cpp [9:9]
kmp_utility.cpp [9:9]
+ kmp_utils.h [9:9]
kmp_version.cpp [9:9]
kmp_version.h [9:9]
kmp_wait_release.cpp [9:9]
@@ -398,6 +407,8 @@ BELONGS ya.make
kmp_barrier.cpp [9:9]
kmp_barrier.h [9:9]
kmp_cancel.cpp [6:6]
+ kmp_collapse.cpp [9:9]
+ kmp_collapse.h [9:9]
kmp_config-linux.h [8:8]
kmp_csupport.cpp [9:9]
kmp_debug.cpp [9:9]
@@ -437,6 +448,7 @@ BELONGS ya.make
kmp_tasking.cpp [9:9]
kmp_threadprivate.cpp [9:9]
kmp_utility.cpp [9:9]
+ kmp_utils.h [9:9]
kmp_version.cpp [9:9]
kmp_version.h [9:9]
kmp_wait_release.cpp [9:9]
diff --git a/contrib/libs/cxxsupp/openmp/.yandex_meta/override.nix b/contrib/libs/cxxsupp/openmp/.yandex_meta/override.nix
index 64b4741ad4a..071076d48b1 100644
--- a/contrib/libs/cxxsupp/openmp/.yandex_meta/override.nix
+++ b/contrib/libs/cxxsupp/openmp/.yandex_meta/override.nix
@@ -1,18 +1,15 @@
pkgs: attrs: with pkgs; with attrs; rec {
pname = "openmp";
- version = "15.0.7";
+ version = "20.1.7";
src = fetchFromGitHub {
owner = "llvm";
repo = "llvm-project";
rev = "llvmorg-${version}";
- hash = "sha256-wjuZQyXQ/jsmvy6y1aksCcEDXGBjuhpgngF3XQJ/T4s=";
+ hash = "sha256-OSd26CLKziKo/eM/5rhtcWd0AxdtJk0ELA5YIxqINKs=";
};
- # This hack makes message-converter.pl script to not emit time on every build.
- preConfigure = ''
- substituteInPlace "runtime/tools/message-converter.pl" --replace "\" on \" . localtime() . " ""
- '';
+ buildInputs = [ pkgs.python3 ];
sourceRoot = "source/openmp";
}
diff --git a/contrib/libs/cxxsupp/openmp/README.rst b/contrib/libs/cxxsupp/openmp/README.rst
index ffa49e4d2a4..2dfc8630858 100644
--- a/contrib/libs/cxxsupp/openmp/README.rst
+++ b/contrib/libs/cxxsupp/openmp/README.rst
@@ -119,6 +119,10 @@ Options for all Libraries
Compiler to use for testing. Defaults to the compiler that was also used for
building.
+**OPENMP_TEST_Fortran_COMPILER** = ``${CMAKE_Fortran_COMPILER}``
+ Compiler to use for testing. Defaults to the compiler that was also used for
+ building. Will default to flang if build is in-tree.
+
**OPENMP_LLVM_TOOLS_DIR** = ``/path/to/built/llvm/tools``
Additional path to search for LLVM tools needed by tests.
@@ -137,7 +141,7 @@ Options for all Libraries
Options for ``libomp``
----------------------
-**LIBOMP_ARCH** = ``aarch64|arm|i386|mic|mips|mips64|ppc64|ppc64le|x86_64|riscv64``
+**LIBOMP_ARCH** = ``aarch64|aarch64_32|arm|i386|loongarch64|mic|mips|mips64|ppc64|ppc64le|x86_64|riscv64|s390x``
The default value for this option is chosen based on probing the compiler for
architecture macros (e.g., is ``__x86_64__`` predefined by compiler?).
@@ -194,7 +198,7 @@ Optional Features
**LIBOMP_OMPT_SUPPORT** = ``ON|OFF``
Include support for the OpenMP Tools Interface (OMPT).
This option is supported and ``ON`` by default for x86, x86_64, AArch64,
- PPC64 and RISCV64 on Linux* and macOS*.
+ PPC64, RISCV64, LoongArch64, and s390x on Linux* and macOS*.
This option is ``OFF`` if this feature is not supported for the platform.
**LIBOMP_OMPT_OPTIONAL** = ``ON|OFF``
@@ -357,3 +361,35 @@ Advanced Builds with Various Options
**Footnotes**
.. [*] Other names and brands may be claimed as the property of others.
+
+How to Run Tests
+================
+
+There are following check-* make targets for tests.
+
+- ``check-ompt`` (ompt tests under runtime/test/ompt)
+- ``check-ompt-multiplex`` (ompt multiplex tests under tools/multiplex/tests)
+- ``check-libarcher`` (libarcher tests under tools/archer/tests)
+- ``check-libomp`` (libomp tests under runtime/test. This includes check-ompt tests too)
+- ``check-libomptarget-*`` (libomptarget tests for specific target under libomptarget/test)
+- ``check-libomptarget`` (all check-libomptarget-* tests)
+- ``check-openmp`` (combination of all above tests excluding duplicates)
+
+For example, to run all available tests, use ``make check-openmp``.
+
+Options for Tests
+------------------
+Tests use lit framework.
+See `lit documentation <https://llvm.org/docs/CommandGuide/lit.html>`_ for lit options.
+
+**CHECK_OPENMP_ENV** = ``""``
+ Default environment variables which test process uses for ``check-openmp``
+ separated by space. This can be used for individual targets (``check-ompt``,
+ ``check-ompt-multiplex``, ``check-libarcher``, ``check-libomp`` and
+ ``check-libomptarget-*``) too. Note that each test still overrides
+ environment variables if needed. For example, to change barrier pattern to be
+ used from default hyper barrier to hierarchical barrier, run:
+
+.. code-block:: console
+
+ $ CHECK_OPENMP_ENV="KMP_PLAIN_BARRIER_PATTERN=hier,hier KMP_FORKJOIN_BARRIER_PATTERN=hier,hier KMP_REDUCTION_BARRIER_PATTERN=hier,hier" make check-openmp
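Beyond the make targets described above, individual suites can also be driven with ``llvm-lit`` once the build is configured. A hypothetical invocation (the binary location and configured test directory depend on your build tree; shown only to illustrate passing lit options such as ``-sv``):

.. code-block:: console

    $ /path/to/build/bin/llvm-lit -sv /path/to/build/projects/openmp/runtime/test/ompt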
diff --git a/contrib/libs/cxxsupp/openmp/exports_so.txt b/contrib/libs/cxxsupp/openmp/exports_so.txt
index ac188af3105..124c80a1422 100644
--- a/contrib/libs/cxxsupp/openmp/exports_so.txt
+++ b/contrib/libs/cxxsupp/openmp/exports_so.txt
@@ -26,6 +26,7 @@ VERSION {
# OMPT API
#
ompt_start_tool; # OMPT start interface
+ ompt_libomp_connect; # OMPT libomptarget interface
ompc_*; # omp.h renames some standard functions to ompc_*.
kmp_*; # Intel extensions.
@@ -71,10 +72,8 @@ VERSION {
__kmp_fork_call;
__kmp_invoke_microtask;
#if KMP_USE_MONITOR
- __kmp_launch_monitor;
__kmp_reap_monitor;
#endif
- __kmp_launch_worker;
__kmp_reap_worker;
__kmp_release_64;
__kmp_wait_64;
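The newly exported ``ompt_libomp_connect`` is the hook libomptarget uses to attach to libomp's OMPT support, while ``ompt_start_tool`` remains the entry point a tool library provides. For orientation, a minimal sketch of the latter against the standard ``omp-tools.h`` interface (the tool body itself is hypothetical):

    #include <omp-tools.h>

    // Returning non-zero from the initializer keeps OMPT active for this run.
    static int tool_initialize(ompt_function_lookup_t lookup,
                               int initial_device_num, ompt_data_t *tool_data) {
      return 1;
    }

    static void tool_finalize(ompt_data_t *tool_data) {}

    ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                              const char *runtime_version) {
      static ompt_start_tool_result_t result = {tool_initialize, tool_finalize,
                                                {0}};
      return &result;
    }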
diff --git a/contrib/libs/cxxsupp/openmp/kmp.h b/contrib/libs/cxxsupp/openmp/kmp.h
index 8c84b83b147..05a142a7b32 100644
--- a/contrib/libs/cxxsupp/openmp/kmp.h
+++ b/contrib/libs/cxxsupp/openmp/kmp.h
@@ -27,6 +27,9 @@
#ifndef KMP_STATIC_STEAL_ENABLED
#define KMP_STATIC_STEAL_ENABLED 1
#endif
+#define KMP_WEIGHTED_ITERATIONS_SUPPORTED \
+ (KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED && \
+ (KMP_ARCH_X86 || KMP_ARCH_X86_64))
#define TASK_CURRENT_NOT_QUEUED 0
#define TASK_CURRENT_QUEUED 1
@@ -60,7 +63,15 @@
#undef KMP_CANCEL_THREADS
#endif
+// Some WASI targets (e.g., wasm32-wasi-threads) do not support thread
+// cancellation.
+#if KMP_OS_WASI
+#undef KMP_CANCEL_THREADS
+#endif
+
+#if !KMP_OS_WASI
#include <signal.h>
+#endif
#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
@@ -92,7 +103,8 @@ class kmp_stats_list;
#define KMP_USE_HIER_SCHED KMP_AFFINITY_SUPPORTED
#endif
-#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED
+// OMPD_SKIP_HWLOC used in libompd/omp-icv.cpp to avoid OMPD depending on hwloc
+#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED && !defined(OMPD_SKIP_HWLOC)
#include "hwloc.h"
#ifndef HWLOC_OBJ_NUMANODE
#define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
@@ -121,7 +133,7 @@ class kmp_stats_list;
#endif
#include "kmp_i18n.h"
-#define KMP_HANDLE_SIGNALS (KMP_OS_UNIX || KMP_OS_WINDOWS)
+#define KMP_HANDLE_SIGNALS ((KMP_OS_UNIX && !KMP_OS_WASI) || KMP_OS_WINDOWS)
#include "kmp_wrapper_malloc.h"
#if KMP_OS_UNIX
@@ -180,6 +192,7 @@ class kmp_stats_list;
#define KMP_NSEC_PER_SEC 1000000000L
#define KMP_USEC_PER_SEC 1000000L
+#define KMP_NSEC_PER_USEC 1000L
/*!
@ingroup BASIC_TYPES
@@ -519,6 +532,15 @@ enum clock_function_type {
enum mic_type { non_mic, mic1, mic2, mic3, dummy };
#endif
+// OpenMP 3.1 - Nested num threads array
+typedef struct kmp_nested_nthreads_t {
+ int *nth;
+ int size;
+ int used;
+} kmp_nested_nthreads_t;
+
+extern kmp_nested_nthreads_t __kmp_nested_nth;
+
/* -- fast reduction stuff ------------------------------------------------ */
#undef KMP_FAST_REDUCTION_BARRIER
@@ -597,7 +619,9 @@ typedef int PACKED_REDUCTION_METHOD_T;
#endif
#if KMP_OS_UNIX
+#if !KMP_OS_WASI
#include <dlfcn.h>
+#endif
#include <pthread.h>
#endif
@@ -675,7 +699,7 @@ typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *,
extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
#endif /* KMP_OS_WINDOWS */
-#if KMP_USE_HWLOC
+#if KMP_USE_HWLOC && !defined(OMPD_SKIP_HWLOC)
extern hwloc_topology_t __kmp_hwloc_topology;
extern int __kmp_hwloc_error;
#endif
@@ -690,10 +714,12 @@ extern size_t __kmp_affin_mask_size;
#define KMP_CPU_ISSET(i, mask) (mask)->is_set(i)
#define KMP_CPU_CLR(i, mask) (mask)->clear(i)
#define KMP_CPU_ZERO(mask) (mask)->zero()
+#define KMP_CPU_ISEMPTY(mask) (mask)->empty()
#define KMP_CPU_COPY(dest, src) (dest)->copy(src)
#define KMP_CPU_AND(dest, src) (dest)->bitwise_and(src)
#define KMP_CPU_COMPLEMENT(max_bit_number, mask) (mask)->bitwise_not()
#define KMP_CPU_UNION(dest, src) (dest)->bitwise_or(src)
+#define KMP_CPU_EQUAL(dest, src) (dest)->is_equal(src)
#define KMP_CPU_ALLOC(ptr) (ptr = __kmp_affinity_dispatch->allocate_mask())
#define KMP_CPU_FREE(ptr) __kmp_affinity_dispatch->deallocate_mask(ptr)
#define KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr)
@@ -730,6 +756,8 @@ public:
virtual void clear(int i) {}
// Zero out entire mask
virtual void zero() {}
+ // Check whether mask is empty
+ virtual bool empty() const { return true; }
// Copy src into this mask
virtual void copy(const Mask *src) {}
// this &= rhs
@@ -738,6 +766,8 @@ public:
virtual void bitwise_or(const Mask *rhs) {}
// this = ~this
virtual void bitwise_not() {}
+ // this == rhs
+ virtual bool is_equal(const Mask *rhs) const { return false; }
// API for iterating over an affinity mask
// for (int i = mask->begin(); i != mask->end(); i = mask->next(i))
virtual int begin() const { return 0; }
@@ -753,6 +783,15 @@ public:
// Only 1 DWORD in the mask should have any procs set.
// Return the appropriate index, or -1 for an invalid mask.
virtual int get_proc_group() const { return -1; }
+ int get_max_cpu() const {
+ int cpu;
+ int max_cpu = -1;
+ KMP_CPU_SET_ITERATE(cpu, this) {
+ if (cpu > max_cpu)
+ max_cpu = cpu;
+ }
+ return max_cpu;
+ }
};
void *operator new(size_t n);
void operator delete(void *p);
@@ -789,6 +828,33 @@ private:
typedef KMPAffinity::Mask kmp_affin_mask_t;
extern KMPAffinity *__kmp_affinity_dispatch;
+#ifndef KMP_OS_AIX
+class kmp_affinity_raii_t {
+ kmp_affin_mask_t *mask;
+ bool restored;
+
+public:
+ kmp_affinity_raii_t(const kmp_affin_mask_t *new_mask = nullptr)
+ : mask(nullptr), restored(false) {
+ if (KMP_AFFINITY_CAPABLE()) {
+ KMP_CPU_ALLOC(mask);
+ KMP_ASSERT(mask != NULL);
+ __kmp_get_system_affinity(mask, /*abort_on_error=*/true);
+ if (new_mask)
+ __kmp_set_system_affinity(new_mask, /*abort_on_error=*/true);
+ }
+ }
+ void restore() {
+ if (mask && KMP_AFFINITY_CAPABLE() && !restored) {
+ __kmp_set_system_affinity(mask, /*abort_on_error=*/true);
+ KMP_CPU_FREE(mask);
+ }
+ restored = true;
+ }
+ ~kmp_affinity_raii_t() { restore(); }
+};
+#endif // !KMP_OS_AIX
+
// Declare local char buffers with this size for printing debug and info
// messages, using __kmp_affinity_print_mask().
#define KMP_AFFIN_MASK_PRINT_LEN 1024
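The ``kmp_affinity_raii_t`` class added above is a scope guard: the constructor snapshots the current system affinity (optionally installing ``new_mask``), and ``restore()`` or the destructor puts the original mask back and frees the snapshot. A usage sketch (hypothetical caller, assuming affinity is capable):

    {
      kmp_affinity_raii_t previous_affinity(new_mask); // save current, bind to new_mask
      // ... probe the machine topology while constrained to new_mask ...
    } // destructor runs restore(): original mask re-applied, snapshot freed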
@@ -823,27 +889,77 @@ enum affinity_top_method {
affinity_top_method_default
};
-#define affinity_respect_mask_default (-1)
+#define affinity_respect_mask_default (2)
+
+typedef struct kmp_affinity_flags_t {
+ unsigned dups : 1;
+ unsigned verbose : 1;
+ unsigned warnings : 1;
+ unsigned respect : 2;
+ unsigned reset : 1;
+ unsigned initialized : 1;
+ unsigned core_types_gran : 1;
+ unsigned core_effs_gran : 1;
+ unsigned omp_places : 1;
+ unsigned reserved : 22;
+} kmp_affinity_flags_t;
+KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4);
+
+typedef struct kmp_affinity_ids_t {
+ int os_id;
+ int ids[KMP_HW_LAST];
+} kmp_affinity_ids_t;
+
+typedef struct kmp_affinity_attrs_t {
+ int core_type : 8;
+ int core_eff : 8;
+ unsigned valid : 1;
+ unsigned reserved : 15;
+} kmp_affinity_attrs_t;
+#define KMP_AFFINITY_ATTRS_UNKNOWN \
+ { KMP_HW_CORE_TYPE_UNKNOWN, kmp_hw_attr_t::UNKNOWN_CORE_EFF, 0, 0 }
+
+typedef struct kmp_affinity_t {
+ char *proclist;
+ enum affinity_type type;
+ kmp_hw_t gran;
+ int gran_levels;
+ kmp_affinity_attrs_t core_attr_gran;
+ int compact;
+ int offset;
+ kmp_affinity_flags_t flags;
+ unsigned num_masks;
+ kmp_affin_mask_t *masks;
+ kmp_affinity_ids_t *ids;
+ kmp_affinity_attrs_t *attrs;
+ unsigned num_os_id_masks;
+ kmp_affin_mask_t *os_id_masks;
+ const char *env_var;
+} kmp_affinity_t;
+
+#define KMP_AFFINITY_INIT(env) \
+ { \
+ nullptr, affinity_default, KMP_HW_UNKNOWN, -1, KMP_AFFINITY_ATTRS_UNKNOWN, \
+ 0, 0, \
+ {TRUE, FALSE, TRUE, affinity_respect_mask_default, FALSE, FALSE, \
+ FALSE, FALSE, FALSE}, \
+ 0, nullptr, nullptr, nullptr, 0, nullptr, env \
+ }
-extern enum affinity_type __kmp_affinity_type; /* Affinity type */
-extern kmp_hw_t __kmp_affinity_gran; /* Affinity granularity */
-extern int __kmp_affinity_gran_levels; /* corresponding int value */
-extern int __kmp_affinity_dups; /* Affinity duplicate masks */
extern enum affinity_top_method __kmp_affinity_top_method;
-extern int __kmp_affinity_compact; /* Affinity 'compact' value */
-extern int __kmp_affinity_offset; /* Affinity offset value */
-extern int __kmp_affinity_verbose; /* Was verbose specified for KMP_AFFINITY? */
-extern int __kmp_affinity_warnings; /* KMP_AFFINITY warnings enabled ? */
-extern int __kmp_affinity_respect_mask; // Respect process' init affinity mask?
-extern char *__kmp_affinity_proclist; /* proc ID list */
-extern kmp_affin_mask_t *__kmp_affinity_masks;
-extern unsigned __kmp_affinity_num_masks;
+extern kmp_affinity_t __kmp_affinity;
+extern kmp_affinity_t __kmp_hh_affinity;
+extern kmp_affinity_t *__kmp_affinities[2];
+
extern void __kmp_affinity_bind_thread(int which);
extern kmp_affin_mask_t *__kmp_affin_fullMask;
extern kmp_affin_mask_t *__kmp_affin_origMask;
extern char *__kmp_cpuinfo_file;
-extern bool __kmp_affin_reset;
+
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_first_osid_with_ecore;
+#endif
#endif /* KMP_AFFINITY_SUPPORTED */
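This hunk folds the former per-setting globals into one ``kmp_affinity_t`` instance per controlling environment variable (``__kmp_affinity`` for KMP_AFFINITY, with ``__kmp_hh_affinity`` presumably covering the hidden-helper threads). Call sites migrate along these lines (illustrative only, mirroring later hunks, not verbatim from the patch):

    // before:  if (__kmp_affinity_verbose) ...
    // after:   if (__kmp_affinity.flags.verbose) ...
    // before:  n = __kmp_affinity_num_masks;
    // after:   n = __kmp_affinity.num_masks;
    // before:  if (__kmp_affinity_type == affinity_none) ...
    // after:   if (__kmp_affinity.type == affinity_none) ...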
@@ -882,7 +998,7 @@ extern char *__kmp_tool_libraries;
#define KMP_AFFINITY_NON_PROC_BIND \
((__kmp_nested_proc_bind.bind_types[0] == proc_bind_false || \
__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) && \
- (__kmp_affinity_num_masks > 0 || __kmp_affinity_type == affinity_balanced))
+ (__kmp_affinity.num_masks > 0 || __kmp_affinity.type == affinity_balanced))
#endif /* KMP_AFFINITY_SUPPORTED */
extern int __kmp_affinity_num_places;
@@ -1009,6 +1125,7 @@ typedef struct kmp_allocator_t {
kmp_allocator_t *fb_data;
kmp_uint64 pool_size;
kmp_uint64 pool_used;
+ bool pinned;
} kmp_allocator_t;
extern omp_allocator_handle_t __kmpc_init_allocator(int gtid,
@@ -1044,6 +1161,12 @@ extern void __kmp_init_target_mem();
/* ------------------------------------------------------------------------ */
+#if ENABLE_LIBOMPTARGET
+extern void __kmp_init_target_task();
+#endif
+
+/* ------------------------------------------------------------------------ */
+
#define KMP_UINT64_MAX \
(~((kmp_uint64)1 << ((sizeof(kmp_uint64) * (1 << 3)) - 1)))
@@ -1053,23 +1176,41 @@ extern void __kmp_init_target_mem();
#if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX
#define KMP_MAX_NTH PTHREAD_THREADS_MAX
#else
+#ifdef __ve__
+// VE's pthread supports only up to 64 threads per a VE process.
+// Please check p. 14 of following documentation for more details.
+// https://sxauroratsubasa.sakura.ne.jp/documents/veos/en/VEOS_high_level_design.pdf
+#define KMP_MAX_NTH 64
+#else
#define KMP_MAX_NTH INT_MAX
#endif
+#endif
#endif /* KMP_MAX_NTH */
#ifdef PTHREAD_STACK_MIN
-#define KMP_MIN_STKSIZE PTHREAD_STACK_MIN
+#define KMP_MIN_STKSIZE ((size_t)PTHREAD_STACK_MIN)
#else
#define KMP_MIN_STKSIZE ((size_t)(32 * 1024))
#endif
+#if KMP_OS_AIX && KMP_ARCH_PPC
+#define KMP_MAX_STKSIZE 0x10000000 /* 256Mb max size on 32-bit AIX */
+#else
#define KMP_MAX_STKSIZE (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1)))
+#endif
#if KMP_ARCH_X86
#define KMP_DEFAULT_STKSIZE ((size_t)(2 * 1024 * 1024))
#elif KMP_ARCH_X86_64
#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
#define KMP_BACKUP_STKSIZE ((size_t)(2 * 1024 * 1024))
+#elif KMP_ARCH_VE
+// Minimum stack size for pthread for VE is 4MB.
+// https://www.hpc.nec/documents/veos/en/glibc/Difference_Points_glibc.htm
+#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
+#elif KMP_OS_AIX
+// The default stack size for worker threads on AIX is 4MB.
+#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
#else
#define KMP_DEFAULT_STKSIZE ((size_t)(1024 * 1024))
#endif
@@ -1091,13 +1232,13 @@ extern void __kmp_init_target_mem();
#define KMP_MAX_STKPADDING (2 * 1024 * 1024)
#define KMP_BLOCKTIME_MULTIPLIER \
- (1000) /* number of blocktime units per second */
+ (1000000) /* number of blocktime units per second */
#define KMP_MIN_BLOCKTIME (0)
#define KMP_MAX_BLOCKTIME \
(INT_MAX) /* Must be this for "infinite" setting the work */
-/* __kmp_blocktime is in milliseconds */
-#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200))
+/* __kmp_blocktime is in microseconds */
+#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200000))
#if KMP_USE_MONITOR
#define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(1024 * 1024))
@@ -1124,22 +1265,21 @@ extern void __kmp_init_target_mem();
#if KMP_OS_UNIX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
// HW TSC is used to reduce overhead (clock tick instead of nanosecond).
extern kmp_uint64 __kmp_ticks_per_msec;
+extern kmp_uint64 __kmp_ticks_per_usec;
#if KMP_COMPILER_ICC || KMP_COMPILER_ICX
#define KMP_NOW() ((kmp_uint64)_rdtsc())
#else
#define KMP_NOW() __kmp_hardware_timestamp()
#endif
-#define KMP_NOW_MSEC() (KMP_NOW() / __kmp_ticks_per_msec)
#define KMP_BLOCKTIME_INTERVAL(team, tid) \
- (KMP_BLOCKTIME(team, tid) * __kmp_ticks_per_msec)
+ ((kmp_uint64)KMP_BLOCKTIME(team, tid) * __kmp_ticks_per_usec)
#define KMP_BLOCKING(goal, count) ((goal) > KMP_NOW())
#else
// System time is retrieved sporadically while blocking.
extern kmp_uint64 __kmp_now_nsec();
#define KMP_NOW() __kmp_now_nsec()
-#define KMP_NOW_MSEC() (KMP_NOW() / KMP_USEC_PER_SEC)
#define KMP_BLOCKTIME_INTERVAL(team, tid) \
- (KMP_BLOCKTIME(team, tid) * KMP_USEC_PER_SEC)
+ ((kmp_uint64)KMP_BLOCKTIME(team, tid) * (kmp_uint64)KMP_NSEC_PER_USEC)
#define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW())
#endif
#endif // KMP_USE_MONITOR
@@ -1217,12 +1357,24 @@ extern kmp_uint64 __kmp_now_nsec();
/* TODO: tune for KMP_OS_NETBSD */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */
+#elif KMP_OS_OPENBSD
+/* TODO: tune for KMP_OS_OPENBSD */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
+#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */
#elif KMP_OS_HURD
/* TODO: tune for KMP_OS_HURD */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */
-#elif KMP_OS_OPENBSD
-/* TODO: tune for KMP_OS_OPENBSD */
+#elif KMP_OS_SOLARIS
+/* TODO: tune for KMP_OS_SOLARIS */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
+#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */
+#elif KMP_OS_WASI
+/* TODO: tune for KMP_OS_WASI */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
+#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */
+#elif KMP_OS_AIX
+/* TODO: tune for KMP_OS_AIX */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */
#endif
@@ -1251,8 +1403,6 @@ typedef struct kmp_cpuinfo {
int stepping; // CPUID(1).EAX[3:0] ( Stepping )
kmp_cpuinfo_flags_t flags;
int apic_id;
- int physical_id;
- int logical_id;
kmp_uint64 frequency; // Nominal CPU frequency in Hz.
char name[3 * sizeof(kmp_cpuid_t)]; // CPUID(0x80000002,0x80000003,0x80000004)
} kmp_cpuinfo_t;
@@ -1472,6 +1622,7 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
// requested. Uses a timed TPAUSE, and exponential backoff. If TPAUSE isn't
// available, fall back to the regular CPU pause and yield combination.
#if KMP_HAVE_UMWAIT
+#define KMP_TPAUSE_MAX_MASK ((kmp_uint64)0xFFFF)
#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \
{ \
if (__kmp_tpause_enabled) { \
@@ -1480,7 +1631,7 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
} else { \
__kmp_tpause(__kmp_tpause_hint, (time)); \
} \
- (time) *= 2; \
+ (time) = (time << 1 | 1) & KMP_TPAUSE_MAX_MASK; \
} else { \
KMP_CPU_PAUSE(); \
if ((KMP_TRY_YIELD_OVERSUB)) { \
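The updated backoff still roughly doubles ``time`` each round, but ``(time << 1 | 1)`` keeps the interval non-zero even when it starts at 0, and the mask caps it at ``KMP_TPAUSE_MAX_MASK`` (0xFFFF). A small standalone illustration of the resulting sequence (hypothetical, mirroring only the macro's arithmetic):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint64_t kMask = 0xFFFF; // stands in for KMP_TPAUSE_MAX_MASK
      uint64_t time = 0;
      for (int i = 0; i < 20; ++i) {
        time = (time << 1 | 1) & kMask;
        printf("%llu ", (unsigned long long)time); // 1 3 7 15 ... 32767 65535 65535 ...
      }
      printf("\n");
      return 0;
    }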
@@ -1734,12 +1885,9 @@ typedef struct kmp_sched_flags {
unsigned ordered : 1;
unsigned nomerge : 1;
unsigned contains_last : 1;
-#if KMP_USE_HIER_SCHED
- unsigned use_hier : 1;
- unsigned unused : 28;
-#else
- unsigned unused : 29;
-#endif
+ unsigned use_hier : 1; // Used in KMP_USE_HIER_SCHED code
+ unsigned use_hybrid : 1; // Used in KMP_WEIGHTED_ITERATIONS_SUPPORTED code
+ unsigned unused : 27;
} kmp_sched_flags_t;
KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);
@@ -1753,26 +1901,37 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 st;
kmp_int32 tc;
kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+ kmp_uint32 ordered_lower;
+ kmp_uint32 ordered_upper;
+
// KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
// a) parm3 is properly aligned and
// b) all parm1-4 are on the same cache line.
// Because of parm1-4 are used together, performance seems to be better
// if they are on the same cache line (not measured though).
- struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
- kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should
- kmp_int32 parm2; // make no real change at least while padding is off.
+ struct KMP_ALIGN(32) {
+ kmp_int32 parm1;
+ kmp_int32 parm2;
kmp_int32 parm3;
kmp_int32 parm4;
};
- kmp_uint32 ordered_lower;
- kmp_uint32 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ kmp_uint32 pchunks;
+ kmp_uint32 num_procs_with_pcore;
+ kmp_int32 first_thread_with_ecore;
+#endif
#if KMP_OS_WINDOWS
kmp_int32 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info32_t;
+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128);
+#endif
+
typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 count; // current chunk number for static & static-steal scheduling
kmp_int64 ub; /* upper-bound */
@@ -1781,14 +1940,16 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 st; /* stride */
kmp_int64 tc; /* trip count (number of iterations) */
kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+ kmp_uint64 ordered_lower;
+ kmp_uint64 ordered_upper;
/* parm[1-4] are used in different ways by different scheduling algorithms */
- // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+ // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.
// Because of parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).
-
struct KMP_ALIGN(32) {
kmp_int64 parm1;
kmp_int64 parm2;
@@ -1796,12 +1957,21 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 parm4;
};
- kmp_uint64 ordered_lower;
- kmp_uint64 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ kmp_uint64 pchunks;
+ kmp_uint64 num_procs_with_pcore;
+ kmp_int64 first_thread_with_ecore;
+#endif
+
#if KMP_OS_WINDOWS
kmp_int64 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info64_t;
+
+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128);
+#endif
+
#else /* KMP_STATIC_STEAL_ENABLED */
typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 lb;
@@ -2011,6 +2181,7 @@ typedef struct kmp_internal_control {
int nproc; /* internal control for #threads for next parallel region (per
thread) */
int thread_limit; /* internal control for thread-limit-var */
+ int task_thread_limit; /* internal control for thread-limit-var of a task*/
int max_active_levels; /* internal control for max_active_levels */
kmp_r_sched_t
sched; /* internal control for runtime schedule {sched,chunk} pair */
@@ -2337,19 +2508,30 @@ typedef struct kmp_dephash_entry kmp_dephash_entry_t;
#define KMP_DEP_MTX 0x4
#define KMP_DEP_SET 0x8
#define KMP_DEP_ALL 0x80
-// Compiler sends us this info:
+// Compiler sends us this info. Note: some test cases contain an explicit copy
+// of this struct and should be in sync with any changes here.
typedef struct kmp_depend_info {
kmp_intptr_t base_addr;
size_t len;
union {
kmp_uint8 flag; // flag as an unsigned char
struct { // flag as a set of 8 bits
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ /* Same fields as in the #else branch, but in reverse order */
+ unsigned all : 1;
+ unsigned unused : 3;
+ unsigned set : 1;
+ unsigned mtx : 1;
+ unsigned out : 1;
+ unsigned in : 1;
+#else
unsigned in : 1;
unsigned out : 1;
unsigned mtx : 1;
unsigned set : 1;
unsigned unused : 3;
unsigned all : 1;
+#endif
} flags;
};
} kmp_depend_info_t;
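The reversed declaration order matters because the union overlays these 1-bit fields on the 8-bit ``flag`` byte, and big-endian ABIs typically allocate bit-fields starting from the most significant bit; with the reversal, the numeric ``KMP_DEP_*`` values line up on either layout. A hypothetical in-runtime check (not part of the patch) illustrates the invariant:

    kmp_depend_info_t dep;
    dep.flag = 0;      // clear all dependence bits
    dep.flags.mtx = 1; // mark a mutexinoutset dependence
    KMP_DEBUG_ASSERT(dep.flag == KMP_DEP_MTX); // 0x4 with either field order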
@@ -2424,6 +2606,63 @@ typedef struct {
} ed;
} kmp_event_t;
+#if OMPX_TASKGRAPH
+// Initial number of allocated nodes while recording
+#define INIT_MAPSIZE 50
+
+typedef struct kmp_taskgraph_flags { /*This needs to be exactly 32 bits */
+ unsigned nowait : 1;
+ unsigned re_record : 1;
+ unsigned reserved : 30;
+} kmp_taskgraph_flags_t;
+
+/// Represents a TDG node
+typedef struct kmp_node_info {
+ kmp_task_t *task; // Pointer to the actual task
+ kmp_int32 *successors; // Array of the succesors ids
+ kmp_int32 nsuccessors; // Number of succesors of the node
+ std::atomic<kmp_int32>
+ npredecessors_counter; // Number of predessors on the fly
+ kmp_int32 npredecessors; // Total number of predecessors
+ kmp_int32 successors_size; // Number of allocated succesors ids
+ kmp_taskdata_t *parent_task; // Parent implicit task
+} kmp_node_info_t;
+
+/// Represent a TDG's current status
+typedef enum kmp_tdg_status {
+ KMP_TDG_NONE = 0,
+ KMP_TDG_RECORDING = 1,
+ KMP_TDG_READY = 2
+} kmp_tdg_status_t;
+
+/// Structure that contains a TDG
+typedef struct kmp_tdg_info {
+ kmp_int32 tdg_id; // Unique idenfifier of the TDG
+ kmp_taskgraph_flags_t tdg_flags; // Flags related to a TDG
+ kmp_int32 map_size; // Number of allocated TDG nodes
+ kmp_int32 num_roots; // Number of roots tasks int the TDG
+ kmp_int32 *root_tasks; // Array of tasks identifiers that are roots
+ kmp_node_info_t *record_map; // Array of TDG nodes
+ kmp_tdg_status_t tdg_status =
+ KMP_TDG_NONE; // Status of the TDG (recording, ready...)
+ std::atomic<kmp_int32> num_tasks; // Number of TDG nodes
+ kmp_bootstrap_lock_t
+ graph_lock; // Protect graph attributes when updated via taskloop_recur
+ // Taskloop reduction related
+ void *rec_taskred_data; // Data to pass to __kmpc_task_reduction_init or
+ // __kmpc_taskred_init
+ kmp_int32 rec_num_taskred;
+} kmp_tdg_info_t;
+
+extern int __kmp_tdg_dot;
+extern kmp_int32 __kmp_max_tdgs;
+extern kmp_tdg_info_t **__kmp_global_tdgs;
+extern kmp_int32 __kmp_curr_tdg_idx;
+extern kmp_int32 __kmp_successors_size;
+extern std::atomic<kmp_int32> __kmp_tdg_task_id;
+extern kmp_int32 __kmp_num_tdg;
+#endif
+
#ifdef BUILD_TIED_TASK_STACK
/* Tied Task stack definitions */
@@ -2442,6 +2681,34 @@ typedef struct kmp_task_stack {
#endif // BUILD_TIED_TASK_STACK
typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ /* Same fields as in the #else branch, but in reverse order */
+#if OMPX_TASKGRAPH
+ unsigned reserved31 : 5;
+ unsigned onced : 1;
+#else
+ unsigned reserved31 : 6;
+#endif
+ unsigned target : 1;
+ unsigned native : 1;
+ unsigned freed : 1;
+ unsigned complete : 1;
+ unsigned executing : 1;
+ unsigned started : 1;
+ unsigned team_serial : 1;
+ unsigned tasking_ser : 1;
+ unsigned task_serial : 1;
+ unsigned tasktype : 1;
+ unsigned reserved : 8;
+ unsigned hidden_helper : 1;
+ unsigned detachable : 1;
+ unsigned priority_specified : 1;
+ unsigned proxy : 1;
+ unsigned destructors_thunk : 1;
+ unsigned merged_if0 : 1;
+ unsigned final : 1;
+ unsigned tiedness : 1;
+#else
/* Compiler flags */ /* Total compiler flags must be 16 bits */
unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
unsigned final : 1; /* task is final(1) so execute immediately */
@@ -2471,10 +2738,20 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
unsigned complete : 1; /* 1==complete, 0==not complete */
unsigned freed : 1; /* 1==freed, 0==allocated */
unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
- unsigned reserved31 : 7; /* reserved for library use */
-
+ unsigned target : 1;
+#if OMPX_TASKGRAPH
+ unsigned onced : 1; /* 1==ran once already, 0==never ran, record & replay purposes */
+ unsigned reserved31 : 5; /* reserved for library use */
+#else
+ unsigned reserved31 : 6; /* reserved for library use */
+#endif
+#endif
} kmp_tasking_flags_t;
+typedef struct kmp_target_data {
+ void *async_handle; // libomptarget async handle for task completion query
+} kmp_target_data_t;
+
struct kmp_taskdata { /* aligned during dynamic allocation */
kmp_int32 td_task_id; /* id, assigned by debugger */
kmp_tasking_flags_t td_flags; /* task flags */
@@ -2517,6 +2794,11 @@ struct kmp_taskdata { /* aligned during dynamic allocation */
#if OMPT_SUPPORT
ompt_task_info_t ompt_task_info;
#endif
+#if OMPX_TASKGRAPH
+ bool is_taskgraph = 0; // whether the task is within a TDG
+ kmp_tdg_info_t *tdg; // used to associate task with a TDG
+#endif
+ kmp_target_data_t td_target_data;
}; // struct kmp_taskdata
// Make sure padding above worked
@@ -2600,6 +2882,11 @@ union KMP_ALIGN_CACHE kmp_task_team {
char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)];
};
+typedef struct kmp_task_team_list_t {
+ kmp_task_team_t *task_team;
+ kmp_task_team_list_t *next;
+} kmp_task_team_list_t;
+
#if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5)
// Free lists keep same-size free memory slots for fast memory allocation
// routines
@@ -2680,11 +2967,19 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
#if KMP_AFFINITY_SUPPORTED
kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */
+ kmp_affinity_ids_t th_topology_ids; /* thread's current topology ids */
+ kmp_affinity_attrs_t th_topology_attrs; /* thread's current topology attrs */
#endif
omp_allocator_handle_t th_def_allocator; /* default allocator */
/* The data set by the primary thread at reinit, then R/W by the worker */
KMP_ALIGN_CACHE int
th_set_nproc; /* if > 0, then only use this request for the next fork */
+ int *th_set_nested_nth;
+ bool th_nt_strict; // num_threads clause has strict modifier
+ ident_t *th_nt_loc; // loc for strict modifier
+ int th_nt_sev; // error severity for strict modifier
+ const char *th_nt_msg; // error message for strict modifier
+ int th_set_nested_nth_sz;
#if KMP_NESTED_HOT_TEAMS
kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */
#endif
@@ -2735,10 +3030,6 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
kmp_task_team_t *th_task_team; // Task team struct
kmp_taskdata_t *th_current_task; // Innermost Task being executed
kmp_uint8 th_task_state; // alternating 0/1 for task team identification
- kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state
- // at nested levels
- kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack
- kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack
kmp_uint32 th_reap_state; // Non-zero indicates thread is not
// tasking, thus safe to reap
@@ -2860,6 +3151,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
kmp_disp_t *t_dispatch; // thread's dispatch data
kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2
kmp_proc_bind_t t_proc_bind; // bind type for par region
+ int t_primary_task_state; // primary thread's task state saved
#if USE_ITT_BUILD
kmp_uint64 t_region_time; // region begin timestamp
#endif /* USE_ITT_BUILD */
@@ -2929,8 +3221,15 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
void *t_stack_id; // team specific stack stitching id (for ittnotify)
#endif /* USE_ITT_BUILD */
distributedBarrier *b; // Distributed barrier data associated with team
+ kmp_nested_nthreads_t *t_nested_nth;
} kmp_base_team_t;
+// Assert that the list structure fits and aligns within
+// the double task team pointer
+KMP_BUILD_ASSERT(sizeof(kmp_task_team_t *[2]) == sizeof(kmp_task_team_list_t));
+KMP_BUILD_ASSERT(alignof(kmp_task_team_t *[2]) ==
+ alignof(kmp_task_team_list_t));
+
union KMP_ALIGN_CACHE kmp_team {
kmp_base_team_t t;
double t_align; /* use worst case alignment */
@@ -3167,6 +3466,7 @@ extern int __kmp_sys_max_nth; /* system-imposed maximum number of threads */
extern int __kmp_max_nth;
// maximum total number of concurrently-existing threads in a contention group
extern int __kmp_cg_max_nth;
+extern int __kmp_task_max_nth; // max threads used in a task
extern int __kmp_teams_max_nth; // max threads used in a teams construct
extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and
__kmp_root */
@@ -3178,9 +3478,22 @@ extern int __kmp_tp_capacity; /* capacity of __kmp_threads if threadprivate is
used (fixed) */
extern int __kmp_tp_cached; /* whether threadprivate cache has been created
(__kmpc_threadprivate_cached()) */
-extern int __kmp_dflt_blocktime; /* number of milliseconds to wait before
+extern int __kmp_dflt_blocktime; /* number of microseconds to wait before
blocking (env setting) */
+extern char __kmp_blocktime_units; /* 'm' or 'u' to note units specified */
extern bool __kmp_wpolicy_passive; /* explicitly set passive wait policy */
+
+// Convert raw blocktime from ms to us if needed.
+static inline void __kmp_aux_convert_blocktime(int *bt) {
+ if (__kmp_blocktime_units == 'm') {
+ if (*bt > INT_MAX / 1000) {
+ *bt = INT_MAX / 1000;
+ KMP_INFORM(MaxValueUsing, "kmp_set_blocktime(ms)", bt);
+ }
+ *bt = *bt * 1000;
+ }
+}
+
#if KMP_USE_MONITOR
extern int
__kmp_monitor_wakeups; /* number of times monitor wakes up per second */
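Together with the earlier switch of ``KMP_BLOCKTIME_MULTIPLIER`` and ``KMP_DEFAULT_BLOCKTIME``, blocktime is now stored internally in microseconds while the default user-facing unit stays milliseconds; ``__kmp_aux_convert_blocktime`` scales by 1000 and clamps at ``INT_MAX / 1000`` first so the multiplication cannot overflow. For example (hypothetical caller, assuming ``__kmp_blocktime_units == 'm'``):

    int bt = 200;                     // user-visible value, in milliseconds
    __kmp_aux_convert_blocktime(&bt); // bt becomes 200 * 1000 = 200000 microseconds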
@@ -3245,15 +3558,6 @@ extern enum mic_type __kmp_mic_type;
extern double __kmp_load_balance_interval; // load balance algorithm interval
#endif /* USE_LOAD_BALANCE */
-// OpenMP 3.1 - Nested num threads array
-typedef struct kmp_nested_nthreads_t {
- int *nth;
- int size;
- int used;
-} kmp_nested_nthreads_t;
-
-extern kmp_nested_nthreads_t __kmp_nested_nth;
-
#if KMP_USE_ADAPTIVE_LOCKS
// Parameters for the speculative lock backoff system.
@@ -3428,6 +3732,9 @@ extern void __kmp_warn(char const *format, ...);
extern void __kmp_set_num_threads(int new_nth, int gtid);
+extern bool __kmp_detect_shm();
+extern bool __kmp_detect_tmp();
+
// Returns current thread (pointer to kmp_info_t). Current thread *must* be
// registered.
static inline kmp_info_t *__kmp_entry_thread() {
@@ -3485,6 +3792,11 @@ extern void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL);
___kmp_thread_free((th), (ptr)KMP_SRC_LOC_CURR)
extern void __kmp_push_num_threads(ident_t *loc, int gtid, int num_threads);
+extern void __kmp_push_num_threads_list(ident_t *loc, int gtid,
+ kmp_uint32 list_length,
+ int *num_threads_list);
+extern void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
+ const char *msg);
extern void __kmp_push_proc_bind(ident_t *loc, int gtid,
kmp_proc_bind_t proc_bind);
@@ -3528,6 +3840,8 @@ extern void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid);
extern void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid);
extern void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid);
+extern void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid);
+
#ifdef KMP_GOMP_COMPAT
extern void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
@@ -3591,7 +3905,7 @@ extern void __kmp_check_stack_overlap(kmp_info_t *thr);
extern void __kmp_expand_host_name(char *buffer, size_t size);
extern void __kmp_expand_file_name(char *result, size_t rlen, char *pattern);
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64 || (KMP_OS_WINDOWS && KMP_ARCH_AARCH64)
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64 || (KMP_OS_WINDOWS && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM))
extern void
__kmp_initialize_system_tick(void); /* Initialize timer tick value */
#endif
@@ -3605,11 +3919,12 @@ extern char *__kmp_affinity_print_mask(char *buf, int buf_len,
kmp_affin_mask_t *mask);
extern kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
kmp_affin_mask_t *mask);
-extern void __kmp_affinity_initialize(void);
+extern void __kmp_affinity_initialize(kmp_affinity_t &affinity);
extern void __kmp_affinity_uninitialize(void);
extern void __kmp_affinity_set_init_mask(
int gtid, int isa_root); /* set affinity according to KMP_AFFINITY */
-extern void __kmp_affinity_set_place(int gtid);
+void __kmp_affinity_bind_init_mask(int gtid);
+extern void __kmp_affinity_bind_place(int gtid);
extern void __kmp_affinity_determine_capable(const char *env_var);
extern int __kmp_aux_set_affinity(void **mask);
extern int __kmp_aux_get_affinity(void **mask);
@@ -3618,18 +3933,25 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
-#if KMP_OS_LINUX || KMP_OS_FREEBSD
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_get_first_osid_with_ecore(void);
+#endif
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX
extern int kmp_set_thread_affinity_mask_initial(void);
#endif
static inline void __kmp_assign_root_init_mask() {
int gtid = __kmp_entry_gtid();
kmp_root_t *r = __kmp_threads[gtid]->th.th_root;
if (r->r.r_uber_thread == __kmp_threads[gtid] && !r->r.r_affinity_assigned) {
- __kmp_affinity_set_init_mask(gtid, TRUE);
+ __kmp_affinity_set_init_mask(gtid, /*isa_root=*/TRUE);
+ __kmp_affinity_bind_init_mask(gtid);
r->r.r_affinity_assigned = TRUE;
}
}
static inline void __kmp_reset_root_init_mask(int gtid) {
+ if (!KMP_AFFINITY_CAPABLE())
+ return;
kmp_info_t *th = __kmp_threads[gtid];
kmp_root_t *r = th->th.th_root;
if (r->r.r_uber_thread == th && r->r.r_affinity_assigned) {
@@ -3816,9 +4138,10 @@ extern void __kmp_fulfill_event(kmp_event_t *event);
extern void __kmp_free_task_team(kmp_info_t *thread,
kmp_task_team_t *task_team);
extern void __kmp_reap_task_teams(void);
+extern void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team);
+extern void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team);
extern void __kmp_wait_to_unref_task_teams(void);
-extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team,
- int always);
+extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team);
extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team);
extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
#if USE_ITT_BUILD
@@ -3829,6 +4152,14 @@ extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
int wait = 1);
extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread,
int gtid);
+#if KMP_DEBUG
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) \
+ KMP_DEBUG_ASSERT( \
+ __kmp_tasking_mode != tskm_task_teams || team->t.t_nproc == 1 || \
+ thr->th.th_task_team == team->t.t_task_team[thr->th.th_task_state])
+#else
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) /* Nothing */
+#endif
extern int __kmp_is_address_mapped(void *addr);
extern kmp_uint64 __kmp_hardware_timestamp(void);
@@ -3874,6 +4205,9 @@ KMP_EXPORT kmp_int32 __kmpc_bound_num_threads(ident_t *);
KMP_EXPORT kmp_int32 __kmpc_ok_to_fork(ident_t *);
KMP_EXPORT void __kmpc_fork_call(ident_t *, kmp_int32 nargs,
kmpc_micro microtask, ...);
+KMP_EXPORT void __kmpc_fork_call_if(ident_t *loc, kmp_int32 nargs,
+ kmpc_micro microtask, kmp_int32 cond,
+ void *args);
KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid);
@@ -3946,7 +4280,6 @@ KMP_EXPORT void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
KMP_EXPORT kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *new_task);
KMP_EXPORT kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid);
-
KMP_EXPORT kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid,
int end_part);
@@ -3965,11 +4298,25 @@ KMP_EXPORT kmp_int32 __kmpc_omp_task_with_deps(
ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 ndeps,
kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
kmp_depend_info_t *noalias_dep_list);
+
+KMP_EXPORT kmp_base_depnode_t *__kmpc_task_get_depnode(kmp_task_t *task);
+
+KMP_EXPORT kmp_depnode_list_t *__kmpc_task_get_successors(kmp_task_t *task);
+
KMP_EXPORT void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid,
kmp_int32 ndeps,
kmp_depend_info_t *dep_list,
kmp_int32 ndeps_noalias,
kmp_depend_info_t *noalias_dep_list);
+/* __kmpc_omp_taskwait_deps_51 : Function for OpenMP 5.1 nowait clause.
+ * Placeholder for taskwait with nowait clause.*/
+KMP_EXPORT void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_int32 ndeps,
+ kmp_depend_info_t *dep_list,
+ kmp_int32 ndeps_noalias,
+ kmp_depend_info_t *noalias_dep_list,
+ kmp_int32 has_no_wait);
+
extern kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
bool serialize_immediate);
@@ -4011,6 +4358,10 @@ KMP_EXPORT int __kmp_get_max_teams(void);
KMP_EXPORT void __kmp_set_teams_thread_limit(int limit);
KMP_EXPORT int __kmp_get_teams_thread_limit(void);
+/* Interface target task integration */
+KMP_EXPORT void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid);
+KMP_EXPORT bool __kmpc_omp_has_task_team(kmp_int32 gtid);
+
/* Lock interface routines (fast versions with gtid passed in) */
KMP_EXPORT void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid,
void **user_lock);
@@ -4037,6 +4388,20 @@ KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
void **user_lock,
uintptr_t hint);
+#if OMPX_TASKGRAPH
+// Taskgraph's Record & Replay mechanism
+// __kmp_tdg_is_recording: check whether a given TDG is recording
+// status: the tdg's current status
+static inline bool __kmp_tdg_is_recording(kmp_tdg_status_t status) {
+ return status == KMP_TDG_RECORDING;
+}
+
+KMP_EXPORT kmp_int32 __kmpc_start_record_task(ident_t *loc, kmp_int32 gtid,
+ kmp_int32 input_flags,
+ kmp_int32 tdg_id);
+KMP_EXPORT void __kmpc_end_record_task(ident_t *loc, kmp_int32 gtid,
+ kmp_int32 input_flags, kmp_int32 tdg_id);
+#endif
/* Interface to fast scalable reduce methods routines */
KMP_EXPORT kmp_int32 __kmpc_reduce_nowait(
@@ -4072,12 +4437,26 @@ KMP_EXPORT kmp_int32 __kmpc_in_parallel(ident_t *loc);
KMP_EXPORT void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_threads);
+KMP_EXPORT void __kmpc_push_num_threads_strict(ident_t *loc,
+ kmp_int32 global_tid,
+ kmp_int32 num_threads,
+ int severity,
+ const char *message);
+
+KMP_EXPORT void __kmpc_push_num_threads_list(ident_t *loc, kmp_int32 global_tid,
+ kmp_uint32 list_length,
+ kmp_int32 *num_threads_list);
+KMP_EXPORT void __kmpc_push_num_threads_list_strict(
+ ident_t *loc, kmp_int32 global_tid, kmp_uint32 list_length,
+ kmp_int32 *num_threads_list, int severity, const char *message);
KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
int proc_bind);
KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_teams,
kmp_int32 num_threads);
+KMP_EXPORT void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 thread_limit);
/* Function for OpenMP 5.1 num_teams clause */
KMP_EXPORT void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_teams_lb,
@@ -4103,13 +4482,6 @@ KMP_EXPORT void *__kmpc_threadprivate_cached(ident_t *loc, kmp_int32 global_tid,
void *data, size_t size,
void ***cache);
-// Symbols for MS mutual detection.
-extern int _You_must_link_with_exactly_one_OpenMP_library;
-extern int _You_must_link_with_Intel_OpenMP_library;
-#if KMP_OS_WINDOWS && (KMP_VERSION_MAJOR > 4)
-extern int _You_must_link_with_Microsoft_OpenMP_library;
-#endif
-
// The routines below are not exported.
// Consider making them 'static' in corresponding source files.
void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr,
@@ -4180,7 +4552,8 @@ extern int __kmpc_get_target_offload();
typedef enum kmp_pause_status_t {
kmp_not_paused = 0, // status is not paused, or, requesting resume
kmp_soft_paused = 1, // status is soft-paused, or, requesting soft pause
- kmp_hard_paused = 2 // status is hard-paused, or, requesting hard pause
+ kmp_hard_paused = 2, // status is hard-paused, or, requesting hard pause
+ kmp_stop_tool_paused = 3 // requesting stop_tool pause
} kmp_pause_status_t;
// This stores the pause state of the runtime
@@ -4234,6 +4607,9 @@ extern void __kmp_hidden_helper_main_thread_release();
#define KMP_HIDDEN_HELPER_WORKER_THREAD(gtid) \
((gtid) > 1 && (gtid) <= __kmp_hidden_helper_threads_num)
+#define KMP_HIDDEN_HELPER_MAIN_THREAD(gtid) \
+ ((gtid) == 1 && (gtid) <= __kmp_hidden_helper_threads_num)
+
#define KMP_HIDDEN_HELPER_TEAM(team) \
(team->t.t_threads[0] == __kmp_hidden_helper_main_thread)
@@ -4356,6 +4732,8 @@ public:
: f(nullptr) {
open(filename, mode, env_var);
}
+ kmp_safe_raii_file_t(const kmp_safe_raii_file_t &other) = delete;
+ kmp_safe_raii_file_t &operator=(const kmp_safe_raii_file_t &other) = delete;
~kmp_safe_raii_file_t() { close(); }
/// Open filename using mode. This is automatically closed in the destructor.
diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
index b9a8d49d8da..624fb3b0761 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
@@ -38,6 +38,43 @@ static hierarchy_info machine_hierarchy;
void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
+#if KMP_AFFINITY_SUPPORTED
+// Helper class to see if place lists further restrict the fullMask
+class kmp_full_mask_modifier_t {
+ kmp_affin_mask_t *mask;
+
+public:
+ kmp_full_mask_modifier_t() {
+ KMP_CPU_ALLOC(mask);
+ KMP_CPU_ZERO(mask);
+ }
+ ~kmp_full_mask_modifier_t() {
+ KMP_CPU_FREE(mask);
+ mask = nullptr;
+ }
+ void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); }
+ // If the new full mask is different from the current full mask,
+ // then switch them. Returns true if full mask was affected, false otherwise.
+ bool restrict_to_mask() {
+ // See if the new mask further restricts or changes the full mask
+ if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask))
+ return false;
+ return __kmp_topology->restrict_to_mask(mask);
+ }
+};
+
+static inline const char *
+__kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
+ bool for_binding = false) {
+ if (affinity.flags.omp_places) {
+ if (for_binding)
+ return "OMP_PROC_BIND";
+ return "OMP_PLACES";
+ }
+ return affinity.env_var;
+}
+#endif // KMP_AFFINITY_SUPPORTED
+
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
kmp_uint32 depth;
// The test below is true if affinity is available, but set to "none". Need to
@@ -90,8 +127,12 @@ const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
case KMP_HW_PROC_GROUP:
return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
+ case KMP_HW_UNKNOWN:
+ case KMP_HW_LAST:
+ return KMP_I18N_STR(Unknown);
}
- return KMP_I18N_STR(Unknown);
+ KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
}
const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
@@ -120,13 +161,18 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
return ((plural) ? "threads" : "thread");
case KMP_HW_PROC_GROUP:
return ((plural) ? "proc_groups" : "proc_group");
+ case KMP_HW_UNKNOWN:
+ case KMP_HW_LAST:
+ return ((plural) ? "unknowns" : "unknown");
}
- return ((plural) ? "unknowns" : "unknown");
+ KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
}
const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
switch (type) {
case KMP_HW_CORE_TYPE_UNKNOWN:
+ case KMP_HW_MAX_NUM_CORE_TYPES:
return "unknown";
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
case KMP_HW_CORE_TYPE_ATOM:
@@ -135,19 +181,19 @@ const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
return "Intel(R) Core(TM) processor";
#endif
}
- return "unknown";
+ KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
}
#if KMP_AFFINITY_SUPPORTED
// If affinity is supported, check the affinity
// verbose and warning flags before printing warning
-#define KMP_AFF_WARNING(...) \
- if (__kmp_affinity_verbose || \
- (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { \
+#define KMP_AFF_WARNING(s, ...) \
+ if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \
KMP_WARNING(__VA_ARGS__); \
}
#else
-#define KMP_AFF_WARNING KMP_WARNING
+#define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
#endif
////////////////////////////////////////////////////////////////////////////////
@@ -157,7 +203,26 @@ int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
int depth = __kmp_topology->get_depth();
for (int level = 0; level < depth; ++level) {
- if (ahwthread->ids[level] < bhwthread->ids[level])
+ // Reverse sort (higher efficiencies earlier in list) cores by core
+ // efficiency if available.
+ if (__kmp_is_hybrid_cpu() &&
+ __kmp_topology->get_type(level) == KMP_HW_CORE &&
+ ahwthread->attrs.is_core_eff_valid() &&
+ bhwthread->attrs.is_core_eff_valid()) {
+ if (ahwthread->attrs.get_core_eff() < bhwthread->attrs.get_core_eff())
+ return 1;
+ if (ahwthread->attrs.get_core_eff() > bhwthread->attrs.get_core_eff())
+ return -1;
+ }
+ if (ahwthread->ids[level] == bhwthread->ids[level])
+ continue;
+ // If the hardware id is unknown for this level, then place hardware thread
+ // further down in the sorted list as it should take last priority
+ if (ahwthread->ids[level] == UNKNOWN_ID)
+ return 1;
+ else if (bhwthread->ids[level] == UNKNOWN_ID)
+ return -1;
+ else if (ahwthread->ids[level] < bhwthread->ids[level])
return -1;
else if (ahwthread->ids[level] > bhwthread->ids[level])
return 1;
@@ -175,9 +240,10 @@ int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
int depth = __kmp_topology->get_depth();
- KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
- KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
- for (i = 0; i < __kmp_affinity_compact; i++) {
+ int compact = __kmp_topology->compact;
+ KMP_DEBUG_ASSERT(compact >= 0);
+ KMP_DEBUG_ASSERT(compact <= depth);
+ for (i = 0; i < compact; i++) {
int j = depth - i - 1;
if (aa->sub_ids[j] < bb->sub_ids[j])
return -1;
@@ -185,7 +251,7 @@ int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
return 1;
}
for (; i < depth; i++) {
- int j = i - __kmp_affinity_compact;
+ int j = i - compact;
if (aa->sub_ids[j] < bb->sub_ids[j])
return -1;
if (aa->sub_ids[j] > bb->sub_ids[j])
@@ -199,7 +265,7 @@ void kmp_hw_thread_t::print() const {
int depth = __kmp_topology->get_depth();
printf("%4d ", os_id);
for (int i = 0; i < depth; ++i) {
- printf("%4d ", ids[i]);
+ printf("%4d (%d) ", ids[i], sub_ids[i]);
}
if (attrs) {
if (attrs.is_core_type_valid())
@@ -207,6 +273,8 @@ void kmp_hw_thread_t::print() const {
if (attrs.is_core_eff_valid())
printf(" (eff=%d)", attrs.get_core_eff());
}
+ if (leader)
+ printf(" (leader)");
printf("\n");
}
@@ -215,7 +283,7 @@ void kmp_hw_thread_t::print() const {
// Add a layer to the topology based on the ids. Assume the topology
// is perfectly nested (i.e., so no object has more than one parent)
-void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
+void kmp_topology_t::insert_layer(kmp_hw_t type, const int *ids) {
// Figure out where the layer should go by comparing the ids of the current
// layers with the new ids
int target_layer;
@@ -276,8 +344,11 @@ void kmp_topology_t::_insert_windows_proc_groups() {
ids[i] = __kmp_get_proc_group(mask);
}
KMP_CPU_FREE(mask);
- _insert_layer(KMP_HW_PROC_GROUP, ids);
+ insert_layer(KMP_HW_PROC_GROUP, ids);
__kmp_free(ids);
+
+ // sort topology after adding proc groups
+ __kmp_topology->sort_ids();
}
#endif
@@ -413,10 +484,13 @@ void kmp_topology_t::_gather_enumeration_information() {
int id = hw_thread.ids[layer];
if (id != previous_id[layer]) {
// Add an additional increment to each count
- for (int l = layer; l < depth; ++l)
- count[l]++;
+ for (int l = layer; l < depth; ++l) {
+ if (hw_thread.ids[l] != kmp_hw_thread_t::UNKNOWN_ID)
+ count[l]++;
+ }
// Keep track of topology layer ratio statistics
- max[layer]++;
+ if (hw_thread.ids[layer] != kmp_hw_thread_t::UNKNOWN_ID)
+ max[layer]++;
for (int l = layer + 1; l < depth; ++l) {
if (max[l] > ratio[l])
ratio[l] = max[l];
@@ -584,6 +658,7 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
retval->count = arr + 2 * (size_t)KMP_HW_LAST;
retval->num_core_efficiencies = 0;
retval->num_core_types = 0;
+ retval->compact = 0;
for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
@@ -674,7 +749,11 @@ void kmp_topology_t::print(const char *env_var) const {
kmp_hw_t print_types[KMP_HW_LAST + 2];
// Num Available Threads
- KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
+ if (num_hw_threads) {
+ KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
+ } else {
+ KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc);
+ }
// Uniform or not
if (is_uniform()) {
@@ -776,6 +855,8 @@ void kmp_topology_t::print(const char *env_var) const {
for (int i = 0; i < num_hw_threads; i++) {
__kmp_str_buf_clear(&buf);
for (int level = 0; level < depth; ++level) {
+ if (hw_threads[i].ids[level] == kmp_hw_thread_t::UNKNOWN_ID)
+ continue;
kmp_hw_t type = types[level];
__kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
__kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
@@ -790,41 +871,45 @@ void kmp_topology_t::print(const char *env_var) const {
__kmp_str_buf_free(&buf);
}
-void kmp_topology_t::canonicalize() {
-#if KMP_GROUP_AFFINITY
- _insert_windows_proc_groups();
-#endif
- _remove_radix1_layers();
- _gather_enumeration_information();
- _discover_uniformity();
- _set_sub_ids();
- _set_globals();
- _set_last_level_cache();
-
-#if KMP_MIC_SUPPORTED
- // Manually Add L2 = Tile equivalence
- if (__kmp_mic_type == mic3) {
- if (get_level(KMP_HW_L2) != -1)
- set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
- else if (get_level(KMP_HW_TILE) != -1)
- set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
- }
-#endif
-
- // Perform post canonicalization checking
- KMP_ASSERT(depth > 0);
- for (int level = 0; level < depth; ++level) {
- // All counts, ratios, and types must be valid
- KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
- KMP_ASSERT_VALID_HW_TYPE(types[level]);
- // Detected types must point to themselves
- KMP_ASSERT(equivalent[types[level]] == types[level]);
- }
-
#if KMP_AFFINITY_SUPPORTED
+void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {
+ const char *env_var = __kmp_get_affinity_env_var(affinity);
+  // If hybrid CPU attributes were requested for granularity (either OMP_PLACES
+  // or KMP_AFFINITY) but none exist, then reset the granularity and have the
+  // method below select a granularity and warn the user.
+ if (!__kmp_is_hybrid_cpu()) {
+ if (affinity.core_attr_gran.valid) {
+ // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
+ // instead
+ KMP_AFF_WARNING(
+ affinity, AffIgnoringNonHybrid, env_var,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
+ affinity.gran = KMP_HW_CORE;
+ affinity.gran_levels = -1;
+ affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
+ affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
+ } else if (affinity.flags.core_types_gran ||
+ affinity.flags.core_effs_gran) {
+ // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
+ if (affinity.flags.omp_places) {
+ KMP_AFF_WARNING(
+ affinity, AffIgnoringNonHybrid, env_var,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
+ } else {
+ // KMP_AFFINITY=granularity=core_type|core_eff,...
+ KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
+ "Intel(R) Hybrid Technology core attribute",
+ __kmp_hw_get_catalog_string(KMP_HW_CORE));
+ }
+ affinity.gran = KMP_HW_CORE;
+ affinity.gran_levels = -1;
+ affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
+ affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
+ }
+ }
// Set the number of affinity granularity levels
- if (__kmp_affinity_gran_levels < 0) {
- kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
+ if (affinity.gran_levels < 0) {
+ kmp_hw_t gran_type = get_equivalent_type(affinity.gran);
// Check if user's granularity request is valid
if (gran_type == KMP_HW_UNKNOWN) {
// First try core, then thread, then package
@@ -837,10 +922,10 @@ void kmp_topology_t::canonicalize() {
}
KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
// Warn user what granularity setting will be used instead
- KMP_AFF_WARNING(AffGranularityBad, "KMP_AFFINITY",
- __kmp_hw_get_catalog_string(__kmp_affinity_gran),
+ KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
+ __kmp_hw_get_catalog_string(affinity.gran),
__kmp_hw_get_catalog_string(gran_type));
- __kmp_affinity_gran = gran_type;
+ affinity.gran = gran_type;
}
#if KMP_GROUP_AFFINITY
// If more than one processor group exists, and the level of
@@ -855,17 +940,49 @@ void kmp_topology_t::canonicalize() {
int proc_group_depth = get_level(KMP_HW_PROC_GROUP);
if (gran_depth >= 0 && proc_group_depth >= 0 &&
gran_depth < proc_group_depth) {
- KMP_AFF_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY",
- __kmp_hw_get_catalog_string(__kmp_affinity_gran));
- __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP;
+ KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var,
+ __kmp_hw_get_catalog_string(affinity.gran));
+ affinity.gran = gran_type = KMP_HW_PROC_GROUP;
}
}
#endif
- __kmp_affinity_gran_levels = 0;
+ affinity.gran_levels = 0;
for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
- __kmp_affinity_gran_levels++;
+ affinity.gran_levels++;
+ }
+}
+#endif
+
+void kmp_topology_t::canonicalize() {
+#if KMP_GROUP_AFFINITY
+ _insert_windows_proc_groups();
+#endif
+ _remove_radix1_layers();
+ _gather_enumeration_information();
+ _discover_uniformity();
+ _set_sub_ids();
+ _set_globals();
+ _set_last_level_cache();
+
+#if KMP_MIC_SUPPORTED
+ // Manually Add L2 = Tile equivalence
+ if (__kmp_mic_type == mic3) {
+ if (get_level(KMP_HW_L2) != -1)
+ set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
+ else if (get_level(KMP_HW_TILE) != -1)
+ set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
+ }
+#endif
+
+ // Perform post canonicalization checking
+ KMP_ASSERT(depth > 0);
+ for (int level = 0; level < depth; ++level) {
+ // All counts, ratios, and types must be valid
+ KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
+ KMP_ASSERT_VALID_HW_TYPE(types[level]);
+ // Detected types must point to themselves
+ KMP_ASSERT(equivalent[types[level]] == types[level]);
}
-#endif // KMP_AFFINITY_SUPPORTED
}
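
The gran_levels computation at the end of set_granularity() simply counts how many topology levels lie below the requested granularity type. A minimal sketch of that counting step, with illustrative names and a plain enum in place of kmp_hw_t:

#include <cassert>

enum hw_level_t { HW_SOCKET, HW_CORE, HW_THREAD };

// Count the levels strictly below the granularity type, walking up from the
// innermost level, as done for affinity.gran_levels above.
static int count_gran_levels(const hw_level_t *types, int depth,
                             hw_level_t gran_type) {
  int gran_levels = 0;
  for (int i = depth - 1; i >= 0 && types[i] != gran_type; --i)
    gran_levels++;
  assert(gran_levels < depth && "granularity type not found in topology");
  return gran_levels;
}

int main() {
  const hw_level_t types[] = {HW_SOCKET, HW_CORE, HW_THREAD};
  return count_gran_levels(types, 3, HW_CORE) == 1 ? 0 : 1;
}
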
// Canonicalize an explicit packages X cores/pkg X threads/core topology
@@ -894,41 +1011,7 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
_discover_uniformity();
}
-// Represents running sub IDs for a single core attribute where
-// attribute values have SIZE possibilities.
-template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
- int last_level; // last level in topology to consider for sub_ids
- int sub_id[SIZE]; // The sub ID for a given attribute value
- int prev_sub_id[KMP_HW_LAST];
- IndexFunc indexer;
-
-public:
- kmp_sub_ids_t(int last_level) : last_level(last_level) {
- KMP_ASSERT(last_level < KMP_HW_LAST);
- for (size_t i = 0; i < SIZE; ++i)
- sub_id[i] = -1;
- for (size_t i = 0; i < KMP_HW_LAST; ++i)
- prev_sub_id[i] = -1;
- }
- void update(const kmp_hw_thread_t &hw_thread) {
- int idx = indexer(hw_thread);
- KMP_ASSERT(idx < (int)SIZE);
- for (int level = 0; level <= last_level; ++level) {
- if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
- if (level < last_level)
- sub_id[idx] = -1;
- sub_id[idx]++;
- break;
- }
- }
- for (int level = 0; level <= last_level; ++level)
- prev_sub_id[level] = hw_thread.sub_ids[level];
- }
- int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
- return sub_id[indexer(hw_thread)];
- }
-};
-
+#if KMP_AFFINITY_SUPPORTED
static kmp_str_buf_t *
__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
bool plural) {
@@ -944,6 +1027,41 @@ __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
return buf;
}
+bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {
+ // Apply the filter
+ bool affected;
+ int new_index = 0;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ int os_id = hw_threads[i].os_id;
+ if (KMP_CPU_ISSET(os_id, mask)) {
+ if (i != new_index)
+ hw_threads[new_index] = hw_threads[i];
+ new_index++;
+ } else {
+ KMP_CPU_CLR(os_id, __kmp_affin_fullMask);
+ __kmp_avail_proc--;
+ }
+ }
+
+ KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
+ affected = (num_hw_threads != new_index);
+ num_hw_threads = new_index;
+
+ // Post hardware subset canonicalization
+ if (affected) {
+ _gather_enumeration_information();
+ _discover_uniformity();
+ _set_globals();
+ _set_last_level_cache();
+#if KMP_OS_WINDOWS
+ // Copy filtered full mask if topology has single processor group
+ if (__kmp_num_proc_groups <= 1)
+#endif
+ __kmp_affin_origMask->copy(__kmp_affin_fullMask);
+ }
+ return affected;
+}
+
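
restrict_to_mask() is an in-place compaction of the hardware-thread array followed by re-derivation of the summary data. A standalone sketch of the compaction part, with std::vector and std::set standing in for the runtime's array and affinity mask; the factoring presumably lets KMP_HW_SUBSET filtering and other mask-based restrictions share one code path.

#include <set>
#include <vector>

struct hw_thread_t {
  int os_id;
  // topology ids omitted for brevity
};

// Keep only threads whose os_id is in `allowed`; returns true if anything
// was removed (mirrors the "affected" flag above).
static bool restrict_to_mask(std::vector<hw_thread_t> &threads,
                             const std::set<int> &allowed) {
  size_t new_index = 0;
  for (size_t i = 0; i < threads.size(); ++i) {
    if (allowed.count(threads[i].os_id)) {
      if (i != new_index)
        threads[new_index] = threads[i];
      ++new_index;
    }
  }
  bool affected = (new_index != threads.size());
  threads.resize(new_index);
  return affected;
}
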
// Apply the KMP_HW_SUBSET environment variable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
@@ -955,9 +1073,12 @@ bool kmp_topology_t::filter_hw_subset() {
// First, sort the KMP_HW_SUBSET items by the machine topology
__kmp_hw_subset->sort();
+ __kmp_hw_subset->canonicalize(__kmp_topology);
+
// Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
bool using_core_types = false;
bool using_core_effs = false;
+ bool is_absolute = __kmp_hw_subset->is_absolute();
int hw_subset_depth = __kmp_hw_subset->get_depth();
kmp_hw_t specified[KMP_HW_LAST];
int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
@@ -978,7 +1099,7 @@ bool kmp_topology_t::filter_hw_subset() {
if (equivalent_type != KMP_HW_UNKNOWN) {
__kmp_hw_subset->at(i).type = equivalent_type;
} else {
- KMP_AFF_WARNING(AffHWSubsetNotExistGeneric,
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric,
__kmp_hw_get_catalog_string(type));
return false;
}
@@ -986,7 +1107,8 @@ bool kmp_topology_t::filter_hw_subset() {
// Check to see if current layer has already been
// specified either directly or through an equivalent type
if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
- KMP_AFF_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers,
+ __kmp_hw_get_catalog_string(type),
__kmp_hw_get_catalog_string(specified[equivalent_type]));
return false;
}
@@ -994,12 +1116,14 @@ bool kmp_topology_t::filter_hw_subset() {
// Check to see if each layer's num & offset parameters are valid
max_count = get_ratio(level);
- if (max_count < 0 ||
- (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
- bool plural = (num > 1);
- KMP_AFF_WARNING(AffHWSubsetManyGeneric,
- __kmp_hw_get_catalog_string(type, plural));
- return false;
+ if (!is_absolute) {
+ if (max_count < 0 ||
+ (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
+ bool plural = (num > 1);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
+ __kmp_hw_get_catalog_string(type, plural));
+ return false;
+ }
}
// Check to see if core attributes are consistent
@@ -1020,21 +1144,24 @@ bool kmp_topology_t::filter_hw_subset() {
if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
if (item.num_attrs == 1) {
if (using_core_effs) {
- KMP_AFF_WARNING(AffHWSubsetIgnoringAttr, "efficiency");
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
+ "efficiency");
} else {
- KMP_AFF_WARNING(AffHWSubsetIgnoringAttr, "core_type");
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
+ "core_type");
}
using_core_effs = false;
using_core_types = false;
} else {
- KMP_AFF_WARNING(AffHWSubsetAttrsNonHybrid);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid);
return false;
}
}
// Check if using both core types and core efficiencies together
if (using_core_types && using_core_effs) {
- KMP_AFF_WARNING(AffHWSubsetIncompat, "core_type", "efficiency");
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type",
+ "efficiency");
return false;
}
@@ -1059,7 +1186,7 @@ bool kmp_topology_t::filter_hw_subset() {
}
// Check that the number of requested cores with attributes is valid
- if (using_core_types || using_core_effs) {
+ if ((using_core_types || using_core_effs) && !is_absolute) {
for (int j = 0; j < item.num_attrs; ++j) {
int num = item.num[j];
int offset = item.offset[j];
@@ -1070,7 +1197,7 @@ bool kmp_topology_t::filter_hw_subset() {
(num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
kmp_str_buf_t buf;
__kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
- KMP_AFF_WARNING(AffHWSubsetManyGeneric, buf.str);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str);
__kmp_str_buf_free(&buf);
return false;
}
@@ -1092,7 +1219,7 @@ bool kmp_topology_t::filter_hw_subset() {
}
kmp_str_buf_t buf;
__kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
- KMP_AFF_WARNING(AffHWSubsetIncompat,
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat,
__kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
__kmp_str_buf_free(&buf);
return false;
@@ -1105,7 +1232,7 @@ bool kmp_topology_t::filter_hw_subset() {
kmp_str_buf_t buf;
__kmp_hw_get_catalog_core_string(item.attr[j], &buf,
item.num[j] > 0);
- KMP_AFF_WARNING(AffHWSubsetAttrRepeat, buf.str);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str);
__kmp_str_buf_free(&buf);
return false;
}
@@ -1115,43 +1242,92 @@ bool kmp_topology_t::filter_hw_subset() {
}
}
- struct core_type_indexer {
- int operator()(const kmp_hw_thread_t &t) const {
- switch (t.attrs.get_core_type()) {
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
- case KMP_HW_CORE_TYPE_ATOM:
- return 1;
- case KMP_HW_CORE_TYPE_CORE:
- return 2;
-#endif
- case KMP_HW_CORE_TYPE_UNKNOWN:
- return 0;
- }
- KMP_ASSERT(0);
- return 0;
+ // For keeping track of sub_ids for an absolute KMP_HW_SUBSET
+ // or core attributes (core type or efficiency)
+ int prev_sub_ids[KMP_HW_LAST];
+ int abs_sub_ids[KMP_HW_LAST];
+ int core_eff_sub_ids[KMP_HW_MAX_NUM_CORE_EFFS];
+ int core_type_sub_ids[KMP_HW_MAX_NUM_CORE_TYPES];
+ for (size_t i = 0; i < KMP_HW_LAST; ++i) {
+ abs_sub_ids[i] = -1;
+ prev_sub_ids[i] = -1;
+ }
+ for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_EFFS; ++i)
+ core_eff_sub_ids[i] = -1;
+ for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
+ core_type_sub_ids[i] = -1;
+
+ // Determine which hardware threads should be filtered.
+
+ // Helpful to determine if a topology layer is targeted by an absolute subset
+ auto is_targeted = [&](int level) {
+ if (is_absolute) {
+ for (int i = 0; i < hw_subset_depth; ++i)
+ if (topology_levels[i] == level)
+ return true;
+ return false;
}
+ // If not absolute KMP_HW_SUBSET, then every layer is seen as targeted
+ return true;
};
- struct core_eff_indexer {
- int operator()(const kmp_hw_thread_t &t) const {
- return t.attrs.get_core_eff();
+
+ // Helpful to index into core type sub Ids array
+ auto get_core_type_index = [](const kmp_hw_thread_t &t) {
+ switch (t.attrs.get_core_type()) {
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ case KMP_HW_MAX_NUM_CORE_TYPES:
+ return 0;
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return 1;
+ case KMP_HW_CORE_TYPE_CORE:
+ return 2;
+#endif
}
+ KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
};
- kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids(
- core_level);
- kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids(
- core_level);
+ // Helpful to index into core efficiencies sub Ids array
+ auto get_core_eff_index = [](const kmp_hw_thread_t &t) {
+ return t.attrs.get_core_eff();
+ };
- // Determine which hardware threads should be filtered.
int num_filtered = 0;
- bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads);
+ kmp_affin_mask_t *filtered_mask;
+ KMP_CPU_ALLOC(filtered_mask);
+ KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
for (int i = 0; i < num_hw_threads; ++i) {
kmp_hw_thread_t &hw_thread = hw_threads[i];
- // Update type_sub_id
- if (using_core_types)
- core_type_sub_ids.update(hw_thread);
- if (using_core_effs)
- core_eff_sub_ids.update(hw_thread);
+
+ // Figure out the absolute sub ids and core eff/type sub ids
+ if (is_absolute || using_core_effs || using_core_types) {
+ for (int level = 0; level < get_depth(); ++level) {
+ if (hw_thread.sub_ids[level] != prev_sub_ids[level]) {
+ bool found_targeted = false;
+ for (int j = level; j < get_depth(); ++j) {
+ bool targeted = is_targeted(j);
+ if (!found_targeted && targeted) {
+ found_targeted = true;
+ abs_sub_ids[j]++;
+ if (j == core_level && using_core_effs)
+ core_eff_sub_ids[get_core_eff_index(hw_thread)]++;
+ if (j == core_level && using_core_types)
+ core_type_sub_ids[get_core_type_index(hw_thread)]++;
+ } else if (targeted) {
+ abs_sub_ids[j] = 0;
+ if (j == core_level && using_core_effs)
+ core_eff_sub_ids[get_core_eff_index(hw_thread)] = 0;
+ if (j == core_level && using_core_types)
+ core_type_sub_ids[get_core_type_index(hw_thread)] = 0;
+ }
+ }
+ break;
+ }
+ }
+ for (int level = 0; level < get_depth(); ++level)
+ prev_sub_ids[level] = hw_thread.sub_ids[level];
+ }
// Check to see if this hardware thread should be filtered
bool should_be_filtered = false;
@@ -1186,71 +1362,60 @@ bool kmp_topology_t::filter_hw_subset() {
int num = hw_subset_item.num[attr_idx];
int offset = hw_subset_item.offset[attr_idx];
if (using_core_types)
- sub_id = core_type_sub_ids.get_sub_id(hw_thread);
+ sub_id = core_type_sub_ids[get_core_type_index(hw_thread)];
else
- sub_id = core_eff_sub_ids.get_sub_id(hw_thread);
+ sub_id = core_eff_sub_ids[get_core_eff_index(hw_thread)];
if (sub_id < offset ||
(num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
should_be_filtered = true;
break;
}
} else {
+ int sub_id;
int num = hw_subset_item.num[0];
int offset = hw_subset_item.offset[0];
- if (hw_thread.sub_ids[level] < offset ||
- (num != kmp_hw_subset_t::USE_ALL &&
- hw_thread.sub_ids[level] >= offset + num)) {
+ if (is_absolute)
+ sub_id = abs_sub_ids[level];
+ else
+ sub_id = hw_thread.sub_ids[level];
+ if (hw_thread.ids[level] == kmp_hw_thread_t::UNKNOWN_ID ||
+ sub_id < offset ||
+ (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
should_be_filtered = true;
break;
}
}
}
// Collect filtering information
- filtered[i] = should_be_filtered;
- if (should_be_filtered)
+ if (should_be_filtered) {
+ KMP_CPU_CLR(hw_thread.os_id, filtered_mask);
num_filtered++;
+ }
}
// One last check that we shouldn't allow filtering entire machine
if (num_filtered == num_hw_threads) {
- KMP_AFF_WARNING(AffHWSubsetAllFiltered);
- __kmp_free(filtered);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
return false;
}
// Apply the filter
- int new_index = 0;
- for (int i = 0; i < num_hw_threads; ++i) {
- if (!filtered[i]) {
- if (i != new_index)
- hw_threads[new_index] = hw_threads[i];
- new_index++;
- } else {
-#if KMP_AFFINITY_SUPPORTED
- KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask);
-#endif
- __kmp_avail_proc--;
- }
- }
-
- KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
- num_hw_threads = new_index;
-
- // Post hardware subset canonicalization
- _gather_enumeration_information();
- _discover_uniformity();
- _set_globals();
- _set_last_level_cache();
- __kmp_free(filtered);
+ restrict_to_mask(filtered_mask);
return true;
}
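
The templated kmp_sub_ids_t helper removed above is replaced by plain running-counter arrays that advance when an outer level's sub id changes and restart the deeper levels. A simplified sketch of that bookkeeping with illustrative types; the real code additionally restricts updates to levels targeted by an absolute KMP_HW_SUBSET and keeps separate counters per core type and efficiency.

#include <vector>

// Running sub-ids per topology level: when a level's sub-id differs from the
// previous hardware thread, that level's counter advances and the deeper
// counters restart at zero (a simplified version of the abs_sub_ids logic).
struct running_sub_ids_t {
  std::vector<int> counters; // one running counter per level
  std::vector<int> prev;     // previous thread's sub-ids

  explicit running_sub_ids_t(int depth)
      : counters(depth, -1), prev(depth, -1) {}

  void update(const std::vector<int> &sub_ids) {
    int depth = (int)counters.size();
    for (int level = 0; level < depth; ++level) {
      if (sub_ids[level] != prev[level]) {
        counters[level]++;
        for (int l = level + 1; l < depth; ++l)
          counters[l] = 0;
        break;
      }
    }
    prev = sub_ids;
  }
};
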
-bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
+bool kmp_topology_t::is_close(int hwt1, int hwt2,
+ const kmp_affinity_t &stgs) const {
+ int hw_level = stgs.gran_levels;
if (hw_level >= depth)
return true;
bool retval = true;
const kmp_hw_thread_t &t1 = hw_threads[hwt1];
const kmp_hw_thread_t &t2 = hw_threads[hwt2];
+ if (stgs.flags.core_types_gran)
+ return t1.attrs.get_core_type() == t2.attrs.get_core_type();
+ if (stgs.flags.core_effs_gran)
+ return t1.attrs.get_core_eff() == t2.attrs.get_core_eff();
for (int i = 0; i < (depth - hw_level); ++i) {
if (t1.ids[i] != t2.ids[i])
return false;
@@ -1260,30 +1425,6 @@ bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
////////////////////////////////////////////////////////////////////////////////
-#if KMP_AFFINITY_SUPPORTED
-class kmp_affinity_raii_t {
- kmp_affin_mask_t *mask;
- bool restored;
-
-public:
- kmp_affinity_raii_t() : restored(false) {
- KMP_CPU_ALLOC(mask);
- KMP_ASSERT(mask != NULL);
- __kmp_get_system_affinity(mask, TRUE);
- }
- void restore() {
- __kmp_set_system_affinity(mask, TRUE);
- KMP_CPU_FREE(mask);
- restored = true;
- }
- ~kmp_affinity_raii_t() {
- if (!restored) {
- __kmp_set_system_affinity(mask, TRUE);
- KMP_CPU_FREE(mask);
- }
- }
-};
-
bool KMPAffinity::picked_api = false;
void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
@@ -1301,7 +1442,7 @@ void KMPAffinity::pick_api() {
// Only use Hwloc if affinity isn't explicitly disabled and
// user requests Hwloc topology method
if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
- __kmp_affinity_type != affinity_disabled) {
+ __kmp_affinity.type != affinity_disabled) {
affinity_dispatch = new KMPHwlocAffinity();
} else
#endif
@@ -1448,15 +1589,13 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
return buf;
}
-// Return (possibly empty) affinity mask representing the offline CPUs
-// Caller must free the mask
-kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
- kmp_affin_mask_t *offline;
- KMP_CPU_ALLOC(offline);
- KMP_CPU_ZERO(offline);
+static kmp_affin_mask_t *__kmp_parse_cpu_list(const char *path) {
+ kmp_affin_mask_t *mask;
+ KMP_CPU_ALLOC(mask);
+ KMP_CPU_ZERO(mask);
#if KMP_OS_LINUX
int n, begin_cpu, end_cpu;
- kmp_safe_raii_file_t offline_file;
+ kmp_safe_raii_file_t file;
auto skip_ws = [](FILE *f) {
int c;
do {
@@ -1465,29 +1604,29 @@ kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
if (c != EOF)
ungetc(c, f);
};
- // File contains CSV of integer ranges representing the offline CPUs
+ // File contains CSV of integer ranges representing the CPUs
// e.g., 1,2,4-7,9,11-15
- int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
+ int status = file.try_open(path, "r");
if (status != 0)
- return offline;
- while (!feof(offline_file)) {
- skip_ws(offline_file);
- n = fscanf(offline_file, "%d", &begin_cpu);
+ return mask;
+ while (!feof(file)) {
+ skip_ws(file);
+ n = fscanf(file, "%d", &begin_cpu);
if (n != 1)
break;
- skip_ws(offline_file);
- int c = fgetc(offline_file);
+ skip_ws(file);
+ int c = fgetc(file);
if (c == EOF || c == ',') {
// Just single CPU
end_cpu = begin_cpu;
} else if (c == '-') {
// Range of CPUs
- skip_ws(offline_file);
- n = fscanf(offline_file, "%d", &end_cpu);
+ skip_ws(file);
+ n = fscanf(file, "%d", &end_cpu);
if (n != 1)
break;
- skip_ws(offline_file);
- c = fgetc(offline_file); // skip ','
+ skip_ws(file);
+ c = fgetc(file); // skip ','
} else {
// Syntax problem
break;
@@ -1497,13 +1636,19 @@ kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
continue;
}
- // Insert [begin_cpu, end_cpu] into offline mask
+ // Insert [begin_cpu, end_cpu] into mask
for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
- KMP_CPU_SET(cpu, offline);
+ KMP_CPU_SET(cpu, mask);
}
}
#endif
- return offline;
+ return mask;
+}
+
+// Return (possibly empty) affinity mask representing the offline CPUs
+// Caller must free the mask
+kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
+ return __kmp_parse_cpu_list("/sys/devices/system/cpu/offline");
}
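
The offline-CPU reader is now a thin wrapper over a generic cpu-list parser, so the same code can read other sysfs lists such as core_siblings_list later in this file. A minimal standalone version of the range parsing ("1,2,4-7,9,11-15"), returning a std::set instead of an affinity mask; malformed input simply stops the parse, matching the lenient behavior above.

#include <cctype>
#include <cstdio>
#include <set>

static std::set<int> parse_cpu_list(FILE *f) {
  std::set<int> cpus;
  auto skip_ws = [](FILE *fp) {
    int c;
    do {
      c = fgetc(fp);
    } while (isspace(c));
    if (c != EOF)
      ungetc(c, fp);
  };
  while (!feof(f)) {
    int begin_cpu, end_cpu;
    skip_ws(f);
    if (fscanf(f, "%d", &begin_cpu) != 1)
      break;
    skip_ws(f);
    int c = fgetc(f);
    if (c == EOF || c == ',') {
      end_cpu = begin_cpu; // single CPU
    } else if (c == '-') {
      skip_ws(f);
      if (fscanf(f, "%d", &end_cpu) != 1)
        break;
      skip_ws(f);
      fgetc(f); // consume ',' (or EOF)
    } else {
      break; // syntax problem
    }
    for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu)
      cpus.insert(cpu);
  }
  return cpus;
}

A caller would open the file first, for example: if (FILE *f = fopen("/sys/devices/system/cpu/offline", "r")) { std::set<int> cpus = parse_cpu_list(f); fclose(f); } and a missing or empty file simply yields an empty set.
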
// Return the number of available procs
@@ -1592,6 +1737,7 @@ static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
case HWLOC_OBJ_PU:
return KMP_HW_THREAD;
case HWLOC_OBJ_GROUP:
+#if HWLOC_API_VERSION >= 0x00020000
if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
return KMP_HW_DIE;
else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
@@ -1600,6 +1746,7 @@ static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
return KMP_HW_MODULE;
else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
return KMP_HW_PROC_GROUP;
+#endif
return KMP_HW_UNKNOWN;
#if HWLOC_API_VERSION >= 0x00020100
case HWLOC_OBJ_DIE:
@@ -1663,14 +1810,14 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hwloc_topology_t tp = __kmp_hwloc_topology;
*msg_id = kmp_i18n_null;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
}
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
// available from hwloc on the current thread, and __kmp_xproc.
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
// hwloc only guarantees existence of PU object, so check PACKAGE and CORE
hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
if (o != NULL)
@@ -1682,6 +1829,8 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
__kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
else
__kmp_nThreadsPerCore = 1; // no CORE found
+ if (__kmp_nThreadsPerCore == 0)
+ __kmp_nThreadsPerCore = 1;
__kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
if (nCoresPerPkg == 0)
nCoresPerPkg = 1; // to prevent possible division by 0
@@ -1689,6 +1838,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
return true;
}
+#if HWLOC_API_VERSION >= 0x00020400
// Handle multiple types of cores if they exist on the system
int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
@@ -1727,19 +1877,14 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
}
}
}
+#endif
root = hwloc_get_root_obj(tp);
// Figure out the depth and types in the topology
depth = 0;
- pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
- KMP_ASSERT(pu);
- obj = pu;
- types[depth] = KMP_HW_THREAD;
- hwloc_types[depth] = obj->type;
- depth++;
- while (obj != root && obj != NULL) {
- obj = obj->parent;
+ obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
+ while (obj && obj != root) {
#if HWLOC_API_VERSION >= 0x00020000
if (obj->memory_arity) {
hwloc_obj_t memory;
@@ -1761,6 +1906,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hwloc_types[depth] = obj->type;
depth++;
}
+ obj = obj->parent;
}
KMP_ASSERT(depth > 0);
@@ -1787,7 +1933,9 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread.clear();
hw_thread.ids[index] = pu->logical_index;
hw_thread.os_id = pu->os_index;
+ hw_thread.original_idx = hw_thread_index;
// If multiple core types, then set that attribute for the hardware thread
+#if HWLOC_API_VERSION >= 0x00020400
if (cpukinds) {
int cpukind_index = -1;
for (int i = 0; i < nr_cpu_kinds; ++i) {
@@ -1801,6 +1949,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
}
}
+#endif
index--;
}
obj = pu;
@@ -1825,7 +1974,6 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread.ids[index + 1] = sub_id;
index--;
}
- prev = memory;
}
prev = obj;
}
@@ -1845,12 +1993,14 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread_index++;
}
+#if HWLOC_API_VERSION >= 0x00020400
// Free the core types information
if (cpukinds) {
for (int idx = 0; idx < nr_cpu_kinds; ++idx)
hwloc_bitmap_free(cpukinds[idx].mask);
__kmp_free(cpukinds);
}
+#endif
__kmp_topology->sort_ids();
return true;
}
@@ -1864,15 +2014,15 @@ static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
int depth = 3;
kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
}
- // Even if __kmp_affinity_type == affinity_none, this routine might still
- // called to set __kmp_ncores, as well as
+ // Even if __kmp_affinity.type == affinity_none, this routine might still
+ // be called to set __kmp_ncores, as well as
// __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
if (!KMP_AFFINITY_CAPABLE()) {
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
__kmp_ncores = nPackages = __kmp_xproc;
__kmp_nThreadsPerCore = nCoresPerPkg = 1;
return true;
@@ -1897,12 +2047,13 @@ static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
hw_thread.clear();
hw_thread.os_id = i;
+ hw_thread.original_idx = avail_ct;
hw_thread.ids[0] = i;
hw_thread.ids[1] = 0;
hw_thread.ids[2] = 0;
avail_ct++;
}
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
}
return true;
@@ -1919,13 +2070,13 @@ static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
}
// If we aren't affinity capable, then use flat topology
if (!KMP_AFFINITY_CAPABLE()) {
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
nPackages = __kmp_num_proc_groups;
__kmp_nThreadsPerCore = 1;
__kmp_ncores = __kmp_xproc;
@@ -1942,11 +2093,13 @@ static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
continue;
}
- kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
hw_thread.clear();
hw_thread.os_id = i;
+ hw_thread.original_idx = avail_ct;
hw_thread.ids[0] = i / BITS_PER_GROUP;
hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
+ avail_ct++;
}
return true;
}
@@ -2002,15 +2155,43 @@ static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
return 0;
}
-class kmp_cache_info_t {
+class cpuid_cache_info_t {
public:
struct info_t {
- unsigned level, mask;
+ unsigned level = 0;
+ unsigned mask = 0;
+ bool operator==(const info_t &rhs) const {
+ return level == rhs.level && mask == rhs.mask;
+ }
+ bool operator!=(const info_t &rhs) const { return !operator==(rhs); }
};
- kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
+ cpuid_cache_info_t() : depth(0) {
+ table[MAX_CACHE_LEVEL].level = 0;
+ table[MAX_CACHE_LEVEL].mask = 0;
+ }
size_t get_depth() const { return depth; }
info_t &operator[](size_t index) { return table[index]; }
const info_t &operator[](size_t index) const { return table[index]; }
+ bool operator==(const cpuid_cache_info_t &rhs) const {
+ if (rhs.depth != depth)
+ return false;
+ for (size_t i = 0; i < depth; ++i)
+ if (table[i] != rhs.table[i])
+ return false;
+ return true;
+ }
+ bool operator!=(const cpuid_cache_info_t &rhs) const {
+ return !operator==(rhs);
+ }
+  // Get cache information associated with the L1, L2, L3 cache, etc.
+  // If the level does not exist, then return the "NULL" level (level 0)
+ const info_t &get_level(unsigned level) const {
+ for (size_t i = 0; i < depth; ++i) {
+ if (table[i].level == level)
+ return table[i];
+ }
+ return table[MAX_CACHE_LEVEL];
+ }
static kmp_hw_t get_topology_type(unsigned level) {
KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
@@ -2024,13 +2205,6 @@ public:
}
return KMP_HW_UNKNOWN;
}
-
-private:
- static const int MAX_CACHE_LEVEL = 3;
-
- size_t depth;
- info_t table[MAX_CACHE_LEVEL];
-
void get_leaf4_levels() {
unsigned level = 0;
while (depth < MAX_CACHE_LEVEL) {
@@ -2055,6 +2229,11 @@ private:
level++;
}
}
+ static const int MAX_CACHE_LEVEL = 3;
+
+private:
+ size_t depth;
+ info_t table[MAX_CACHE_LEVEL + 1];
};
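
cpuid_cache_info_t keeps one extra entry past MAX_CACHE_LEVEL so get_level() can always return a reference; a missing level comes back as the zeroed sentinel. A small sketch of that sentinel-slot pattern with illustrative names:

#include <cstddef>

// Fixed-size table with a trailing sentinel entry so lookups can always
// return a reference; level 0 means "no such cache level".
struct cache_table_t {
  struct info_t {
    unsigned level = 0;
    unsigned mask = 0;
  };
  static const int MAX_LEVELS = 3;

  size_t depth = 0;
  info_t table[MAX_LEVELS + 1]; // last slot is the sentinel

  const info_t &get_level(unsigned level) const {
    for (size_t i = 0; i < depth; ++i)
      if (table[i].level == level)
        return table[i];
    return table[MAX_LEVELS]; // sentinel: level == 0, mask == 0
  }
};
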
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
@@ -2065,7 +2244,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
kmp_cpuid buf;
*msg_id = kmp_i18n_null;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
}
@@ -2084,7 +2263,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
// available from cpuid on the current thread, and __kmp_xproc.
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
// Get an upper bound on the number of threads per package using cpuid(1).
// On some OS/chip combinations where HT is supported by the chip but is
@@ -2136,7 +2315,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
// From here on, we can assume that it is safe to call
// __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
- // __kmp_affinity_type = affinity_none.
+ // __kmp_affinity.type = affinity_none.
// Save the affinity mask for the current thread.
kmp_affinity_raii_t previous_affinity;
@@ -2362,6 +2541,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
hw_thread.ids[idx++] = threadInfo[i].threadId;
}
hw_thread.os_id = os;
+ hw_thread.original_idx = i;
}
__kmp_free(threadInfo);
@@ -2417,15 +2597,13 @@ enum {
INTEL_LEVEL_TYPE_INVALID = 0, // Package level
INTEL_LEVEL_TYPE_SMT = 1,
INTEL_LEVEL_TYPE_CORE = 2,
- INTEL_LEVEL_TYPE_TILE = 3,
- INTEL_LEVEL_TYPE_MODULE = 4,
+ INTEL_LEVEL_TYPE_MODULE = 3,
+ INTEL_LEVEL_TYPE_TILE = 4,
INTEL_LEVEL_TYPE_DIE = 5,
INTEL_LEVEL_TYPE_LAST = 6,
};
-
-struct cpuid_level_info_t {
- unsigned level_type, mask, mask_width, nitems, cache_mask;
-};
+KMP_BUILD_ASSERT(INTEL_LEVEL_TYPE_LAST < sizeof(unsigned) * CHAR_BIT);
+#define KMP_LEAF_1F_KNOWN_LEVELS ((1u << INTEL_LEVEL_TYPE_LAST) - 1u)
static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
switch (intel_type) {
@@ -2445,16 +2623,78 @@ static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
return KMP_HW_UNKNOWN;
}
-// This function takes the topology leaf, a levels array to store the levels
-// detected and a bitmap of the known levels.
-// Returns the number of levels in the topology
-static unsigned
-__kmp_x2apicid_get_levels(int leaf,
- cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
- kmp_uint64 known_levels) {
+static int __kmp_topology_type_2_intel_type(kmp_hw_t type) {
+ switch (type) {
+ case KMP_HW_SOCKET:
+ return INTEL_LEVEL_TYPE_INVALID;
+ case KMP_HW_THREAD:
+ return INTEL_LEVEL_TYPE_SMT;
+ case KMP_HW_CORE:
+ return INTEL_LEVEL_TYPE_CORE;
+ case KMP_HW_TILE:
+ return INTEL_LEVEL_TYPE_TILE;
+ case KMP_HW_MODULE:
+ return INTEL_LEVEL_TYPE_MODULE;
+ case KMP_HW_DIE:
+ return INTEL_LEVEL_TYPE_DIE;
+ default:
+ return INTEL_LEVEL_TYPE_INVALID;
+ }
+}
+
+struct cpuid_level_info_t {
+ unsigned level_type, mask, mask_width, nitems, cache_mask;
+};
+
+class cpuid_topo_desc_t {
+ unsigned desc = 0;
+
+public:
+ void clear() { desc = 0; }
+ bool contains(int intel_type) const {
+ KMP_DEBUG_ASSERT(intel_type >= 0 && intel_type < INTEL_LEVEL_TYPE_LAST);
+ if ((1u << intel_type) & desc)
+ return true;
+ return false;
+ }
+ bool contains_topology_type(kmp_hw_t type) const {
+ KMP_DEBUG_ASSERT(type >= 0 && type < KMP_HW_LAST);
+ int intel_type = __kmp_topology_type_2_intel_type(type);
+ return contains(intel_type);
+ }
+ bool contains(cpuid_topo_desc_t rhs) const {
+ return ((desc | rhs.desc) == desc);
+ }
+ void add(int intel_type) { desc |= (1u << intel_type); }
+ void add(cpuid_topo_desc_t rhs) { desc |= rhs.desc; }
+};
+
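
cpuid_topo_desc_t is essentially a set of small integers (the detected Intel level types) packed into one word. A standalone sketch of the same bit-set idiom with illustrative names:

#include <cassert>

// Set of small integer "level types" stored as bits in one word.
class level_set_t {
  unsigned bits = 0;

public:
  void clear() { bits = 0; }
  void add(int type) {
    assert(type >= 0 && type < 32);
    bits |= (1u << type);
  }
  bool contains(int type) const { return (bits >> type) & 1u; }
  // True when every level in rhs is also present here.
  bool contains(const level_set_t &rhs) const {
    return (bits | rhs.bits) == bits;
  }
  void add(const level_set_t &rhs) { bits |= rhs.bits; }
};
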
+struct cpuid_proc_info_t {
+ // Topology info
+ int os_id;
+ unsigned apic_id;
+ unsigned depth;
+ // Hybrid info
+ unsigned native_model_id;
+ int efficiency;
+ kmp_hw_core_type_t type;
+ cpuid_topo_desc_t description;
+
+ cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
+};
+
+// This function takes the topology leaf, an info pointer to store the levels
+// detected, and writable descriptors for the total topology.
+// Returns whether total types, depth, or description were modified.
+static bool __kmp_x2apicid_get_levels(int leaf, cpuid_proc_info_t *info,
+ kmp_hw_t total_types[KMP_HW_LAST],
+ int *total_depth,
+ cpuid_topo_desc_t *total_description) {
unsigned level, levels_index;
unsigned level_type, mask_width, nitems;
kmp_cpuid buf;
+ cpuid_level_info_t(&levels)[INTEL_LEVEL_TYPE_LAST] = info->levels;
+ bool retval = false;
// New algorithm has known topology layers act as highest unknown topology
// layers when unknown topology layers exist.
@@ -2469,10 +2709,12 @@ __kmp_x2apicid_get_levels(int leaf,
level_type = __kmp_extract_bits<8, 15>(buf.ecx);
mask_width = __kmp_extract_bits<0, 4>(buf.eax);
nitems = __kmp_extract_bits<0, 15>(buf.ebx);
- if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
- return 0;
+ if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) {
+ info->depth = 0;
+ return retval;
+ }
- if (known_levels & (1ull << level_type)) {
+ if (KMP_LEAF_1F_KNOWN_LEVELS & (1u << level_type)) {
// Add a new level to the topology
KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
levels[levels_index].level_type = level_type;
@@ -2488,6 +2730,26 @@ __kmp_x2apicid_get_levels(int leaf,
}
level++;
} while (level_type != INTEL_LEVEL_TYPE_INVALID);
+ KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
+ info->description.clear();
+ info->depth = levels_index;
+
+ // If types, depth, and total_description are uninitialized,
+ // then initialize them now
+ if (*total_depth == 0) {
+ *total_depth = info->depth;
+ total_description->clear();
+ for (int i = *total_depth - 1, j = 0; i >= 0; --i, ++j) {
+ total_types[j] =
+ __kmp_intel_type_2_topology_type(info->levels[i].level_type);
+ total_description->add(info->levels[i].level_type);
+ }
+ retval = true;
+ }
+
+ // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
+ if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
+ return 0;
// Set the masks to & with apicid
for (unsigned i = 0; i < levels_index; ++i) {
@@ -2497,42 +2759,65 @@ __kmp_x2apicid_get_levels(int leaf,
for (unsigned j = 0; j < i; ++j)
levels[i].mask ^= levels[j].mask;
} else {
- KMP_DEBUG_ASSERT(levels_index > 0);
+ KMP_DEBUG_ASSERT(i > 0);
levels[i].mask = (-1) << levels[i - 1].mask_width;
levels[i].cache_mask = 0;
}
+ info->description.add(info->levels[i].level_type);
}
- return levels_index;
+
+ // If this processor has level type not on other processors, then make
+ // sure to include it in total types, depth, and description.
+ // One assumption here is that the first type, i.e. socket, is known.
+ // Another assumption is that types array is always large enough to fit any
+ // new layers since its length is KMP_HW_LAST.
+ if (!total_description->contains(info->description)) {
+ for (int i = info->depth - 1, j = 0; i >= 0; --i, ++j) {
+ // If this level is known already, then skip it.
+ if (total_description->contains(levels[i].level_type))
+ continue;
+ // Unknown level, insert before last known level
+ kmp_hw_t curr_type =
+ __kmp_intel_type_2_topology_type(levels[i].level_type);
+ KMP_ASSERT(j != 0 && "Bad APIC Id information");
+ // Move over all known levels to make room for new level
+ for (int k = info->depth - 1; k >= j; --k) {
+ KMP_DEBUG_ASSERT(k + 1 < KMP_HW_LAST);
+ total_types[k + 1] = total_types[k];
+ }
+ // Insert new level
+ total_types[j] = curr_type;
+ (*total_depth)++;
+ }
+ total_description->add(info->description);
+ retval = true;
+ }
+ return retval;
}
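
Once the per-level masks and widths are known, each level's id is recovered from the x2APIC id by masking and then shifting by the previous level's mask width, as the per-thread loop later in this patch does. A simplified sketch of that decode step; level_info_t and the calling convention here are illustrative, not the runtime's exact types.

// Per-level mask/width as reported by CPUID leaf 0xB/0x1F (simplified).
struct level_info_t {
  unsigned mask;       // bits of the APIC id belonging to this level
  unsigned mask_width; // width of this level plus all lower levels
};

// Decode per-level ids from an APIC id. `levels` lists levels innermost
// first; `ids` is written outermost first, as in the topology code above.
static void decode_apic_id(unsigned apic_id, const level_info_t *levels,
                           int depth, int *ids) {
  for (int j = 0, idx = depth - 1; j < depth; ++j, --idx) {
    ids[idx] = (int)(apic_id & levels[j].mask);
    if (j > 0)
      ids[idx] >>= levels[j - 1].mask_width;
  }
}
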
static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
- cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
- unsigned levels_index;
kmp_cpuid buf;
- kmp_uint64 known_levels;
- int topology_leaf, highest_leaf, apic_id;
+ int topology_leaf, highest_leaf;
int num_leaves;
+ int depth = 0;
+ cpuid_topo_desc_t total_description;
static int leaves[] = {0, 0};
- kmp_i18n_id_t leaf_message_id;
+ // If affinity is disabled, __kmp_avail_proc may be zero
+ int ninfos = (__kmp_avail_proc > 0 ? __kmp_avail_proc : 1);
+ cpuid_proc_info_t *proc_info = (cpuid_proc_info_t *)__kmp_allocate(
+ (sizeof(cpuid_proc_info_t) + sizeof(cpuid_cache_info_t)) * ninfos);
+ cpuid_cache_info_t *cache_info = (cpuid_cache_info_t *)(proc_info + ninfos);
- KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
+ kmp_i18n_id_t leaf_message_id;
*msg_id = kmp_i18n_null;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
}
- // Figure out the known topology levels
- known_levels = 0ull;
- for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
- if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
- known_levels |= (1ull << i);
- }
- }
-
// Get the highest cpuid leaf supported
__kmp_x86_cpuid(0, 0, &buf);
highest_leaf = buf.eax;
@@ -2566,16 +2851,18 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
if (buf.ebx == 0)
continue;
topology_leaf = leaf;
- levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
- if (levels_index == 0)
+ __kmp_x2apicid_get_levels(leaf, &proc_info[0], types, &depth,
+ &total_description);
+ if (depth == 0)
continue;
break;
}
- if (topology_leaf == -1 || levels_index == 0) {
+ if (topology_leaf == -1 || depth == 0) {
*msg_id = leaf_message_id;
+ __kmp_free(proc_info);
return false;
}
- KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
+ KMP_ASSERT(depth <= INTEL_LEVEL_TYPE_LAST);
// The algorithm used starts by setting the affinity to each available thread
// and retrieving info from the cpuid instruction, so if we are not capable of
@@ -2585,46 +2872,23 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
// available from cpuid on the current thread, and __kmp_xproc.
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
- for (unsigned i = 0; i < levels_index; ++i) {
- if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
- __kmp_nThreadsPerCore = levels[i].nitems;
- } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
- nCoresPerPkg = levels[i].nitems;
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
+ for (int i = 0; i < depth; ++i) {
+ if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
+ __kmp_nThreadsPerCore = proc_info[0].levels[i].nitems;
+ } else if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
+ nCoresPerPkg = proc_info[0].levels[i].nitems;
}
}
__kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+ __kmp_free(proc_info);
return true;
}
- // Allocate the data structure to be returned.
- int depth = levels_index;
- for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
- types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
- __kmp_topology =
- kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
-
- // Insert equivalent cache types if they exist
- kmp_cache_info_t cache_info;
- for (size_t i = 0; i < cache_info.get_depth(); ++i) {
- const kmp_cache_info_t::info_t &info = cache_info[i];
- unsigned cache_mask = info.mask;
- unsigned cache_level = info.level;
- for (unsigned j = 0; j < levels_index; ++j) {
- unsigned hw_cache_mask = levels[j].cache_mask;
- kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
- if (hw_cache_mask == cache_mask && j < levels_index - 1) {
- kmp_hw_t type =
- __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
- __kmp_topology->set_equivalent_type(cache_type, type);
- }
- }
- }
-
// From here on, we can assume that it is safe to call
// __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
- // __kmp_affinity_type = affinity_none.
+ // __kmp_affinity.type = affinity_none.
// Save the affinity mask for the current thread.
kmp_affinity_raii_t previous_affinity;
@@ -2633,56 +2897,167 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
// to it, and obtaining the pertinent information using the cpuid instr.
unsigned int proc;
int hw_thread_index = 0;
- KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
- cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
- unsigned my_levels_index;
+ bool uniform_caches = true;
+ KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
// Skip this proc if it is not included in the machine model.
if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
continue;
}
KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
+ // Gather topology information
__kmp_affinity_dispatch->bind_thread(proc);
-
- // New algorithm
__kmp_x86_cpuid(topology_leaf, 0, &buf);
- apic_id = buf.edx;
- kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
- my_levels_index =
- __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
- if (my_levels_index == 0 || my_levels_index != levels_index) {
+ proc_info[hw_thread_index].os_id = proc;
+ proc_info[hw_thread_index].apic_id = buf.edx;
+ __kmp_x2apicid_get_levels(topology_leaf, &proc_info[hw_thread_index], types,
+ &depth, &total_description);
+ if (proc_info[hw_thread_index].depth == 0) {
*msg_id = kmp_i18n_str_InvalidCpuidInfo;
+ __kmp_free(proc_info);
return false;
}
- hw_thread.clear();
- hw_thread.os_id = proc;
- // Put in topology information
- for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
- hw_thread.ids[idx] = apic_id & my_levels[j].mask;
- if (j > 0) {
- hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
- }
- }
+ // Gather cache information and insert afterwards
+ cache_info[hw_thread_index].get_leaf4_levels();
+ if (uniform_caches && hw_thread_index > 0)
+ if (cache_info[0] != cache_info[hw_thread_index])
+ uniform_caches = false;
// Hybrid information
if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
- kmp_hw_core_type_t type;
- unsigned native_model_id;
- int efficiency;
- __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
- hw_thread.attrs.set_core_type(type);
- hw_thread.attrs.set_core_eff(efficiency);
+ __kmp_get_hybrid_info(&proc_info[hw_thread_index].type,
+ &proc_info[hw_thread_index].efficiency,
+ &proc_info[hw_thread_index].native_model_id);
}
hw_thread_index++;
}
KMP_ASSERT(hw_thread_index > 0);
+ previous_affinity.restore();
+
+ // Allocate the data structure to be returned.
+ __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
+
+ // Create topology Ids and hybrid types in __kmp_topology
+ for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ hw_thread.clear();
+ hw_thread.os_id = proc_info[i].os_id;
+ hw_thread.original_idx = i;
+ unsigned apic_id = proc_info[i].apic_id;
+ // Put in topology information
+ for (int j = 0, idx = depth - 1; j < depth; ++j, --idx) {
+ if (!(proc_info[i].description.contains_topology_type(
+ __kmp_topology->get_type(j)))) {
+ hw_thread.ids[idx] = kmp_hw_thread_t::UNKNOWN_ID;
+ } else {
+ hw_thread.ids[idx] = apic_id & proc_info[i].levels[j].mask;
+ if (j > 0) {
+ hw_thread.ids[idx] >>= proc_info[i].levels[j - 1].mask_width;
+ }
+ }
+ }
+ hw_thread.attrs.set_core_type(proc_info[i].type);
+ hw_thread.attrs.set_core_eff(proc_info[i].efficiency);
+ }
+
__kmp_topology->sort_ids();
+
+ // Change Ids to logical Ids
+ for (int j = 0; j < depth - 1; ++j) {
+ int new_id = 0;
+ int prev_id = __kmp_topology->at(0).ids[j];
+ int curr_id = __kmp_topology->at(0).ids[j + 1];
+ __kmp_topology->at(0).ids[j + 1] = new_id;
+ for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ if (hw_thread.ids[j] == prev_id && hw_thread.ids[j + 1] == curr_id) {
+ hw_thread.ids[j + 1] = new_id;
+ } else if (hw_thread.ids[j] == prev_id &&
+ hw_thread.ids[j + 1] != curr_id) {
+ curr_id = hw_thread.ids[j + 1];
+ hw_thread.ids[j + 1] = ++new_id;
+ } else {
+ prev_id = hw_thread.ids[j];
+ curr_id = hw_thread.ids[j + 1];
+ hw_thread.ids[j + 1] = ++new_id;
+ }
+ }
+ }
+
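
After sorting, the raw APIC-derived ids are rewritten into consecutive logical ids by one pass per adjacent pair of levels, keyed on whether the parent id and the raw child id changed relative to the previous thread. A standalone sketch of that renumbering pass, with a simplified hw_thread_t and std::vector in place of the runtime's arrays:

#include <vector>

struct hw_thread_t {
  std::vector<int> ids; // ids[level], sorted lexicographically beforehand
};

// Renumber raw ids at level j+1 into consecutive logical ids, walking the
// sorted thread list once per adjacent pair of levels (as done above).
static void assign_logical_ids(std::vector<hw_thread_t> &threads, int depth) {
  if (threads.empty())
    return;
  for (int j = 0; j < depth - 1; ++j) {
    int new_id = 0;
    int prev_id = threads[0].ids[j];
    int curr_id = threads[0].ids[j + 1];
    threads[0].ids[j + 1] = new_id;
    for (size_t i = 1; i < threads.size(); ++i) {
      hw_thread_t &t = threads[i];
      if (t.ids[j] == prev_id && t.ids[j + 1] == curr_id) {
        t.ids[j + 1] = new_id; // same unit as the previous thread
      } else if (t.ids[j] == prev_id) {
        curr_id = t.ids[j + 1]; // new child under the same parent
        t.ids[j + 1] = ++new_id;
      } else {
        prev_id = t.ids[j]; // new parent
        curr_id = t.ids[j + 1];
        t.ids[j + 1] = ++new_id;
      }
    }
  }
}
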
+ // First check for easy cache placement. This occurs when caches are
+ // equivalent to a layer in the CPUID leaf 0xb or 0x1f topology.
+ if (uniform_caches) {
+ for (size_t i = 0; i < cache_info[0].get_depth(); ++i) {
+ unsigned cache_mask = cache_info[0][i].mask;
+ unsigned cache_level = cache_info[0][i].level;
+ KMP_ASSERT(cache_level <= cpuid_cache_info_t::MAX_CACHE_LEVEL);
+ kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(cache_level);
+ __kmp_topology->set_equivalent_type(cache_type, cache_type);
+ for (int j = 0; j < depth; ++j) {
+ unsigned hw_cache_mask = proc_info[0].levels[j].cache_mask;
+ if (hw_cache_mask == cache_mask && j < depth - 1) {
+ kmp_hw_t type = __kmp_intel_type_2_topology_type(
+ proc_info[0].levels[j + 1].level_type);
+ __kmp_topology->set_equivalent_type(cache_type, type);
+ }
+ }
+ }
+ } else {
+ // If caches are non-uniform, then record which caches exist.
+ for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ for (size_t j = 0; j < cache_info[i].get_depth(); ++j) {
+ unsigned cache_level = cache_info[i][j].level;
+ kmp_hw_t cache_type =
+ cpuid_cache_info_t::get_topology_type(cache_level);
+ if (__kmp_topology->get_equivalent_type(cache_type) == KMP_HW_UNKNOWN)
+ __kmp_topology->set_equivalent_type(cache_type, cache_type);
+ }
+ }
+ }
+
+ // See if any cache level needs to be added manually through cache Ids
+ bool unresolved_cache_levels = false;
+ for (unsigned level = 1; level <= cpuid_cache_info_t::MAX_CACHE_LEVEL;
+ ++level) {
+ kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(level);
+ // This also filters out caches which may not be in the topology
+ // since the equivalent type might be KMP_HW_UNKNOWN.
+ if (__kmp_topology->get_equivalent_type(cache_type) == cache_type) {
+ unresolved_cache_levels = true;
+ break;
+ }
+ }
+
+ // Insert unresolved cache layers into machine topology using cache Ids
+ if (unresolved_cache_levels) {
+ int num_hw_threads = __kmp_topology->get_num_hw_threads();
+ int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
+ for (unsigned l = 1; l <= cpuid_cache_info_t::MAX_CACHE_LEVEL; ++l) {
+ kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(l);
+ if (__kmp_topology->get_equivalent_type(cache_type) != cache_type)
+ continue;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ int original_idx = __kmp_topology->at(i).original_idx;
+ ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ const cpuid_cache_info_t::info_t &info =
+ cache_info[original_idx].get_level(l);
+ // if cache level not in topology for this processor, then skip
+ if (info.level == 0)
+ continue;
+ ids[i] = info.mask & proc_info[original_idx].apic_id;
+ }
+ __kmp_topology->insert_layer(cache_type, ids);
+ }
+ }
+
if (!__kmp_topology->check_ids()) {
kmp_topology_t::deallocate(__kmp_topology);
__kmp_topology = nullptr;
*msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
+ __kmp_free(proc_info);
return false;
}
+ __kmp_free(proc_info);
return true;
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
@@ -2716,14 +3091,16 @@ static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
// Set the array sizes for the hierarchy layers
static void __kmp_dispatch_set_hierarchy_values() {
// Set the maximum number of L1's to number of cores
- // Set the maximum number of L2's to to either number of cores / 2 for
+ // Set the maximum number of L2's to either number of cores / 2 for
// Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
// Or the number of cores for Intel(R) Xeon(R) processors
// Set the maximum number of NUMA nodes and L3's to number of packages
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
-#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
+#if KMP_ARCH_X86_64 && \
+ (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_WINDOWS) && \
KMP_MIC_SUPPORTED
if (__kmp_mic_type >= mic3)
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
@@ -2738,7 +3115,9 @@ static void __kmp_dispatch_set_hierarchy_values() {
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
__kmp_nThreadsPerCore;
-#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
+#if KMP_ARCH_X86_64 && \
+ (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_WINDOWS) && \
KMP_MIC_SUPPORTED
if (__kmp_mic_type >= mic3)
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
@@ -2800,15 +3179,51 @@ static inline const char *__kmp_cpuinfo_get_envvar() {
return envvar;
}
+static bool __kmp_package_id_from_core_siblings_list(unsigned **threadInfo,
+ unsigned num_avail,
+ unsigned idx) {
+ if (!KMP_AFFINITY_CAPABLE())
+ return false;
+
+ char path[256];
+ KMP_SNPRINTF(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%u/topology/core_siblings_list",
+ threadInfo[idx][osIdIndex]);
+ kmp_affin_mask_t *siblings = __kmp_parse_cpu_list(path);
+ for (unsigned i = 0; i < num_avail; ++i) {
+ unsigned cpu_id = threadInfo[i][osIdIndex];
+ KMP_ASSERT(cpu_id < __kmp_affin_mask_size * CHAR_BIT);
+ if (!KMP_CPU_ISSET(cpu_id, siblings))
+ continue;
+ if (threadInfo[i][pkgIdIndex] == UINT_MAX) {
+      // Arbitrarily pick the first index we encounter; it only matters that
+      // the value is the same for all siblings.
+ threadInfo[i][pkgIdIndex] = idx;
+ } else if (threadInfo[i][pkgIdIndex] != idx) {
+ // Contradictory sibling lists.
+ KMP_CPU_FREE(siblings);
+ return false;
+ }
+ }
+ KMP_ASSERT(threadInfo[idx][pkgIdIndex] != UINT_MAX);
+ KMP_CPU_FREE(siblings);
+ return true;
+}
+
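
The new helper derives package ids from the kernel's core_siblings_list: every CPU in a sibling set gets the same package id, and contradictory lists are rejected. A simplified sketch of the assignment step, using a std::set of CPUs in place of the parsed affinity mask and -1 for "not yet assigned"; in the helper above the package id chosen is simply the index of the CPU whose sibling list was read, which is enough because only equality of the ids matters.

#include <set>
#include <vector>

// Assign `pkg_id` to every CPU in `siblings` (the sibling set of one CPU).
// Returns false if a CPU already carries a different package id, i.e. the
// sibling lists contradict each other (mirrors the helper above).
static bool assign_package_from_siblings(std::vector<int> &pkg_ids,
                                         const std::set<int> &siblings,
                                         int pkg_id) {
  for (int cpu : siblings) {
    if (cpu < 0 || cpu >= (int)pkg_ids.size())
      return false;
    if (pkg_ids[cpu] == -1)
      pkg_ids[cpu] = pkg_id; // first time we see this CPU
    else if (pkg_ids[cpu] != pkg_id)
      return false; // contradictory sibling lists
  }
  return true;
}
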
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
-// affinity map.
+// affinity map. On AIX, the map is obtained through system SRAD (Scheduler
+// Resource Allocation Domain).
static bool __kmp_affinity_create_cpuinfo_map(int *line,
kmp_i18n_id_t *const msg_id) {
+ *msg_id = kmp_i18n_null;
+
+#if KMP_OS_AIX
+ unsigned num_records = __kmp_xproc;
+#else
const char *filename = __kmp_cpuinfo_get_filename();
const char *envvar = __kmp_cpuinfo_get_envvar();
- *msg_id = kmp_i18n_null;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
}
@@ -2865,6 +3280,7 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
*msg_id = kmp_i18n_str_CantRewindCpuinfo;
return false;
}
+#endif // KMP_OS_AIX
// Allocate the array of records to store the proc info in. The dummy
// element at the end makes the logic in filling them out easier to code.
@@ -2894,8 +3310,96 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
INIT_PROC_INFO(threadInfo[i]);
}
+#if KMP_OS_AIX
+ int smt_threads;
+ lpar_info_format1_t cpuinfo;
+ unsigned num_avail = __kmp_xproc;
+
+ if (__kmp_affinity.flags.verbose)
+ KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology");
+
+ // Get the number of SMT threads per core.
+ smt_threads = syssmt(GET_NUMBER_SMT_SETS, 0, 0, NULL);
+
+  // Allocate a resource set containing available system resources.
+ rsethandle_t sys_rset = rs_alloc(RS_SYSTEM);
+ if (sys_rset == NULL) {
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+ // Allocate a resource set for the SRAD info.
+ rsethandle_t srad = rs_alloc(RS_EMPTY);
+ if (srad == NULL) {
+ rs_free(sys_rset);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+
+ // Get the SRAD system detail level.
+ int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0);
+ if (sradsdl < 0) {
+ rs_free(sys_rset);
+ rs_free(srad);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+ // Get the number of RADs at that SRAD SDL.
+ int num_rads = rs_numrads(sys_rset, sradsdl, 0);
+ if (num_rads < 0) {
+ rs_free(sys_rset);
+ rs_free(srad);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+
+ // Get the maximum number of procs that may be contained in a resource set.
+ int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0);
+ if (max_procs < 0) {
+ rs_free(sys_rset);
+ rs_free(srad);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+
+ int cur_rad = 0;
+ int num_set = 0;
+ for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS;
+ ++srad_idx) {
+ // Check if the SRAD is available in the RSET.
+ if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0)
+ continue;
+
+ for (int cpu = 0; cpu < max_procs; cpu++) {
+ // Set the info for the cpu if it is in the SRAD.
+ if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) {
+ threadInfo[cpu][osIdIndex] = cpu;
+ threadInfo[cpu][pkgIdIndex] = cur_rad;
+ threadInfo[cpu][coreIdIndex] = cpu / smt_threads;
+ ++num_set;
+ if (num_set >= num_avail) {
+ // Done if all available CPUs have been set.
+ break;
+ }
+ }
+ }
+ ++cur_rad;
+ }
+ rs_free(sys_rset);
+ rs_free(srad);
+
+ // The topology is already sorted.
+
+#else // !KMP_OS_AIX
unsigned num_avail = 0;
*line = 0;
+#if KMP_ARCH_S390X
+ bool reading_s390x_sys_info = true;
+#endif
while (!feof(f)) {
// Create an inner scoping level, so that all the goto targets at the end of
// the loop appear in an outer scoping level. This avoids warnings about
@@ -2931,7 +3435,31 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
}
(*line)++;
+#if KMP_ARCH_LOONGARCH64
+    // The parsing logic of /proc/cpuinfo in this function depends heavily on
+ // the blank lines between each processor info block. But on LoongArch a
+ // blank line exists before the first processor info block (i.e. after the
+ // "system type" line). This blank line was added because the "system
+ // type" line is unrelated to any of the CPUs. We must skip this line so
+ // that the original logic works on LoongArch.
+ if (*buf == '\n' && *line == 2)
+ continue;
+#endif
+#if KMP_ARCH_S390X
+ // s390x /proc/cpuinfo starts with a variable number of lines containing
+ // the overall system information. Skip them.
+ if (reading_s390x_sys_info) {
+ if (*buf == '\n')
+ reading_s390x_sys_info = false;
+ continue;
+ }
+#endif
+
+#if KMP_ARCH_S390X
+ char s1[] = "cpu number";
+#else
char s1[] = "processor";
+#endif
if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
CHECK_LINE;
char *p = strchr(buf + sizeof(s1) - 1, ':');
@@ -2957,6 +3485,23 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
threadInfo[num_avail][osIdIndex]);
__kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
+#if KMP_ARCH_S390X
+ // Disambiguate physical_package_id.
+ unsigned book_id;
+ KMP_SNPRINTF(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%u/topology/book_id",
+ threadInfo[num_avail][osIdIndex]);
+ __kmp_read_from_file(path, "%u", &book_id);
+ threadInfo[num_avail][pkgIdIndex] |= (book_id << 8);
+
+ unsigned drawer_id;
+ KMP_SNPRINTF(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%u/topology/drawer_id",
+ threadInfo[num_avail][osIdIndex]);
+ __kmp_read_from_file(path, "%u", &drawer_id);
+ threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16);
+#endif
+
KMP_SNPRINTF(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/topology/core_id",
threadInfo[num_avail][osIdIndex]);
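
For the s390x disambiguation above, a small made-up example may help: assuming physical_package_id, book_id and drawer_id each stay below 256, the composite id packs them into separate byte lanes, so two CPUs compare equal only when all three levels match.

// Made-up s390x example: pack package/book/drawer into one composite id.
#include <cstdio>

int main() {
  unsigned pkg_id = 3, book_id = 1, drawer_id = 2; // illustrative values
  unsigned composite = pkg_id | (book_id << 8) | (drawer_id << 16);
  std::printf("composite package id = 0x%06x\n", composite); // 0x020103
  // Two CPUs end up in the same "package" only if physical package, book
  // and drawer all match, which is what the grouping code needs.
  return 0;
}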
@@ -3040,21 +3585,17 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
return false;
}
- // Check for missing fields. The osId field must be there, and we
- // currently require that the physical id field is specified, also.
+ // Check for missing fields. The osId field must be there. The physical
+ // id field will be checked later.
if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_MissingProcField;
return false;
}
- if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
- CLEANUP_THREAD_INFO;
- *msg_id = kmp_i18n_str_MissingPhysicalIDField;
- return false;
- }
// Skip this proc if it is not included in the machine model.
- if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
+ if (KMP_AFFINITY_CAPABLE() &&
+ !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
__kmp_affin_fullMask)) {
INIT_PROC_INFO(threadInfo[num_avail]);
continue;
@@ -3080,6 +3621,18 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
}
*line = 0;
+ // At least on powerpc, Linux may return -1 for physical_package_id. Try
+ // to reconstruct topology from core_siblings_list in that case.
+ for (i = 0; i < num_avail; ++i) {
+ if (threadInfo[i][pkgIdIndex] == UINT_MAX) {
+ if (!__kmp_package_id_from_core_siblings_list(threadInfo, num_avail, i)) {
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_MissingPhysicalIDField;
+ return false;
+ }
+ }
+ }
+
#if KMP_MIC && REDUCE_TEAM_SIZE
unsigned teamSize = 0;
#endif // KMP_MIC && REDUCE_TEAM_SIZE
@@ -3096,6 +3649,8 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
qsort(threadInfo, num_avail, sizeof(*threadInfo),
__kmp_affinity_cmp_ProcCpuInfo_phys_id);
+#endif // KMP_OS_AIX
+
// The table is now sorted by pkgId / coreId / threadId, but we really don't
// know the radix of any of the fields. pkgId's may be sparsely assigned among
// the chips on a system. Although coreId's are usually assigned
@@ -3210,7 +3765,7 @@ restart_radix_check:
return false;
}
- // If the thread ids were not specified and we see entries entries that
+ // If the thread ids were not specified and we see entries that
// are duplicates, start the loop over and assign the thread ids manually.
assign_thread_ids = true;
goto restart_radix_check;
@@ -3239,7 +3794,7 @@ restart_radix_check:
// not enabled.
__kmp_ncores = totals[coreIdIndex];
if (!KMP_AFFINITY_CAPABLE()) {
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
return true;
}
@@ -3301,10 +3856,10 @@ restart_radix_check:
for (i = 0; i < num_avail; ++i) {
unsigned os = threadInfo[i][osIdIndex];
int src_index;
- int dst_index = 0;
kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
hw_thread.clear();
hw_thread.os_id = os;
+ hw_thread.original_idx = i;
idx = 0;
for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
@@ -3318,7 +3873,6 @@ restart_radix_check:
} else if (src_index == threadIdIndex) {
hw_thread.ids[threadLevel] = threadInfo[i][src_index];
}
- dst_index++;
}
}
@@ -3329,6 +3883,32 @@ restart_radix_check:
__kmp_free(counts);
CLEANUP_THREAD_INFO;
__kmp_topology->sort_ids();
+
+ int tlevel = __kmp_topology->get_level(KMP_HW_THREAD);
+ if (tlevel > 0) {
+ // If the thread level does not have ids, then put them in.
+ if (__kmp_topology->at(0).ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID) {
+ __kmp_topology->at(0).ids[tlevel] = 0;
+ }
+ for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ if (hw_thread.ids[tlevel] != kmp_hw_thread_t::UNKNOWN_ID)
+ continue;
+ kmp_hw_thread_t &prev_hw_thread = __kmp_topology->at(i - 1);
+ // Check if socket, core, anything above thread level changed.
+ // If the ids did change, then restart thread id at 0
+ // Otherwise, set thread id to prev thread's id + 1
+ for (int j = 0; j < tlevel; ++j) {
+ if (hw_thread.ids[j] != prev_hw_thread.ids[j]) {
+ hw_thread.ids[tlevel] = 0;
+ break;
+ }
+ }
+ if (hw_thread.ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID)
+ hw_thread.ids[tlevel] = prev_hw_thread.ids[tlevel] + 1;
+ }
+ }
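
A standalone sketch of the thread-id backfill above, under the assumption of a simple {package, core, thread} table that is already sorted: ids restart at 0 whenever any level above the thread level changes, and otherwise continue from the previous hardware thread.

// Standalone illustration of the thread-id backfill: assign missing thread
// ids (-1) after sorting by package/core, restarting at 0 on a core change.
#include <cstdio>
#include <vector>

struct HwThread {
  int pkg, core, thread; // thread == -1 means "unknown"
};

int main() {
  std::vector<HwThread> t = {
      {0, 0, -1}, {0, 0, -1}, {0, 1, -1}, {0, 1, -1}, {1, 0, -1}, {1, 0, -1}};
  if (t[0].thread == -1)
    t[0].thread = 0;
  for (size_t i = 1; i < t.size(); ++i) {
    if (t[i].thread != -1)
      continue;
    bool upper_changed =
        t[i].pkg != t[i - 1].pkg || t[i].core != t[i - 1].core;
    t[i].thread = upper_changed ? 0 : t[i - 1].thread + 1;
  }
  for (const HwThread &h : t)
    std::printf("pkg=%d core=%d thread=%d\n", h.pkg, h.core, h.thread);
  // Each core's two hardware threads end up with thread ids 0 and 1.
  return 0;
}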
+
if (!__kmp_topology->check_ids()) {
kmp_topology_t::deallocate(__kmp_topology);
__kmp_topology = nullptr;
@@ -3341,16 +3921,25 @@ restart_radix_check:
// Create and return a table of affinity masks, indexed by OS thread ID.
// This routine handles OR'ing together all the affinity masks of threads
// that are sufficiently close, if granularity > fine.
-static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
- unsigned *numUnique) {
+template <typename FindNextFunctionType>
+static void __kmp_create_os_id_masks(unsigned *numUnique,
+ kmp_affinity_t &affinity,
+ FindNextFunctionType find_next) {
// First form a table of affinity masks in order of OS thread id.
int maxOsId;
int i;
int numAddrs = __kmp_topology->get_num_hw_threads();
int depth = __kmp_topology->get_depth();
+ const char *env_var = __kmp_get_affinity_env_var(affinity);
KMP_ASSERT(numAddrs);
KMP_ASSERT(depth);
+ i = find_next(-1);
+  // If no HW thread location satisfying the find_next conditions was found,
+  // then return and fall back to the incremental find_next.
+ if (i >= numAddrs)
+ return;
+
maxOsId = 0;
for (i = numAddrs - 1;; --i) {
int osId = __kmp_topology->at(i).os_id;
@@ -3360,14 +3949,14 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
if (i == 0)
break;
}
- kmp_affin_mask_t *osId2Mask;
- KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
- KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
- if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
- KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
+ affinity.num_os_id_masks = maxOsId + 1;
+ KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks);
+ KMP_ASSERT(affinity.gran_levels >= 0);
+ if (affinity.flags.verbose && (affinity.gran_levels > 0)) {
+ KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels);
}
- if (__kmp_affinity_gran_levels >= (int)depth) {
- KMP_AFF_WARNING(AffThreadsMayMigrate);
+ if (affinity.gran_levels >= (int)depth) {
+ KMP_AFF_WARNING(affinity, AffThreadsMayMigrate);
}
// Run through the table, forming the masks for all threads on each core.
@@ -3380,22 +3969,25 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
kmp_affin_mask_t *sum;
KMP_CPU_ALLOC_ON_STACK(sum);
KMP_CPU_ZERO(sum);
- KMP_CPU_SET(__kmp_topology->at(0).os_id, sum);
- for (i = 1; i < numAddrs; i++) {
+
+ i = j = leader = find_next(-1);
+ KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
+ kmp_full_mask_modifier_t full_mask;
+ for (i = find_next(i); i < numAddrs; i = find_next(i)) {
// If this thread is sufficiently close to the leader (within the
// granularity setting), then set the bit for this os thread in the
// affinity mask for this group, and go on to the next thread.
- if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) {
+ if (__kmp_topology->is_close(leader, i, affinity)) {
KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
continue;
}
// For every thread in this group, copy the mask to the thread's entry in
- // the osId2Mask table. Mark the first address as a leader.
- for (; j < i; j++) {
+ // the OS Id mask table. Mark the first address as a leader.
+ for (; j < i; j = find_next(j)) {
int osId = __kmp_topology->at(j).os_id;
KMP_DEBUG_ASSERT(osId <= maxOsId);
- kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
+ kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
KMP_CPU_COPY(mask, sum);
__kmp_topology->at(j).leader = (j == leader);
}
@@ -3403,25 +3995,30 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
// Start a new mask.
leader = i;
+ full_mask.include(sum);
KMP_CPU_ZERO(sum);
KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
}
// For every thread in last group, copy the mask to the thread's
- // entry in the osId2Mask table.
- for (; j < i; j++) {
+ // entry in the OS Id mask table.
+ for (; j < i; j = find_next(j)) {
int osId = __kmp_topology->at(j).os_id;
KMP_DEBUG_ASSERT(osId <= maxOsId);
- kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
+ kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
KMP_CPU_COPY(mask, sum);
__kmp_topology->at(j).leader = (j == leader);
}
+ full_mask.include(sum);
unique++;
KMP_CPU_FREE_FROM_STACK(sum);
- *maxIndex = maxOsId;
+ // See if the OS Id mask table further restricts or changes the full mask
+ if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
+ __kmp_topology->print(env_var);
+ }
+
*numUnique = unique;
- return osId2Mask;
}
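
A simplified model of what the OS Id mask construction produces when granularity is coarser than a single hardware thread: every OS proc that shares the leader's ids down to the granularity level is OR'ed into one mask, and each member of the group ends up pointing at that shared mask. The core-level grouping and the fixed-size bitset below are assumptions of this sketch, not the runtime's data structures.

// Sketch of OS-id masks at core granularity: every hardware thread on the
// same core shares one mask containing all of that core's OS proc ids.
#include <bitset>
#include <cstdio>
#include <map>
#include <vector>

int main() {
  struct HwThread { int os_id, core_id; };
  std::vector<HwThread> threads = {{0, 0}, {4, 0}, {1, 1}, {5, 1}};

  std::map<int, std::bitset<64>> core_mask; // core_id -> union of os ids
  for (const HwThread &t : threads)
    core_mask[t.core_id].set(t.os_id);

  for (const HwThread &t : threads) {
    const std::bitset<64> &mask = core_mask[t.core_id]; // shared per core
    std::printf("os_id %d -> mask %s\n", t.os_id,
                mask.to_string().substr(64 - 8).c_str()); // low 8 bits
  }
  return 0;
}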
// Stuff for the affinity proclist parsers. It's easier to declare these vars
@@ -3454,7 +4051,7 @@ static int nextNewMask;
{ \
if (((_osId) > _maxOsId) || \
(!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, _osId); \
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId); \
} else { \
ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
} \
@@ -3462,12 +4059,13 @@ static int nextNewMask;
// Re-parse the proclist (for the explicit affinity type), and form the list
// of affinity newMasks indexed by gtid.
-static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
- unsigned int *out_numMasks,
- const char *proclist,
- kmp_affin_mask_t *osId2Mask,
- int maxOsId) {
+static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {
int i;
+ kmp_affin_mask_t **out_masks = &affinity.masks;
+ unsigned *out_numMasks = &affinity.num_masks;
+ const char *proclist = affinity.proclist;
+ kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
+ int maxOsId = affinity.num_os_id_masks - 1;
const char *scan = proclist;
const char *next = proclist;
@@ -3505,7 +4103,7 @@ static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
// Copy the mask for that osId to the sum (union) mask.
if ((num > maxOsId) ||
(!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, num);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
KMP_CPU_ZERO(sumMask);
} else {
KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
@@ -3537,7 +4135,7 @@ static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
// Add the mask for that osId to the sum mask.
if ((num > maxOsId) ||
(!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, num);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
} else {
KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
setSize++;
@@ -3672,10 +4270,11 @@ signed := + signed
signed := - signed
-----------------------------------------------------------------------------*/
static void __kmp_process_subplace_list(const char **scan,
- kmp_affin_mask_t *osId2Mask,
- int maxOsId, kmp_affin_mask_t *tempMask,
+ kmp_affinity_t &affinity, int maxOsId,
+ kmp_affin_mask_t *tempMask,
int *setSize) {
const char *next;
+ kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
for (;;) {
int start, count, stride, i;
@@ -3694,7 +4293,7 @@ static void __kmp_process_subplace_list(const char **scan,
if (**scan == '}' || **scan == ',') {
if ((start > maxOsId) ||
(!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, start);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
} else {
KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
(*setSize)++;
@@ -3723,7 +4322,7 @@ static void __kmp_process_subplace_list(const char **scan,
for (i = 0; i < count; i++) {
if ((start > maxOsId) ||
(!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, start);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
break; // don't proliferate warnings for large count
} else {
KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
@@ -3770,7 +4369,7 @@ static void __kmp_process_subplace_list(const char **scan,
for (i = 0; i < count; i++) {
if ((start > maxOsId) ||
(!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, start);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
break; // don't proliferate warnings for large count
} else {
KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
@@ -3789,21 +4388,22 @@ static void __kmp_process_subplace_list(const char **scan,
}
}
-static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
+static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
int maxOsId, kmp_affin_mask_t *tempMask,
int *setSize) {
const char *next;
+ kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
// valid follow sets are '{' '!' and num
SKIP_WS(*scan);
if (**scan == '{') {
(*scan)++; // skip '{'
- __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
+ __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize);
KMP_ASSERT2(**scan == '}', "bad explicit places list");
(*scan)++; // skip '}'
} else if (**scan == '!') {
(*scan)++; // skip '!'
- __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
+ __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize);
KMP_CPU_COMPLEMENT(maxOsId, tempMask);
} else if ((**scan >= '0') && (**scan <= '9')) {
next = *scan;
@@ -3812,7 +4412,7 @@ static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
KMP_ASSERT(num >= 0);
if ((num > maxOsId) ||
(!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, num);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
} else {
KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
(*setSize)++;
@@ -3824,12 +4424,13 @@ static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
}
// static void
-void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
- unsigned int *out_numMasks,
- const char *placelist,
- kmp_affin_mask_t *osId2Mask,
- int maxOsId) {
+void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {
int i, j, count, stride, sign;
+ kmp_affin_mask_t **out_masks = &affinity.masks;
+ unsigned *out_numMasks = &affinity.num_masks;
+ const char *placelist = affinity.proclist;
+ kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
+ int maxOsId = affinity.num_os_id_masks - 1;
const char *scan = placelist;
const char *next = placelist;
@@ -3849,7 +4450,7 @@ void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
int setSize = 0;
for (;;) {
- __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
+ __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize);
// valid follow sets are ',' ':' and EOL
SKIP_WS(scan);
@@ -3930,7 +4531,7 @@ void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
(!KMP_CPU_ISSET(j + stride,
KMP_CPU_INDEX(osId2Mask, j + stride)))) {
if (i < count - 1) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, j + stride);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride);
}
continue;
}
@@ -4028,28 +4629,149 @@ static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
static int *procarr = NULL;
static int __kmp_aff_depth = 0;
+static int *__kmp_osid_to_hwthread_map = NULL;
+
+static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
+ kmp_affinity_ids_t &ids,
+ kmp_affinity_attrs_t &attrs) {
+ if (!KMP_AFFINITY_CAPABLE())
+ return;
+
+  // Initialize ids and attrs thread data
+ for (int i = 0; i < KMP_HW_LAST; ++i)
+ ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
+
+ // Iterate through each os id within the mask and determine
+ // the topology id and attribute information
+ int cpu;
+ int depth = __kmp_topology->get_depth();
+ KMP_CPU_SET_ITERATE(cpu, mask) {
+ int osid_idx = __kmp_osid_to_hwthread_map[cpu];
+ ids.os_id = cpu;
+ const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
+ for (int level = 0; level < depth; ++level) {
+ kmp_hw_t type = __kmp_topology->get_type(level);
+ int id = hw_thread.sub_ids[level];
+ if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
+ ids.ids[type] = id;
+ } else {
+ // This mask spans across multiple topology units, set it as such
+ // and mark every level below as such as well.
+ ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ for (; level < depth; ++level) {
+ kmp_hw_t type = __kmp_topology->get_type(level);
+ ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ }
+ }
+ }
+ if (!attrs.valid) {
+ attrs.core_type = hw_thread.attrs.get_core_type();
+ attrs.core_eff = hw_thread.attrs.get_core_eff();
+ attrs.valid = 1;
+ } else {
+ // This mask spans across multiple attributes, set it as such
+ if (attrs.core_type != hw_thread.attrs.get_core_type())
+ attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
+ if (attrs.core_eff != hw_thread.attrs.get_core_eff())
+ attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
+ }
+ }
+}
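
The per-level folding above can be summarized with a toy model: keep a level's id while every CPU in the mask agrees on it, and flip the level (plus everything below it) to a "multiple" marker the first time two CPUs disagree. UNKNOWN and MULTIPLE below are stand-in sentinels for the runtime's kmp_hw_thread_t constants.

// Minimal model of the per-level id folding: keep an id while all CPUs in
// the mask agree on it, otherwise mark that level and all lower levels as
// spanning multiple topology units.
#include <cstdio>
#include <vector>

enum { UNKNOWN = -1, MULTIPLE = -2 };

int main() {
  // ids per CPU in the mask, per level: {socket, core, thread}
  std::vector<std::vector<int>> cpu_ids = {{0, 2, 0}, {0, 2, 1}, {0, 3, 0}};
  const int depth = 3;
  std::vector<int> folded(depth, UNKNOWN);
  for (const std::vector<int> &ids : cpu_ids) {
    for (int level = 0; level < depth; ++level) {
      if (folded[level] == UNKNOWN || folded[level] == ids[level]) {
        folded[level] = ids[level];
      } else {
        for (int l = level; l < depth; ++l)
          folded[l] = MULTIPLE;
        break;
      }
    }
  }
  std::printf("socket=%d core=%d thread=%d\n", folded[0], folded[1],
              folded[2]); // socket=0 core=-2 thread=-2
  return 0;
}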
+
+static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
+ if (!KMP_AFFINITY_CAPABLE())
+ return;
+ const kmp_affin_mask_t *mask = th->th.th_affin_mask;
+ kmp_affinity_ids_t &ids = th->th.th_topology_ids;
+ kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
+ __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
+}
+
+// Assign the topology information to each place in the place list.
+// A thread can then grab not only its affinity mask, but the topology
+// information associated with that mask, e.g., which socket a thread is on.
+static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
+ if (!KMP_AFFINITY_CAPABLE())
+ return;
+ if (affinity.type != affinity_none) {
+ KMP_ASSERT(affinity.num_os_id_masks);
+ KMP_ASSERT(affinity.os_id_masks);
+ }
+ KMP_ASSERT(affinity.num_masks);
+ KMP_ASSERT(affinity.masks);
+ KMP_ASSERT(__kmp_affin_fullMask);
+
+ int max_cpu = __kmp_affin_fullMask->get_max_cpu();
+ int num_hw_threads = __kmp_topology->get_num_hw_threads();
+
+ // Allocate thread topology information
+ if (!affinity.ids) {
+ affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate(
+ sizeof(kmp_affinity_ids_t) * affinity.num_masks);
+ }
+ if (!affinity.attrs) {
+ affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate(
+ sizeof(kmp_affinity_attrs_t) * affinity.num_masks);
+ }
+ if (!__kmp_osid_to_hwthread_map) {
+    // Use max_cpu + 1 so that max_cpu is a valid index into the map
+ __kmp_osid_to_hwthread_map =
+ (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1));
+ }
+
+ // Create the OS proc to hardware thread map
+ for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) {
+ int os_id = __kmp_topology->at(hw_thread).os_id;
+ if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask))
+ __kmp_osid_to_hwthread_map[os_id] = hw_thread;
+ }
+
+ for (unsigned i = 0; i < affinity.num_masks; ++i) {
+ kmp_affinity_ids_t &ids = affinity.ids[i];
+ kmp_affinity_attrs_t &attrs = affinity.attrs[i];
+ kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i);
+ __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
+ }
+}
+
+// Called when __kmp_topology is ready
+static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
+ // Initialize other data structures which depend on the topology
+ if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
+ machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
+ __kmp_affinity_get_topology_info(affinity);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
+#endif
+ }
+}
// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
-static void __kmp_create_affinity_none_places() {
+static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
KMP_ASSERT(__kmp_affin_fullMask != NULL);
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
- __kmp_affinity_num_masks = 1;
- KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
- kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
+ KMP_ASSERT(affinity.type == affinity_none);
+ KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
+ affinity.num_masks = 1;
+ KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
+ kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
KMP_CPU_COPY(dest, __kmp_affin_fullMask);
+ __kmp_aux_affinity_initialize_other_data(affinity);
}
-static void __kmp_aux_affinity_initialize(void) {
- if (__kmp_affinity_masks != NULL) {
- KMP_ASSERT(__kmp_affin_fullMask != NULL);
- return;
- }
-
+static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
// Create the "full" mask - this defines all of the processors that we
// consider to be in the machine model. If respect is set, then it is the
// initialization thread's affinity mask. Otherwise, it is all processors that
// we know about on the machine.
+ int verbose = affinity.flags.verbose;
+ const char *env_var = affinity.env_var;
+
+ // Already initialized
+ if (__kmp_affin_fullMask && __kmp_affin_origMask)
+ return;
+
if (__kmp_affin_fullMask == NULL) {
KMP_CPU_ALLOC(__kmp_affin_fullMask);
}
@@ -4060,7 +4782,7 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
// Make a copy before possible expanding to the entire machine mask
__kmp_affin_origMask->copy(__kmp_affin_fullMask);
- if (__kmp_affinity_respect_mask) {
+ if (affinity.flags.respect) {
// Count the number of available processors.
unsigned i;
__kmp_avail_proc = 0;
@@ -4071,24 +4793,24 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_avail_proc++;
}
if (__kmp_avail_proc > __kmp_xproc) {
- KMP_AFF_WARNING(ErrorInitializeAffinity);
- __kmp_affinity_type = affinity_none;
+ KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
+ affinity.type = affinity_none;
KMP_AFFINITY_DISABLE();
return;
}
- if (__kmp_affinity_verbose) {
+ if (verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
__kmp_affin_fullMask);
- KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+ KMP_INFORM(InitOSProcSetRespect, env_var, buf);
}
} else {
- if (__kmp_affinity_verbose) {
+ if (verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
__kmp_affin_fullMask);
- KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+ KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
}
__kmp_avail_proc =
__kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
@@ -4103,8 +4825,13 @@ static void __kmp_aux_affinity_initialize(void) {
#endif
}
}
+}
+static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
+ bool success = false;
+ const char *env_var = affinity.env_var;
kmp_i18n_id_t msg_id = kmp_i18n_null;
+ int verbose = affinity.flags.verbose;
// For backward compatibility, setting KMP_CPUINFO_FILE =>
// KMP_TOPOLOGY_METHOD=cpuinfo
@@ -4113,7 +4840,6 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affinity_top_method = affinity_top_method_cpuinfo;
}
- bool success = false;
if (__kmp_affinity_top_method == affinity_top_method_all) {
// In the default code path, errors are not fatal - we just try using
// another method. We only emit a warning message if affinity is on, or the
@@ -4123,11 +4849,11 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
if (!__kmp_hwloc_error) {
success = __kmp_affinity_create_hwloc_map(&msg_id);
- if (!success && __kmp_affinity_verbose) {
- KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+ if (!success && verbose) {
+ KMP_INFORM(AffIgnoringHwloc, env_var);
}
- } else if (__kmp_affinity_verbose) {
- KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+ } else if (verbose) {
+ KMP_INFORM(AffIgnoringHwloc, env_var);
}
}
#endif
@@ -4135,24 +4861,24 @@ static void __kmp_aux_affinity_initialize(void) {
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
if (!success) {
success = __kmp_affinity_create_x2apicid_map(&msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
}
if (!success) {
success = __kmp_affinity_create_apicid_map(&msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
-#if KMP_OS_LINUX
+#if KMP_OS_LINUX || KMP_OS_AIX
if (!success) {
int line = 0;
success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
}
#endif /* KMP_OS_LINUX */
@@ -4160,16 +4886,16 @@ static void __kmp_aux_affinity_initialize(void) {
#if KMP_GROUP_AFFINITY
if (!success && (__kmp_num_proc_groups > 1)) {
success = __kmp_affinity_create_proc_group_map(&msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
}
#endif /* KMP_GROUP_AFFINITY */
if (!success) {
success = __kmp_affinity_create_flat_map(&msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
KMP_ASSERT(success);
}
@@ -4241,130 +4967,187 @@ static void __kmp_aux_affinity_initialize(void) {
// Early exit if topology could not be created
if (!__kmp_topology) {
if (KMP_AFFINITY_CAPABLE()) {
- KMP_AFF_WARNING(ErrorInitializeAffinity);
+ KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
}
if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
__kmp_ncores > 0) {
__kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
__kmp_topology->canonicalize(nPackages, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
- if (__kmp_affinity_verbose) {
- __kmp_topology->print("KMP_AFFINITY");
+ if (verbose) {
+ __kmp_topology->print(env_var);
}
}
- __kmp_affinity_type = affinity_none;
- __kmp_create_affinity_none_places();
-#if KMP_USE_HIER_SCHED
- __kmp_dispatch_set_hierarchy_values();
-#endif
- KMP_AFFINITY_DISABLE();
- return;
+ return false;
}
- // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and
- // initialize other data structures which depend on the topology
+ // Canonicalize, print (if requested), apply KMP_HW_SUBSET
__kmp_topology->canonicalize();
- if (__kmp_affinity_verbose)
- __kmp_topology->print("KMP_AFFINITY");
+ if (verbose)
+ __kmp_topology->print(env_var);
bool filtered = __kmp_topology->filter_hw_subset();
- if (filtered) {
-#if KMP_OS_WINDOWS
- // Copy filtered full mask if topology has single processor group
- if (__kmp_num_proc_groups <= 1)
-#endif
- __kmp_affin_origMask->copy(__kmp_affin_fullMask);
- }
- if (filtered && __kmp_affinity_verbose)
+ if (filtered && verbose)
__kmp_topology->print("KMP_HW_SUBSET");
- machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
- KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
+ return success;
+}
+
+static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
+ bool is_regular_affinity = (&affinity == &__kmp_affinity);
+ bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
+ const char *env_var = __kmp_get_affinity_env_var(affinity);
+
+ if (affinity.flags.initialized) {
+ KMP_ASSERT(__kmp_affin_fullMask != NULL);
+ return;
+ }
+
+ if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask))
+ __kmp_aux_affinity_initialize_masks(affinity);
+
+ if (is_regular_affinity && !__kmp_topology) {
+ bool success = __kmp_aux_affinity_initialize_topology(affinity);
+ if (success) {
+ KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
+ } else {
+ affinity.type = affinity_none;
+ KMP_AFFINITY_DISABLE();
+ }
+ }
+
// If KMP_AFFINITY=none, then only create the single "none" place
// which is the process's initial affinity mask or the number of
// hardware threads depending on respect,norespect
- if (__kmp_affinity_type == affinity_none) {
- __kmp_create_affinity_none_places();
+ if (affinity.type == affinity_none) {
+ __kmp_create_affinity_none_places(affinity);
#if KMP_USE_HIER_SCHED
__kmp_dispatch_set_hierarchy_values();
#endif
+ affinity.flags.initialized = TRUE;
return;
}
+
+ __kmp_topology->set_granularity(affinity);
int depth = __kmp_topology->get_depth();
// Create the table of masks, indexed by thread Id.
- unsigned maxIndex;
- unsigned numUnique;
- kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique);
- if (__kmp_affinity_gran_levels == 0) {
- KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
- }
-
- switch (__kmp_affinity_type) {
+ unsigned numUnique = 0;
+ int numAddrs = __kmp_topology->get_num_hw_threads();
+ // If OMP_PLACES=cores:<attribute> specified, then attempt
+ // to make OS Id mask table using those attributes
+ if (affinity.core_attr_gran.valid) {
+ __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) {
+ KMP_ASSERT(idx >= -1);
+ for (int i = idx + 1; i < numAddrs; ++i)
+ if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
+ return i;
+ return numAddrs;
+ });
+ if (!affinity.os_id_masks) {
+ const char *core_attribute;
+ if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF)
+ core_attribute = "core_efficiency";
+ else
+ core_attribute = "core_type";
+ KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var,
+ core_attribute,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true))
+ }
+ }
+ // If core attributes did not work, or none were specified,
+  // then make the OS Id mask table the typical incremental way,
+  // checking the validity of each id at the specified granularity level.
+ if (!affinity.os_id_masks) {
+ int gran = affinity.gran_levels;
+ int gran_level = depth - 1 - affinity.gran_levels;
+ if (gran >= 0 && gran_level >= 0 && gran_level < depth) {
+ __kmp_create_os_id_masks(
+ &numUnique, affinity, [depth, numAddrs, &affinity](int idx) {
+ KMP_ASSERT(idx >= -1);
+ int gran = affinity.gran_levels;
+ int gran_level = depth - 1 - affinity.gran_levels;
+ for (int i = idx + 1; i < numAddrs; ++i)
+ if ((gran >= depth) ||
+ (gran < depth && __kmp_topology->at(i).ids[gran_level] !=
+ kmp_hw_thread_t::UNKNOWN_ID))
+ return i;
+ return numAddrs;
+ });
+ }
+ }
+  // Final attempt to make the OS Id mask table the typical incremental way.
+ if (!affinity.os_id_masks) {
+ __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
+ KMP_ASSERT(idx >= -1);
+ return idx + 1;
+ });
+ }
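
The three attempts above feed __kmp_create_os_id_masks with progressively more permissive find_next callables: attribute-filtered, then granularity-id-filtered, then a plain increment. A hedged sketch of that fallback pattern, with toy predicates standing in for the real filters:

// Sketch of the find_next fallback chain: try increasingly permissive
// "next interesting index" callables until one yields a usable result.
#include <cstdio>
#include <functional>
#include <vector>

int main() {
  static const int num_addrs = 8;
  std::vector<std::function<int(int)>> find_next_chain = {
      // 1) attribute filter that matches nothing, so it is skipped
      [](int idx) { (void)idx; return num_addrs; },
      // 2) even indices only (stand-in for "id known at granularity level")
      [](int idx) {
        int i = idx + 1;
        while (i < num_addrs && (i % 2))
          ++i;
        return i;
      },
      // 3) plain increment, always succeeds
      [](int idx) { return idx + 1; },
  };
  for (const auto &find_next : find_next_chain) {
    int first = find_next(-1);
    if (first >= num_addrs)
      continue; // this filter found nothing; fall back to the next one
    std::printf("selected indices:");
    for (int i = first; i < num_addrs; i = find_next(i))
      std::printf(" %d", i); // prints: 0 2 4 6
    std::printf("\n");
    break;
  }
  return 0;
}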
+
+ switch (affinity.type) {
case affinity_explicit:
- KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
- if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
- __kmp_affinity_process_proclist(
- &__kmp_affinity_masks, &__kmp_affinity_num_masks,
- __kmp_affinity_proclist, osId2Mask, maxIndex);
+ KMP_DEBUG_ASSERT(affinity.proclist != NULL);
+ if (is_hidden_helper_affinity ||
+ __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
+ __kmp_affinity_process_proclist(affinity);
} else {
- __kmp_affinity_process_placelist(
- &__kmp_affinity_masks, &__kmp_affinity_num_masks,
- __kmp_affinity_proclist, osId2Mask, maxIndex);
- }
- if (__kmp_affinity_num_masks == 0) {
- KMP_AFF_WARNING(AffNoValidProcID);
- __kmp_affinity_type = affinity_none;
- __kmp_create_affinity_none_places();
+ __kmp_affinity_process_placelist(affinity);
+ }
+ if (affinity.num_masks == 0) {
+ KMP_AFF_WARNING(affinity, AffNoValidProcID);
+ affinity.type = affinity_none;
+ __kmp_create_affinity_none_places(affinity);
+ affinity.flags.initialized = TRUE;
return;
}
break;
// The other affinity types rely on sorting the hardware threads according to
- // some permutation of the machine topology tree. Set __kmp_affinity_compact
- // and __kmp_affinity_offset appropriately, then jump to a common code
+ // some permutation of the machine topology tree. Set affinity.compact
+ // and affinity.offset appropriately, then jump to a common code
// fragment to do the sort and create the array of affinity masks.
case affinity_logical:
- __kmp_affinity_compact = 0;
- if (__kmp_affinity_offset) {
- __kmp_affinity_offset =
- __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
+ affinity.compact = 0;
+ if (affinity.offset) {
+ affinity.offset =
+ __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
}
goto sortTopology;
case affinity_physical:
if (__kmp_nThreadsPerCore > 1) {
- __kmp_affinity_compact = 1;
- if (__kmp_affinity_compact >= depth) {
- __kmp_affinity_compact = 0;
+ affinity.compact = 1;
+ if (affinity.compact >= depth) {
+ affinity.compact = 0;
}
} else {
- __kmp_affinity_compact = 0;
+ affinity.compact = 0;
}
- if (__kmp_affinity_offset) {
- __kmp_affinity_offset =
- __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
+ if (affinity.offset) {
+ affinity.offset =
+ __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
}
goto sortTopology;
case affinity_scatter:
- if (__kmp_affinity_compact >= depth) {
- __kmp_affinity_compact = 0;
+ if (affinity.compact >= depth) {
+ affinity.compact = 0;
} else {
- __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
+ affinity.compact = depth - 1 - affinity.compact;
}
goto sortTopology;
case affinity_compact:
- if (__kmp_affinity_compact >= depth) {
- __kmp_affinity_compact = depth - 1;
+ if (affinity.compact >= depth) {
+ affinity.compact = depth - 1;
}
goto sortTopology;
case affinity_balanced:
- if (depth <= 1) {
- KMP_AFF_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
- __kmp_affinity_type = affinity_none;
- __kmp_create_affinity_none_places();
+ if (depth <= 1 || is_hidden_helper_affinity) {
+ KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
+ affinity.type = affinity_none;
+ __kmp_create_affinity_none_places(affinity);
+ affinity.flags.initialized = TRUE;
return;
} else if (!__kmp_topology->is_uniform()) {
// Save the depth for further usage
@@ -4379,8 +5162,10 @@ static void __kmp_aux_affinity_initialize(void) {
int nproc = ncores * maxprocpercore;
if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
- KMP_AFF_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
- __kmp_affinity_type = affinity_none;
+ KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
+ affinity.type = affinity_none;
+ __kmp_create_affinity_none_places(affinity);
+ affinity.flags.initialized = TRUE;
return;
}
@@ -4405,48 +5190,57 @@ static void __kmp_aux_affinity_initialize(void) {
procarr[core * maxprocpercore + inlastcore] = proc;
}
}
- if (__kmp_affinity_compact >= depth) {
- __kmp_affinity_compact = depth - 1;
+ if (affinity.compact >= depth) {
+ affinity.compact = depth - 1;
}
sortTopology:
// Allocate the gtid->affinity mask table.
- if (__kmp_affinity_dups) {
- __kmp_affinity_num_masks = __kmp_avail_proc;
+ if (affinity.flags.dups) {
+ affinity.num_masks = __kmp_avail_proc;
} else {
- __kmp_affinity_num_masks = numUnique;
+ affinity.num_masks = numUnique;
}
if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
(__kmp_affinity_num_places > 0) &&
- ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
- __kmp_affinity_num_masks = __kmp_affinity_num_places;
+ ((unsigned)__kmp_affinity_num_places < affinity.num_masks) &&
+ !is_hidden_helper_affinity) {
+ affinity.num_masks = __kmp_affinity_num_places;
}
- KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
+ KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
// Sort the topology table according to the current setting of
- // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
- __kmp_topology->sort_compact();
+ // affinity.compact, then fill out affinity.masks.
+ __kmp_topology->sort_compact(affinity);
{
int i;
unsigned j;
int num_hw_threads = __kmp_topology->get_num_hw_threads();
+ kmp_full_mask_modifier_t full_mask;
for (i = 0, j = 0; i < num_hw_threads; i++) {
- if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) {
+ if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
continue;
}
int osId = __kmp_topology->at(i).os_id;
- kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
- kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
+ kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
+ if (KMP_CPU_ISEMPTY(src))
+ continue;
+ kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
KMP_ASSERT(KMP_CPU_ISSET(osId, src));
KMP_CPU_COPY(dest, src);
- if (++j >= __kmp_affinity_num_masks) {
+ full_mask.include(src);
+ if (++j >= affinity.num_masks) {
break;
}
}
- KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
+ KMP_DEBUG_ASSERT(j == affinity.num_masks);
+ // See if the places list further restricts or changes the full mask
+ if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
+ __kmp_topology->print(env_var);
+ }
}
// Sort the topology back using ids
__kmp_topology->sort_ids();
@@ -4455,56 +5249,64 @@ static void __kmp_aux_affinity_initialize(void) {
default:
KMP_ASSERT2(0, "Unexpected affinity setting");
}
-
- KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
+ __kmp_aux_affinity_initialize_other_data(affinity);
+ affinity.flags.initialized = TRUE;
}
-void __kmp_affinity_initialize(void) {
+void __kmp_affinity_initialize(kmp_affinity_t &affinity) {
// Much of the code above was written assuming that if a machine was not
- // affinity capable, then __kmp_affinity_type == affinity_none. We now
- // explicitly represent this as __kmp_affinity_type == affinity_disabled.
- // There are too many checks for __kmp_affinity_type == affinity_none
- // in this code. Instead of trying to change them all, check if
- // __kmp_affinity_type == affinity_disabled, and if so, slam it with
- // affinity_none, call the real initialization routine, then restore
- // __kmp_affinity_type to affinity_disabled.
- int disabled = (__kmp_affinity_type == affinity_disabled);
- if (!KMP_AFFINITY_CAPABLE()) {
+ // affinity capable, then affinity type == affinity_none.
+ // We now explicitly represent this as affinity type == affinity_disabled.
+ // There are too many checks for affinity type == affinity_none in this code.
+ // Instead of trying to change them all, check if
+ // affinity type == affinity_disabled, and if so, slam it with affinity_none,
+ // call the real initialization routine, then restore affinity type to
+ // affinity_disabled.
+ int disabled = (affinity.type == affinity_disabled);
+ if (!KMP_AFFINITY_CAPABLE())
KMP_ASSERT(disabled);
- }
- if (disabled) {
- __kmp_affinity_type = affinity_none;
- }
- __kmp_aux_affinity_initialize();
- if (disabled) {
- __kmp_affinity_type = affinity_disabled;
- }
+ if (disabled)
+ affinity.type = affinity_none;
+ __kmp_aux_affinity_initialize(affinity);
+ if (disabled)
+ affinity.type = affinity_disabled;
}
void __kmp_affinity_uninitialize(void) {
- if (__kmp_affinity_masks != NULL) {
- KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
- __kmp_affinity_masks = NULL;
- }
- if (__kmp_affin_fullMask != NULL) {
- KMP_CPU_FREE(__kmp_affin_fullMask);
- __kmp_affin_fullMask = NULL;
+ for (kmp_affinity_t *affinity : __kmp_affinities) {
+ if (affinity->masks != NULL)
+ KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
+ if (affinity->os_id_masks != NULL)
+ KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
+ if (affinity->proclist != NULL)
+ __kmp_free(affinity->proclist);
+ if (affinity->ids != NULL)
+ __kmp_free(affinity->ids);
+ if (affinity->attrs != NULL)
+ __kmp_free(affinity->attrs);
+ *affinity = KMP_AFFINITY_INIT(affinity->env_var);
}
if (__kmp_affin_origMask != NULL) {
+ if (KMP_AFFINITY_CAPABLE()) {
+#if KMP_OS_AIX
+ // Uninitialize by unbinding the thread.
+ bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
+#else
+ __kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
+#endif
+ }
KMP_CPU_FREE(__kmp_affin_origMask);
__kmp_affin_origMask = NULL;
}
- __kmp_affinity_num_masks = 0;
- __kmp_affinity_type = affinity_default;
__kmp_affinity_num_places = 0;
- if (__kmp_affinity_proclist != NULL) {
- __kmp_free(__kmp_affinity_proclist);
- __kmp_affinity_proclist = NULL;
- }
if (procarr != NULL) {
__kmp_free(procarr);
procarr = NULL;
}
+ if (__kmp_osid_to_hwthread_map) {
+ __kmp_free(__kmp_osid_to_hwthread_map);
+ __kmp_osid_to_hwthread_map = NULL;
+ }
#if KMP_USE_HWLOC
if (__kmp_hwloc_topology != NULL) {
hwloc_topology_destroy(__kmp_hwloc_topology);
@@ -4522,12 +5324,36 @@ void __kmp_affinity_uninitialize(void) {
KMPAffinity::destroy_api();
}
+static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
+ int *place, kmp_affin_mask_t **mask) {
+ int mask_idx;
+ bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
+ if (is_hidden_helper)
+    // The first gtid is the regular primary thread; the second gtid is the main
+ // thread of hidden team which does not participate in task execution.
+ mask_idx = gtid - 2;
+ else
+ mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
+ KMP_DEBUG_ASSERT(affinity->num_masks > 0);
+ *place = (mask_idx + affinity->offset) % affinity->num_masks;
+ *mask = KMP_CPU_INDEX(affinity->masks, *place);
+}
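
A small made-up example of the index math above: hidden helper gtids start at 2 in this model, so their mask index is gtid - 2, while regular workers use an adjusted gtid (the gtid - 3 below is only a stand-in for __kmp_adjust_gtid_for_hidden_helpers); either way the place wraps modulo the number of masks.

// Toy model of the mask-index math (all values are made up).
#include <cstdio>

int main() {
  const int num_masks = 4, offset = 1;
  for (int gtid = 2; gtid <= 7; ++gtid) {
    bool is_hidden_helper = (gtid >= 2 && gtid <= 4); // assumed helper range
    int mask_idx = is_hidden_helper
                       ? gtid - 2
                       : gtid - 3; // stand-in for the gtid adjustment
    int place = (mask_idx + offset) % num_masks;
    std::printf("gtid %d -> %s place %d\n", gtid,
                is_hidden_helper ? "hidden helper" : "worker", place);
  }
  return 0;
}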
+
+// This function initializes the per-thread data concerning affinity including
+// the mask and topology information
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
+
+ kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+
+ // Set the thread topology information to default of unknown
+ for (int id = 0; id < KMP_HW_LAST; ++id)
+ th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
+ th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
+
if (!KMP_AFFINITY_CAPABLE()) {
return;
}
- kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
if (th->th.th_affin_mask == NULL) {
KMP_CPU_ALLOC(th->th.th_affin_mask);
} else {
@@ -4535,16 +5361,24 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
}
// Copy the thread mask to the kmp_info_t structure. If
- // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
- // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
- // then the full mask is the same as the mask of the initialization thread.
+ // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e.
+ // one that has all of the OS proc ids set, or if
+ // __kmp_affinity.flags.respect is set, then the full mask is the
+ // same as the mask of the initialization thread.
kmp_affin_mask_t *mask;
int i;
+ const kmp_affinity_t *affinity;
+ bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
+
+ if (is_hidden_helper)
+ affinity = &__kmp_hh_affinity;
+ else
+ affinity = &__kmp_affinity;
- if (KMP_AFFINITY_NON_PROC_BIND) {
- if ((__kmp_affinity_type == affinity_none) ||
- (__kmp_affinity_type == affinity_balanced) ||
- KMP_HIDDEN_HELPER_THREAD(gtid)) {
+ if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
+ if ((affinity->type == affinity_none) ||
+ (affinity->type == affinity_balanced) ||
+ KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
#if KMP_GROUP_AFFINITY
if (__kmp_num_proc_groups > 1) {
return;
@@ -4554,14 +5388,10 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
i = 0;
mask = __kmp_affin_fullMask;
} else {
- int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
- KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
- i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
- mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
+ __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
}
} else {
- if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) ||
- (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
+ if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) {
#if KMP_GROUP_AFFINITY
if (__kmp_num_proc_groups > 1) {
return;
@@ -4571,85 +5401,94 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
i = KMP_PLACE_ALL;
mask = __kmp_affin_fullMask;
} else {
- // int i = some hash function or just a counter that doesn't
- // always start at 0. Use adjusted gtid for now.
- int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
- KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
- i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
- mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
+ __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
}
}
th->th.th_current_place = i;
- if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) {
+ if (isa_root && !is_hidden_helper) {
th->th.th_new_place = i;
th->th.th_first_place = 0;
- th->th.th_last_place = __kmp_affinity_num_masks - 1;
+ th->th.th_last_place = affinity->num_masks - 1;
} else if (KMP_AFFINITY_NON_PROC_BIND) {
// When using a Non-OMP_PROC_BIND affinity method,
// set all threads' place-partition-var to the entire place list
th->th.th_first_place = 0;
- th->th.th_last_place = __kmp_affinity_num_masks - 1;
+ th->th.th_last_place = affinity->num_masks - 1;
+ }
+ // Copy topology information associated with the place
+ if (i >= 0) {
+ th->th.th_topology_ids = __kmp_affinity.ids[i];
+ th->th.th_topology_attrs = __kmp_affinity.attrs[i];
}
if (i == KMP_PLACE_ALL) {
- KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
+ KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n",
gtid));
} else {
- KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
+ KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n",
gtid, i));
}
KMP_CPU_COPY(th->th.th_affin_mask, mask);
+}
- if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid)
- /* to avoid duplicate printing (will be correctly printed on barrier) */
- && (__kmp_affinity_type == affinity_none ||
- (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
- char buf[KMP_AFFIN_MASK_PRINT_LEN];
- __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
- th->th.th_affin_mask);
- KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
- __kmp_gettid(), gtid, buf);
+void __kmp_affinity_bind_init_mask(int gtid) {
+ if (!KMP_AFFINITY_CAPABLE()) {
+ return;
}
+ kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+ const kmp_affinity_t *affinity;
+ const char *env_var;
+ bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
-#if KMP_DEBUG
- // Hidden helper thread affinity only printed for debug builds
- if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) {
+ if (is_hidden_helper)
+ affinity = &__kmp_hh_affinity;
+ else
+ affinity = &__kmp_affinity;
+ env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true);
+ /* to avoid duplicate printing (will be correctly printed on barrier) */
+ if (affinity->flags.verbose && (affinity->type == affinity_none ||
+ (th->th.th_current_place != KMP_PLACE_ALL &&
+ affinity->type != affinity_balanced)) &&
+ !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
th->th.th_affin_mask);
- KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)",
- (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
+ KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
+ gtid, buf);
}
-#endif
#if KMP_OS_WINDOWS
// On Windows* OS, the process affinity mask might have changed. If the user
// didn't request affinity and this call fails, just continue silently.
// See CQ171393.
- if (__kmp_affinity_type == affinity_none) {
+ if (affinity->type == affinity_none) {
__kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
} else
#endif
+#ifndef KMP_OS_AIX
+ // Do not set the full mask as the init mask on AIX.
__kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
+#endif
}
-void __kmp_affinity_set_place(int gtid) {
- if (!KMP_AFFINITY_CAPABLE()) {
+void __kmp_affinity_bind_place(int gtid) {
+ // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
+ if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
return;
}
kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
- KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
+ KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current "
"place = %d)\n",
gtid, th->th.th_new_place, th->th.th_current_place));
// Check that the new place is within this thread's partition.
KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
KMP_ASSERT(th->th.th_new_place >= 0);
- KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
+ KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
if (th->th.th_first_place <= th->th.th_last_place) {
KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
(th->th.th_new_place <= th->th.th_last_place));
@@ -4661,11 +5500,11 @@ void __kmp_affinity_set_place(int gtid) {
// Copy the thread mask to the kmp_info_t structure,
// and set this thread's affinity.
kmp_affin_mask_t *mask =
- KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
+ KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
KMP_CPU_COPY(th->th.th_affin_mask, mask);
th->th.th_current_place = th->th.th_new_place;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
th->th.th_affin_mask);
@@ -4733,7 +5572,7 @@ int __kmp_aux_set_affinity(void **mask) {
th->th.th_current_place = KMP_PLACE_UNDEFINED;
th->th.th_new_place = KMP_PLACE_UNDEFINED;
th->th.th_first_place = 0;
- th->th.th_last_place = __kmp_affinity_num_masks - 1;
+ th->th.th_last_place = __kmp_affinity.num_masks - 1;
// Turn off 4.0 affinity for the current thread at this parallel level.
th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
@@ -4744,7 +5583,7 @@ int __kmp_aux_set_affinity(void **mask) {
int __kmp_aux_get_affinity(void **mask) {
int gtid;
int retval;
-#if KMP_OS_WINDOWS || KMP_DEBUG
+#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
kmp_info_t *th;
#endif
if (!KMP_AFFINITY_CAPABLE()) {
@@ -4752,7 +5591,7 @@ int __kmp_aux_get_affinity(void **mask) {
}
gtid = __kmp_entry_gtid();
-#if KMP_OS_WINDOWS || KMP_DEBUG
+#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
th = __kmp_threads[gtid];
#else
(void)gtid; // unused variable
@@ -4775,7 +5614,7 @@ int __kmp_aux_get_affinity(void **mask) {
}
}
-#if !KMP_OS_WINDOWS
+#if !KMP_OS_WINDOWS && !KMP_OS_AIX
retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
KA_TRACE(
@@ -4795,7 +5634,7 @@ int __kmp_aux_get_affinity(void **mask) {
KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
return 0;
-#endif /* KMP_OS_WINDOWS */
+#endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */
}
int __kmp_aux_get_affinity_max_proc() {
@@ -4908,17 +5747,40 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+// Returns first os proc id with ATOM core
+int __kmp_get_first_osid_with_ecore(void) {
+ int low = 0;
+ int high = __kmp_topology->get_num_hw_threads() - 1;
+ int mid = 0;
+ while (high - low > 1) {
+ mid = (high + low) / 2;
+ if (__kmp_topology->at(mid).attrs.get_core_type() ==
+ KMP_HW_CORE_TYPE_CORE) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
+ return mid;
+ }
+ return -1;
+}
+#endif
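
The routine above assumes the hardware-thread table is sorted so that all performance (CORE) entries precede the efficiency (ATOM) entries, and binary-searches for the boundary. The same idea expressed as a standalone sketch with std::partition_point over a toy array; the runtime's own search and its -1 "not found" convention differ in detail.

// Sketch: find the first efficiency-core (ATOM) entry in a table that is
// partitioned with all performance (CORE) entries first.
#include <algorithm>
#include <cstdio>
#include <vector>

enum CoreType { CORE, ATOM };

int main() {
  std::vector<CoreType> types = {CORE, CORE, CORE, CORE, ATOM, ATOM, ATOM, ATOM};
  auto it = std::partition_point(types.begin(), types.end(),
                                 [](CoreType t) { return t == CORE; });
  if (it == types.end())
    std::printf("no efficiency core found\n");
  else
    std::printf("first ATOM index: %d\n", (int)(it - types.begin())); // 4
  return 0;
}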
+
// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_DEBUG_ASSERT(th);
bool fine_gran = true;
int tid = th->th.th_info.ds.ds_tid;
+ const char *env_var = "KMP_AFFINITY";
// Do not perform balanced affinity for the hidden helper threads
if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
return;
- switch (__kmp_affinity_gran) {
+ switch (__kmp_affinity.gran) {
case KMP_HW_THREAD:
break;
case KMP_HW_CORE:
@@ -4976,12 +5838,13 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_CPU_SET(osID, mask);
}
}
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
- KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
- __kmp_gettid(), tid, buf);
+ KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
+ tid, buf);
}
+ __kmp_affinity_get_thread_topology_info(th);
__kmp_set_system_affinity(mask, TRUE);
} else { // Non-uniform topology
@@ -5142,17 +6005,19 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
__kmp_free(newarr);
}
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
- KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
- __kmp_gettid(), tid, buf);
+ KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
+ tid, buf);
}
+ __kmp_affinity_get_thread_topology_info(th);
__kmp_set_system_affinity(mask, TRUE);
}
}
-#if KMP_OS_LINUX || KMP_OS_FREEBSD
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX
// We don't need this entry for Windows because
// there is GetProcessAffinityMask() api
//
@@ -5187,7 +6052,11 @@ extern "C"
"set full mask for thread %d\n",
gtid));
KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
+#if KMP_OS_AIX
+ return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
+#else
return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
+#endif
}
#endif
diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.h b/contrib/libs/cxxsupp/openmp/kmp_affinity.h
index ce00362f04c..9ab2c0cc70d 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_affinity.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.h
@@ -29,11 +29,14 @@ public:
mask = hwloc_bitmap_alloc();
this->zero();
}
+ Mask(const Mask &other) = delete;
+ Mask &operator=(const Mask &other) = delete;
~Mask() { hwloc_bitmap_free(mask); }
void set(int i) override { hwloc_bitmap_set(mask, i); }
bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
void clear(int i) override { hwloc_bitmap_clr(mask, i); }
void zero() override { hwloc_bitmap_zero(mask); }
+ bool empty() const override { return hwloc_bitmap_iszero(mask); }
void copy(const KMPAffinity::Mask *src) override {
const Mask *convert = static_cast<const Mask *>(src);
hwloc_bitmap_copy(mask, convert->mask);
@@ -47,6 +50,10 @@ public:
hwloc_bitmap_or(mask, mask, convert->mask);
}
void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
+ bool is_equal(const KMPAffinity::Mask *rhs) const override {
+ const Mask *convert = static_cast<const Mask *>(rhs);
+ return hwloc_bitmap_isequal(mask, convert->mask);
+ }
int begin() const override { return hwloc_bitmap_first(mask); }
int end() const override { return -1; }
int next(int previous) const override {
@@ -62,7 +69,8 @@ public:
}
int error = errno;
if (abort_on_error) {
- __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
+ __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
+ KMP_ERR(error), __kmp_msg_null);
}
return error;
}
@@ -76,7 +84,8 @@ public:
}
int error = errno;
if (abort_on_error) {
- __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
+ __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
+ KMP_ERR(error), __kmp_msg_null);
}
return error;
}
@@ -95,7 +104,8 @@ public:
return 0;
error = errno;
if (abort_on_error)
- __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
+ __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
+ KMP_ERR(error), __kmp_msg_null);
}
return error;
}
@@ -128,13 +138,15 @@ public:
if (__kmp_hwloc_topology == NULL) {
if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
__kmp_hwloc_error = TRUE;
- if (__kmp_affinity_verbose)
+ if (__kmp_affinity.flags.verbose) {
KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
+ }
}
if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
__kmp_hwloc_error = TRUE;
- if (__kmp_affinity_verbose)
+ if (__kmp_affinity.flags.verbose) {
KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
+ }
}
}
topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
@@ -181,7 +193,8 @@ public:
};
#endif /* KMP_USE_HWLOC */
-#if KMP_OS_LINUX || KMP_OS_FREEBSD
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
in <asm/unistd.h> #included from <sys.syscall.h>. They must be the same on
@@ -254,11 +267,65 @@ public:
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_LOONGARCH64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 122
+#elif __NR_sched_setaffinity != 122
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 123
+#elif __NR_sched_getaffinity != 123
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_RISCV64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 122
+#elif __NR_sched_setaffinity != 122
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 123
+#elif __NR_sched_getaffinity != 123
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_VE
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 203
+#elif __NR_sched_setaffinity != 203
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 204
+#elif __NR_sched_getaffinity != 204
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_S390X
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 239
+#elif __NR_sched_setaffinity != 239
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 240
+#elif __NR_sched_getaffinity != 240
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
+#elif KMP_OS_NETBSD
+#include <pthread.h>
+#include <sched.h>
+#elif KMP_OS_AIX
+#include <sys/dr.h>
+#include <sys/rset.h>
+#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
+#define GET_NUMBER_SMT_SETS 0x0004
+extern "C" int syssmt(int flags, int, int, int *);
#endif
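
The per-architecture __NR_sched_setaffinity/__NR_sched_getaffinity numbers above exist because on Linux the runtime calls the scheduler affinity syscalls directly rather than through the glibc wrappers. A minimal sketch of that call pattern, assuming a Linux build; the 1024-byte buffer is illustrative, while the runtime probes the required size and stores it in __kmp_affin_mask_size:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main() {
  // The kernel rejects buffers smaller than its internal cpumask, which is
  // why the runtime probes increasing sizes at startup.
  unsigned char mask[1024];
  memset(mask, 0, sizeof(mask));
  long r = syscall(__NR_sched_getaffinity, 0 /* calling thread */,
                   sizeof(mask), mask);
  if (r < 0) {
    fprintf(stderr, "sched_getaffinity failed: errno=%d\n", errno);
    return 1;
  }
  // Unlike the glibc wrapper, the raw syscall returns the number of bytes it
  // wrote into the buffer (the kernel's mask size).
  printf("kernel affinity mask: %ld bytes\n", r);
  return 0;
}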
class KMPNativeAffinity : public KMPAffinity {
class Mask : public KMPAffinity::Mask {
@@ -291,6 +358,13 @@ class KMPNativeAffinity : public KMPAffinity {
for (mask_size_type i = 0; i < e; ++i)
mask[i] = (mask_t)0;
}
+ bool empty() const override {
+ mask_size_type e = get_num_mask_types();
+ for (mask_size_type i = 0; i < e; ++i)
+ if (mask[i] != (mask_t)0)
+ return false;
+ return true;
+ }
void copy(const KMPAffinity::Mask *src) override {
const Mask *convert = static_cast<const Mask *>(src);
mask_size_type e = get_num_mask_types();
@@ -314,6 +388,14 @@ class KMPNativeAffinity : public KMPAffinity {
for (mask_size_type i = 0; i < e; ++i)
mask[i] = ~(mask[i]);
}
+ bool is_equal(const KMPAffinity::Mask *rhs) const override {
+ const Mask *convert = static_cast<const Mask *>(rhs);
+ mask_size_type e = get_num_mask_types();
+ for (mask_size_type i = 0; i < e; ++i)
+ if (mask[i] != convert->mask[i])
+ return false;
+ return true;
+ }
int begin() const override {
int retval = 0;
while (retval < end() && !is_set(retval))
@@ -331,13 +413,77 @@ class KMPNativeAffinity : public KMPAffinity {
++retval;
return retval;
}
+#if KMP_OS_AIX
+ // On AIX, we don't have a way to get CPU(s) a thread is bound to.
+ // This routine is only used to get the full mask.
+ int get_system_affinity(bool abort_on_error) override {
+ KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+ "Illegal get affinity operation when not capable");
+
+ (void)abort_on_error;
+
+ // Set the mask with all CPUs that are available.
+ for (int i = 0; i < __kmp_xproc; ++i)
+ KMP_CPU_SET(i, this);
+ return 0;
+ }
+ int set_system_affinity(bool abort_on_error) const override {
+ KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+ "Illegal set affinity operation when not capable");
+
+ int location;
+ int gtid = __kmp_entry_gtid();
+ int tid = thread_self();
+
+ // Unbind the thread if it was bound to any processors before so that
+ // we can bind the thread to CPUs specified by the mask not others.
+ int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
+
+ // On AIX, we can only bind to one instead of a set of CPUs with the
+ // bindprocessor() system call.
+ KMP_CPU_SET_ITERATE(location, this) {
+ if (KMP_CPU_ISSET(location, this)) {
+ retval = bindprocessor(BINDTHREAD, tid, location);
+ if (retval == -1 && errno == 1) {
+ rsid_t rsid;
+ rsethandle_t rsh;
+ // Put something in rsh to prevent compiler warning
+ // about uninitialized use
+ rsh = rs_alloc(RS_EMPTY);
+ rsid.at_pid = getpid();
+ if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
+ retval = ra_detachrset(R_PROCESS, rsid, 0);
+ retval = bindprocessor(BINDTHREAD, tid, location);
+ }
+ }
+ if (retval == 0) {
+ KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
+ "T#%d to cpu=%d.\n",
+ gtid, location));
+ continue;
+ }
+ int error = errno;
+ if (abort_on_error) {
+ __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
+ KMP_ERR(error), __kmp_msg_null);
+ KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
+ "T#%d to cpu=%d, errno=%d.\n",
+ gtid, location, error));
+ return error;
+ }
+ }
+ }
+ return 0;
+ }
+#else // !KMP_OS_AIX
int get_system_affinity(bool abort_on_error) override {
KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
"Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
long retval =
syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
reinterpret_cast<cpuset_t *>(mask));
int retval = (r == 0 ? 0 : -1);
@@ -347,7 +493,8 @@ class KMPNativeAffinity : public KMPAffinity {
}
int error = errno;
if (abort_on_error) {
- __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
+ __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
+ KMP_ERR(error), __kmp_msg_null);
}
return error;
}
@@ -357,7 +504,7 @@ class KMPNativeAffinity : public KMPAffinity {
#if KMP_OS_LINUX
long retval =
syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
reinterpret_cast<cpuset_t *>(mask));
int retval = (r == 0 ? 0 : -1);
@@ -367,10 +514,12 @@ class KMPNativeAffinity : public KMPAffinity {
}
int error = errno;
if (abort_on_error) {
- __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
+ __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
+ KMP_ERR(error), __kmp_msg_null);
}
return error;
}
+#endif // KMP_OS_AIX
};
void determine_capable(const char *env_var) override {
__kmp_affinity_determine_capable(env_var);
@@ -399,7 +548,8 @@ class KMPNativeAffinity : public KMPAffinity {
}
api_type get_api_type() const override { return NATIVE_OS; }
};
-#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
+#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
+ || KMP_OS_AIX */
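
On the newly covered FreeBSD, NetBSD and DragonFly paths the mask is applied with pthread_setaffinity_np()/pthread_getaffinity_np() instead of a raw syscall. A minimal sketch of that call pattern, written against the Linux cpu_set_t flavor of the API for brevity (the BSDs use cpuset_t from <pthread_np.h>, exactly as the #elif branches above select):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

int main() {
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(0, &set); // bind the calling thread to logical CPU 0
  int rc = pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
  if (rc != 0) { // returns an errno value directly rather than setting errno
    fprintf(stderr, "pthread_setaffinity_np: %d\n", rc);
    return 1;
  }
  return 0;
}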
#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
@@ -429,6 +579,12 @@ class KMPNativeAffinity : public KMPAffinity {
for (int i = 0; i < __kmp_num_proc_groups; ++i)
mask[i] = 0;
}
+ bool empty() const override {
+ for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+ if (mask[i])
+ return false;
+ return true;
+ }
void copy(const KMPAffinity::Mask *src) override {
const Mask *convert = static_cast<const Mask *>(src);
for (int i = 0; i < __kmp_num_proc_groups; ++i)
@@ -448,6 +604,13 @@ class KMPNativeAffinity : public KMPAffinity {
for (int i = 0; i < __kmp_num_proc_groups; ++i)
mask[i] = ~(mask[i]);
}
+ bool is_equal(const KMPAffinity::Mask *rhs) const override {
+ const Mask *convert = static_cast<const Mask *>(rhs);
+ for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+ if (mask[i] != convert->mask[i])
+ return false;
+ return true;
+ }
int begin() const override {
int retval = 0;
while (retval < end() && !is_set(retval))
@@ -649,6 +812,21 @@ struct kmp_hw_attr_t {
}
return false;
}
+#if KMP_AFFINITY_SUPPORTED
+ bool contains(const kmp_affinity_attrs_t &attr) const {
+ if (!valid && !attr.valid)
+ return true;
+ if (valid && attr.valid) {
+ if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
+ return (is_core_type_valid() &&
+ (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
+ if (attr.core_eff != UNKNOWN_CORE_EFF)
+ return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
+ return true;
+ }
+ return false;
+ }
+#endif // KMP_AFFINITY_SUPPORTED
bool operator==(const kmp_hw_attr_t &rhs) const {
return (rhs.valid == valid && rhs.core_eff == core_eff &&
rhs.core_type == core_type);
@@ -656,15 +834,21 @@ struct kmp_hw_attr_t {
bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
+#if KMP_AFFINITY_SUPPORTED
+KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
+#endif
+
class kmp_hw_thread_t {
public:
static const int UNKNOWN_ID = -1;
+ static const int MULTIPLE_ID = -2;
static int compare_ids(const void *a, const void *b);
static int compare_compact(const void *a, const void *b);
int ids[KMP_HW_LAST];
int sub_ids[KMP_HW_LAST];
bool leader;
int os_id;
+ int original_idx;
kmp_hw_attr_t attrs;
void print() const;
@@ -721,8 +905,8 @@ class kmp_topology_t {
// Flags describing the topology
flags_t flags;
- // Insert a new topology layer after allocation
- void _insert_layer(kmp_hw_t type, const int *ids);
+ // Compact value used during sort_compact()
+ int compact;
#if KMP_GROUP_AFFINITY
// Insert topology information about Windows Processor groups
@@ -783,6 +967,10 @@ public:
qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
kmp_hw_thread_t::compare_ids);
}
+
+ // Insert a new topology layer after allocation
+ void insert_layer(kmp_hw_t type, const int *ids);
+
// Check if the hardware ids are unique, if they are
// return true, otherwise return false
bool check_ids() const;
@@ -791,13 +979,23 @@ public:
void canonicalize();
void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
- // Functions used after canonicalize() called
+// Functions used after canonicalize() called
+
+#if KMP_AFFINITY_SUPPORTED
+ // Set the granularity for affinity settings
+ void set_granularity(kmp_affinity_t &stgs) const;
+ bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
+ bool restrict_to_mask(const kmp_affin_mask_t *mask);
bool filter_hw_subset();
- bool is_close(int hwt1, int hwt2, int level) const;
+#endif
bool is_uniform() const { return flags.uniform; }
// Tell whether a type is a valid type in the topology
// returns KMP_HW_UNKNOWN when there is no equivalent type
- kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
+ kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
+ if (type == KMP_HW_UNKNOWN)
+ return KMP_HW_UNKNOWN;
+ return equivalent[type];
+ }
// Set type1 = type2
void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
@@ -858,7 +1056,9 @@ public:
}
#if KMP_AFFINITY_SUPPORTED
- void sort_compact() {
+ friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
+ void sort_compact(kmp_affinity_t &affinity) {
+ compact = affinity.compact;
qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
kmp_hw_thread_t::compare_compact);
}
@@ -978,6 +1178,50 @@ public:
qsort(items, depth, sizeof(item_t), hw_subset_compare);
}
bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
+
+ // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
+ // This means putting each of {sockets, cores, threads} in the topology if
+ // they are not specified:
+ // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
+ // e.g., 3module => *s,3module,*c,*t
+ // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
+ // are expecting the traditional sockets/cores/threads topology. For newer
+ // hardware, there can be intervening layers like dies/tiles/modules
+ // (usually corresponding to a cache level). So when a user asks for
+ // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
+ // should get 12 hardware threads across 6 cores and effectively ignore the
+ // module layer.
+ void canonicalize(const kmp_topology_t *top) {
+ // Layers to target for KMP_HW_SUBSET canonicalization
+ kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
+
+ // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
+ if (is_absolute())
+ return;
+
+ // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
+ // topology doesn't have these layers
+ for (kmp_hw_t type : targeted)
+ if (top->get_level(type) == KMP_HW_UNKNOWN)
+ return;
+
+ // Put targeted layers in topology if they do not exist
+ for (kmp_hw_t type : targeted) {
+ bool found = false;
+ for (int i = 0; i < get_depth(); ++i) {
+ if (top->get_equivalent_type(items[i].type) == type) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
+ }
+ }
+ sort();
+ // Set as an absolute topology that only targets the targeted layers
+ set_absolute();
+ }
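+
+ The transformation described in the comment block above can be illustrated outside the runtime. A rough sketch under the same assumptions, operating on textual layer tokens rather than the real kmp_hw_subset_t (canonicalize_subset and the token representation are hypothetical):
+
+ #include <string>
+ #include <vector>
+
+ // Append "*<layer>" for any of socket/core/thread the user did not mention,
+ // e.g. {"2c","1t"} -> {"2c","1t","*s"}; the runtime then sorts the layers
+ // from outermost to innermost, yielding *s,2c,1t.
+ std::vector<std::string> canonicalize_subset(std::vector<std::string> items) {
+   const char targeted[] = {'s', 'c', 't'};
+   for (char layer : targeted) {
+     bool found = false;
+     for (const std::string &it : items)
+       if (!it.empty() && it.back() == layer)
+         found = true;
+     if (!found)
+       items.push_back(std::string("*") + layer); // USE_ALL for that layer
+   }
+   return items;
+ }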
void dump() const {
printf("**********************\n");
printf("*** kmp_hw_subset: ***\n");
@@ -1029,7 +1273,7 @@ public:
leaf. It corresponds to the number of entries in numPerLevel if we exclude
all but one trailing 1. */
kmp_uint32 depth;
- kmp_uint32 base_num_threads;
+ kmp_uint32 base_num_threads = 0;
enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
// 2=initialization in progress
@@ -1039,8 +1283,8 @@ public:
the parent of a node at level i has. For example, if we have a machine
with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
{2, 4, 4, 1, 1}. All empty levels are set to 1. */
- kmp_uint32 *numPerLevel;
- kmp_uint32 *skipPerLevel;
+ kmp_uint32 *numPerLevel = nullptr;
+ kmp_uint32 *skipPerLevel = nullptr;
void deriveLevels() {
int hier_depth = __kmp_topology->get_depth();
diff --git a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp
index e9aaedc538c..00a4f1ef956 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp
@@ -1257,7 +1257,11 @@ static void **mk_dax_kmem_preferred;
static void *(*kmp_target_alloc_host)(size_t size, int device);
static void *(*kmp_target_alloc_shared)(size_t size, int device);
static void *(*kmp_target_alloc_device)(size_t size, int device);
-static void *(*kmp_target_free)(void *ptr, int device);
+static void *(*kmp_target_lock_mem)(void *ptr, size_t size, int device);
+static void *(*kmp_target_unlock_mem)(void *ptr, int device);
+static void *(*kmp_target_free_host)(void *ptr, int device);
+static void *(*kmp_target_free_shared)(void *ptr, int device);
+static void *(*kmp_target_free_device)(void *ptr, int device);
static bool __kmp_target_mem_available;
#define KMP_IS_TARGET_MEM_SPACE(MS) \
(MS == llvm_omp_target_host_mem_space || \
@@ -1370,10 +1374,18 @@ void __kmp_init_target_mem() {
KMP_DLSYM("llvm_omp_target_alloc_shared");
*(void **)(&kmp_target_alloc_device) =
KMP_DLSYM("llvm_omp_target_alloc_device");
- *(void **)(&kmp_target_free) = KMP_DLSYM("omp_target_free");
- __kmp_target_mem_available = kmp_target_alloc_host &&
- kmp_target_alloc_shared &&
- kmp_target_alloc_device && kmp_target_free;
+ *(void **)(&kmp_target_free_host) = KMP_DLSYM("llvm_omp_target_free_host");
+ *(void **)(&kmp_target_free_shared) =
+ KMP_DLSYM("llvm_omp_target_free_shared");
+ *(void **)(&kmp_target_free_device) =
+ KMP_DLSYM("llvm_omp_target_free_device");
+ __kmp_target_mem_available =
+ kmp_target_alloc_host && kmp_target_alloc_shared &&
+ kmp_target_alloc_device && kmp_target_free_host &&
+ kmp_target_free_shared && kmp_target_free_device;
+ // lock/pin and unlock/unpin target calls
+ *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem");
+ *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem");
}
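
__kmp_init_target_mem() now resolves the split free and lock/unlock entry points individually and reports target memory as available only when every required symbol is present; the lock/unlock pair stays optional. A minimal sketch of that optional-symbol pattern using plain dlsym(), assuming KMP_DLSYM is essentially a dlsym(RTLD_DEFAULT, ...) lookup (the target_* names below are illustrative):

#include <cstddef>
#include <dlfcn.h>

typedef void *(*target_alloc_fn_t)(size_t size, int device);
typedef void *(*target_free_fn_t)(void *ptr, int device);

static target_alloc_fn_t target_alloc_shared;
static target_free_fn_t target_free_shared;
static bool target_mem_available;

static void init_target_mem_sketch() {
  // Look the entry points up in whatever offload runtime is already loaded.
  *(void **)(&target_alloc_shared) =
      dlsym(RTLD_DEFAULT, "llvm_omp_target_alloc_shared");
  *(void **)(&target_free_shared) =
      dlsym(RTLD_DEFAULT, "llvm_omp_target_free_shared");
  // Usable only when every required entry point resolved.
  target_mem_available = target_alloc_shared && target_free_shared;
}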
omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
@@ -1391,7 +1403,9 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
switch (traits[i].key) {
case omp_atk_sync_hint:
case omp_atk_access:
+ break;
case omp_atk_pinned:
+ al->pinned = true;
break;
case omp_atk_alignment:
__kmp_type_convert(traits[i].value, &(al->alignment));
@@ -1550,6 +1564,8 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
return NULL;
if (allocator == omp_null_allocator)
allocator = __kmp_threads[gtid]->th.th_def_allocator;
+ kmp_int32 default_device =
+ __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
al = RCAST(kmp_allocator_t *, allocator);
@@ -1565,6 +1581,46 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
align = algn; // max of allocator trait, parameter and sizeof(void*)
desc.size_orig = size;
desc.size_a = size + sz_desc + align;
+ bool is_pinned = false;
+ if (allocator > kmp_max_mem_alloc)
+ is_pinned = al->pinned;
+
+ // Use default allocator if libmemkind is not available
+ int use_default_allocator = (__kmp_memkind_available) ? false : true;
+
+ if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
+ // Use size input directly as the memory may not be accessible on host.
+ // Use default device for now.
+ if (__kmp_target_mem_available) {
+ kmp_int32 device =
+ __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
+ if (allocator == llvm_omp_target_host_mem_alloc)
+ ptr = kmp_target_alloc_host(size, device);
+ else if (allocator == llvm_omp_target_shared_mem_alloc)
+ ptr = kmp_target_alloc_shared(size, device);
+ else // allocator == llvm_omp_target_device_mem_alloc
+ ptr = kmp_target_alloc_device(size, device);
+ return ptr;
+ } else {
+ KMP_INFORM(TargetMemNotAvailable);
+ }
+ }
+
+ if (allocator >= kmp_max_mem_alloc && KMP_IS_TARGET_MEM_SPACE(al->memspace)) {
+ if (__kmp_target_mem_available) {
+ kmp_int32 device =
+ __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
+ if (al->memspace == llvm_omp_target_host_mem_space)
+ ptr = kmp_target_alloc_host(size, device);
+ else if (al->memspace == llvm_omp_target_shared_mem_space)
+ ptr = kmp_target_alloc_shared(size, device);
+ else // al->memspace == llvm_omp_target_device_mem_space
+ ptr = kmp_target_alloc_device(size, device);
+ return ptr;
+ } else {
+ KMP_INFORM(TargetMemNotAvailable);
+ }
+ }
if (__kmp_memkind_available) {
if (allocator < kmp_max_mem_alloc) {
@@ -1591,7 +1647,10 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
} else if (al->fb == omp_atv_allocator_fb) {
KMP_ASSERT(al != al->fb_data);
al = al->fb_data;
- return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
+ ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
+ if (is_pinned && kmp_target_lock_mem)
+ kmp_target_lock_mem(ptr, size, default_device);
+ return ptr;
} // else ptr == NULL;
} else {
// pool has enough space
@@ -1605,7 +1664,10 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
} else if (al->fb == omp_atv_allocator_fb) {
KMP_ASSERT(al != al->fb_data);
al = al->fb_data;
- return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
+ ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
+ if (is_pinned && kmp_target_lock_mem)
+ kmp_target_lock_mem(ptr, size, default_device);
+ return ptr;
}
}
}
@@ -1621,47 +1683,36 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
} else if (al->fb == omp_atv_allocator_fb) {
KMP_ASSERT(al != al->fb_data);
al = al->fb_data;
- return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
+ ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
+ if (is_pinned && kmp_target_lock_mem)
+ kmp_target_lock_mem(ptr, size, default_device);
+ return ptr;
}
}
}
} else if (allocator < kmp_max_mem_alloc) {
- if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
- // Use size input directly as the memory may not be accessible on host.
- // Use default device for now.
- if (__kmp_target_mem_available) {
- kmp_int32 device =
- __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
- if (allocator == llvm_omp_target_host_mem_alloc)
- ptr = kmp_target_alloc_host(size, device);
- else if (allocator == llvm_omp_target_shared_mem_alloc)
- ptr = kmp_target_alloc_shared(size, device);
- else // allocator == llvm_omp_target_device_mem_alloc
- ptr = kmp_target_alloc_device(size, device);
- }
- return ptr;
- }
-
// pre-defined allocator
if (allocator == omp_high_bw_mem_alloc) {
- // ptr = NULL;
+ KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc");
} else if (allocator == omp_large_cap_mem_alloc) {
- // warnings?
- } else {
- ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
+ KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc");
+ } else if (allocator == omp_const_mem_alloc) {
+ KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc");
+ } else if (allocator == omp_low_lat_mem_alloc) {
+ KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc");
+ } else if (allocator == omp_cgroup_mem_alloc) {
+ KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc");
+ } else if (allocator == omp_pteam_mem_alloc) {
+ KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc");
+ } else if (allocator == omp_thread_mem_alloc) {
+ KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc");
+ } else { // default allocator requested
+ use_default_allocator = true;
}
- } else if (KMP_IS_TARGET_MEM_SPACE(al->memspace)) {
- if (__kmp_target_mem_available) {
- kmp_int32 device =
- __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
- if (al->memspace == llvm_omp_target_host_mem_space)
- ptr = kmp_target_alloc_host(size, device);
- else if (al->memspace == llvm_omp_target_shared_mem_space)
- ptr = kmp_target_alloc_shared(size, device);
- else // al->memspace == llvm_omp_target_device_mem_space
- ptr = kmp_target_alloc_device(size, device);
+ if (use_default_allocator) {
+ ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
+ use_default_allocator = false;
}
- return ptr;
} else if (al->pool_size > 0) {
// custom allocator with pool size requested
kmp_uint64 used =
@@ -1677,7 +1728,10 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
} else if (al->fb == omp_atv_allocator_fb) {
KMP_ASSERT(al != al->fb_data);
al = al->fb_data;
- return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
+ ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
+ if (is_pinned && kmp_target_lock_mem)
+ kmp_target_lock_mem(ptr, size, default_device);
+ return ptr;
} // else ptr == NULL;
} else {
// pool has enough space
@@ -1697,6 +1751,9 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
if (ptr == NULL)
return NULL;
+ if (is_pinned && kmp_target_lock_mem)
+ kmp_target_lock_mem(ptr, desc.size_a, default_device);
+
addr = (kmp_uintptr_t)ptr;
addr_align = (addr + sz_desc + align - 1) & ~(align - 1);
addr_descr = addr_align - sz_desc;
@@ -1786,13 +1843,18 @@ void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
kmp_mem_desc_t desc;
kmp_uintptr_t addr_align; // address to return to caller
kmp_uintptr_t addr_descr; // address of memory block descriptor
- if (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
- (allocator > kmp_max_mem_alloc &&
- KMP_IS_TARGET_MEM_SPACE(al->memspace))) {
- KMP_DEBUG_ASSERT(kmp_target_free);
+ if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
+ (allocator > kmp_max_mem_alloc &&
+ KMP_IS_TARGET_MEM_SPACE(al->memspace)))) {
kmp_int32 device =
__kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
- kmp_target_free(ptr, device);
+ if (allocator == llvm_omp_target_host_mem_alloc) {
+ kmp_target_free_host(ptr, device);
+ } else if (allocator == llvm_omp_target_shared_mem_alloc) {
+ kmp_target_free_shared(ptr, device);
+ } else if (allocator == llvm_omp_target_device_mem_alloc) {
+ kmp_target_free_device(ptr, device);
+ }
return;
}
@@ -1808,6 +1870,12 @@ void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
oal = (omp_allocator_handle_t)al; // cast to void* for comparisons
KMP_DEBUG_ASSERT(al);
+ if (allocator > kmp_max_mem_alloc && kmp_target_unlock_mem && al->pinned) {
+ kmp_int32 device =
+ __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
+ kmp_target_unlock_mem(desc.ptr_alloc, device);
+ }
+
if (__kmp_memkind_available) {
if (oal < kmp_max_mem_alloc) {
// pre-defined allocator
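
The allocator changes in this file (the omp_atk_pinned trait is now recorded, target memory goes through per-space alloc/free entry points, and pinned blocks are locked and unlocked around the underlying allocation) back the standard OpenMP allocator API. A small user-level example that exercises the pinned and fallback traits; it assumes a libomp built from this revision, but every name used is standard OpenMP 5.x:

#include <omp.h>
#include <stdio.h>

int main() {
  // Request pinned memory, falling back to the default allocator on failure.
  omp_alloctrait_t traits[] = {
      {omp_atk_pinned, omp_atv_true},
      {omp_atk_fallback, omp_atv_allocator_fb},
      {omp_atk_fb_data, (omp_uintptr_t)omp_default_mem_alloc}};
  omp_allocator_handle_t al =
      omp_init_allocator(omp_default_mem_space, 3, traits);

  double *buf = (double *)omp_alloc(1024 * sizeof(double), al);
  if (buf) {
    buf[0] = 42.0;
    printf("first element: %g\n", buf[0]);
    omp_free(buf, al);
  }
  omp_destroy_allocator(al);
  return 0;
}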
diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp
index 21c2c60bfb6..261e9f1beee 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp
@@ -832,7 +832,7 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs,
// end of the first part of the workaround for C78287
#endif // USE_CMPXCHG_FIX
-#if KMP_OS_WINDOWS && KMP_ARCH_AARCH64
+#if KMP_OS_WINDOWS && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
// Undo explicit type casts to get MSVC ARM64 to build. Uses
// OP_CMPXCHG_WORKAROUND definition for OP_CMPXCHG
#undef OP_CMPXCHG
@@ -863,7 +863,7 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs,
(*lhs) = (*lhs)OP rhs; \
__kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
-#endif // KMP_OS_WINDOWS && KMP_ARCH_AARCH64
+#endif // KMP_OS_WINDOWS && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
@@ -1914,8 +1914,7 @@ ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, mul, 64, *, cmplx8, kmp_cmplx64, 8c,
ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, div, 64, /, cmplx8, kmp_cmplx64, 8c,
7, KMP_ARCH_X86) // __kmpc_atomic_cmplx4_div_cmplx8
-// READ, WRITE, CAPTURE are supported only on IA-32 architecture and Intel(R) 64
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// READ, WRITE, CAPTURE
// ------------------------------------------------------------------------
// Atomic READ routines
@@ -2925,6 +2924,7 @@ ATOMIC_CRITICAL_CPT(cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c,
// binop x; v = x; } for non-commutative operations.
// Supported only on IA-32 architecture and Intel(R) 64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// -------------------------------------------------------------------------
// Operation on *lhs, rhs bound by critical section
// OP - operator (it's supposed to contain an assignment)
diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.h b/contrib/libs/cxxsupp/openmp/kmp_atomic.h
index 19c02e9d25c..4fc51ee4289 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_atomic.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.h
@@ -1005,8 +1005,7 @@ void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs,
void __kmpc_atomic_32(ident_t *id_ref, int gtid, void *lhs, void *rhs,
void (*f)(void *, void *, void *));
-// READ, WRITE, CAPTURE are supported only on IA-32 architecture and Intel(R) 64
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// READ, WRITE, CAPTURE
// Below routines for atomic READ are listed
char __kmpc_atomic_fixed1_rd(ident_t *id_ref, int gtid, char *loc);
@@ -1337,7 +1336,6 @@ void __kmpc_atomic_cmplx4_mul_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
kmp_cmplx32 rhs, kmp_cmplx32 *out, int flag);
void __kmpc_atomic_cmplx4_div_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
kmp_cmplx32 rhs, kmp_cmplx32 *out, int flag);
-
kmp_cmplx64 __kmpc_atomic_cmplx8_add_cpt(ident_t *id_ref, int gtid,
kmp_cmplx64 *lhs, kmp_cmplx64 rhs,
int flag);
@@ -1419,7 +1417,7 @@ void __kmpc_atomic_end(void);
// OpenMP 4.0: v = x = expr binop x; { v = x; x = expr binop x; } { x = expr
// binop x; v = x; } for non-commutative operations.
-
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
char __kmpc_atomic_fixed1_sub_cpt_rev(ident_t *id_ref, int gtid, char *lhs,
char rhs, int flag);
char __kmpc_atomic_fixed1_div_cpt_rev(ident_t *id_ref, int gtid, char *lhs,
diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp
index 1a718b45fff..d7ef57c6081 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp
@@ -444,7 +444,8 @@ static void __kmp_dist_barrier_release(
next_go = my_current_iter + distributedBarrier::MAX_ITERS;
my_go_index = tid / b->threads_per_go;
if (this_thr->th.th_used_in_team.load() == 3) {
- KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
+ (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
+ 1);
}
// Check if go flag is set
if (b->go[my_go_index].go.load() != next_go) {
@@ -1805,7 +1806,25 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
// It is OK to report the barrier state after the barrier begin callback.
// According to the OMPT specification, a compliant implementation may
// even delay reporting this state until the barrier begins to wait.
- this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+ auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
+ switch (barrier_kind) {
+ case ompt_sync_region_barrier_explicit:
+ ompt_thr_info->state = ompt_state_wait_barrier_explicit;
+ break;
+ case ompt_sync_region_barrier_implicit_workshare:
+ ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
+ break;
+ case ompt_sync_region_barrier_implicit_parallel:
+ ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
+ break;
+ case ompt_sync_region_barrier_teams:
+ ompt_thr_info->state = ompt_state_wait_barrier_teams;
+ break;
+ case ompt_sync_region_barrier_implementation:
+ [[fallthrough]];
+ default:
+ ompt_thr_info->state = ompt_state_wait_barrier_implementation;
+ }
}
#endif
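
The barrier path now reports a distinct wait state per barrier kind instead of the older generic ompt_state_wait_barrier value. An OMPT tool can tell them apart with a small mapping like the sketch below; the enumerators are the ones used in the hunk above and come from omp-tools.h:

#include <omp-tools.h>

static const char *barrier_state_name(ompt_state_t s) {
  switch (s) {
  case ompt_state_wait_barrier_explicit:
    return "explicit barrier";
  case ompt_state_wait_barrier_implicit_workshare:
    return "implicit barrier (workshare)";
  case ompt_state_wait_barrier_implicit_parallel:
    return "implicit barrier (parallel)";
  case ompt_state_wait_barrier_teams:
    return "teams barrier";
  case ompt_state_wait_barrier_implementation:
    return "implementation barrier";
  default:
    return "other state";
  }
}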
@@ -1858,8 +1877,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
}
if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
- // use 0 to only setup the current team if nthreads > 1
- __kmp_task_team_setup(this_thr, team, 0);
+ __kmp_task_team_setup(this_thr, team);
if (cancellable) {
cancelled = __kmp_linear_barrier_gather_cancellable(
@@ -2042,7 +2060,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
TRUE);
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
- __kmp_task_team_setup(this_thr, team, 0);
+ __kmp_task_team_setup(this_thr, team);
#if USE_ITT_BUILD
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
@@ -2214,20 +2232,24 @@ void __kmp_join_barrier(int gtid) {
codeptr = team->t.ompt_team_info.master_return_address;
my_task_data = OMPT_CUR_TASK_DATA(this_thr);
my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+ ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
+ ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
+ if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
+ sync_kind = ompt_sync_region_barrier_teams;
+ ompt_state = ompt_state_wait_barrier_teams;
+ }
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
- ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
- my_task_data, codeptr);
+ sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
}
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
- ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
- my_task_data, codeptr);
+ sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
}
if (!KMP_MASTER_TID(ds_tid))
this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
- this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
+ this_thr->th.ompt_thread_info.state = ompt_state;
}
#endif
@@ -2243,9 +2265,7 @@ void __kmp_join_barrier(int gtid) {
__kmp_gtid_from_thread(this_thr), team_id,
team->t.t_task_team[this_thr->th.th_task_state],
this_thr->th.th_task_team));
- if (this_thr->th.th_task_team)
- KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
- team->t.t_task_team[this_thr->th.th_task_state]);
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
}
#endif /* KMP_DEBUG */
@@ -2403,11 +2423,11 @@ void __kmp_fork_barrier(int gtid, int tid) {
#if USE_ITT_BUILD
void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
+#ifdef KMP_DEBUG
if (team)
-
- KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
- (team != NULL) ? team->t.t_id : -1, tid));
-
+ KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
+ (team != NULL) ? team->t.t_id : -1, tid));
+#endif
// th_team pointer only valid for primary thread here
if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
@@ -2440,10 +2460,8 @@ void __kmp_fork_barrier(int gtid, int tid) {
}
#endif
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- // 0 indicates setup current task team if nthreads > 1
- __kmp_task_team_setup(this_thr, team, 0);
- }
+ if (__kmp_tasking_mode != tskm_immediate_exec)
+ __kmp_task_team_setup(this_thr, team);
/* The primary thread may have changed its blocktime between join barrier
and fork barrier. Copy the blocktime info to the thread, where
@@ -2493,8 +2511,10 @@ void __kmp_fork_barrier(int gtid, int tid) {
}
#if OMPT_SUPPORT
+ ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
if (ompt_enabled.enabled &&
- this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
+ (ompt_state == ompt_state_wait_barrier_teams ||
+ ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
int ds_tid = this_thr->th.th_info.ds.ds_tid;
ompt_data_t *task_data = (team)
? OMPT_CUR_TASK_DATA(this_thr)
@@ -2506,15 +2526,16 @@ void __kmp_fork_barrier(int gtid, int tid) {
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
+ ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
+ if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
+ sync_kind = ompt_sync_region_barrier_teams;
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
#endif
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
@@ -2582,7 +2603,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
if (proc_bind == proc_bind_intel) {
// Call dynamic affinity settings
- if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
+ if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
__kmp_balanced_affinity(this_thr, team->t.t_nproc);
}
} else if (proc_bind != proc_bind_false) {
@@ -2591,7 +2612,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
__kmp_gtid_from_thread(this_thr),
this_thr->th.th_current_place));
} else {
- __kmp_affinity_set_place(gtid);
+ __kmp_affinity_bind_place(gtid);
}
}
#endif // KMP_AFFINITY_SUPPORTED
@@ -2599,7 +2620,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
if (__kmp_display_affinity) {
if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
- || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
+ || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
#endif
) {
// NULL means use the affinity-format-var ICV
diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.h b/contrib/libs/cxxsupp/openmp/kmp_barrier.h
index ac28a13217e..ae9b8d62f4c 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_barrier.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.h
@@ -21,7 +21,10 @@
#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
#elif KMP_HAVE_ALIGNED_ALLOC
-#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size)
+#define KMP_ALGIN_UP(val, alignment) \
+ (((val) + (alignment)-1) / (alignment) * (alignment))
+#define KMP_ALIGNED_ALLOCATE(size, alignment) \
+ aligned_alloc(alignment, KMP_ALGIN_UP(size, alignment))
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE_POSIX_MEMALIGN
static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
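
The rounding added above matters because C11/C17 aligned_alloc() is only guaranteed to succeed when the requested size is an integral multiple of the alignment. A worked example of the macro's arithmetic, with illustrative values:

// KMP_ALGIN_UP(val, alignment) = ((val + alignment - 1) / alignment) * alignment
// size = 100, alignment = 64:  (100 + 63) / 64 = 2,  2 * 64 = 128
// size = 128, alignment = 64:  (128 + 63) / 64 = 2,  2 * 64 = 128 (unchanged)
static_assert(((100 + 64 - 1) / 64) * 64 == 128, "100 rounds up to 128");
static_assert(((128 + 64 - 1) / 64) * 64 == 128, "aligned sizes are unchanged");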
diff --git a/contrib/libs/cxxsupp/openmp/kmp_collapse.cpp b/contrib/libs/cxxsupp/openmp/kmp_collapse.cpp
new file mode 100644
index 00000000000..f1bf04901dc
--- /dev/null
+++ b/contrib/libs/cxxsupp/openmp/kmp_collapse.cpp
@@ -0,0 +1,1781 @@
+/*
+ * kmp_collapse.cpp -- loop collapse feature
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+#include "kmp_collapse.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+// OMPTODO: different style of comments (see kmp_sched)
+// OMPTODO: OMPT/OMPD
+
+// avoid inadvertently using a library-based abs
+template <typename T> T __kmp_abs(const T val) {
+ return (val < 0) ? -val : val;
+}
+kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
+kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
+
+//----------------------------------------------------------------------------
+// Common functions for working with rectangular and non-rectangular loops
+//----------------------------------------------------------------------------
+
+template <typename T> int __kmp_sign(T val) {
+ return (T(0) < val) - (val < T(0));
+}
+
+template <typename T> class CollapseAllocator {
+ typedef T *pT;
+
+private:
+ static const size_t allocaSize = 32; // size limit for stack allocations
+ // (8 bytes x 4 nested loops)
+ char stackAlloc[allocaSize];
+ static constexpr size_t maxElemCount = allocaSize / sizeof(T);
+ pT pTAlloc;
+
+public:
+ CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
+ if (n > maxElemCount) {
+ pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
+ }
+ }
+ ~CollapseAllocator() {
+ if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
+ __kmp_free(pTAlloc);
+ }
+ }
+ T &operator[](int index) { return pTAlloc[index]; }
+ operator const pT() { return pTAlloc; }
+};
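+
+CollapseAllocator is a small-buffer optimization: up to 32 bytes of per-call scratch (four 64-bit induction variables) stay on the stack, and only deeper nests fall back to __kmp_allocate, with the destructor releasing the heap buffer. A usage sketch matching how the rest of this file uses it; it relies on the class above and the typedefs from kmp_collapse.h, and the function name is illustrative:
+
+// Scratch space for n per-loop iteration counters; heap-allocated only
+// when n exceeds the 4-element stack capacity.
+static void collapse_scratch_example(kmp_index_t n) {
+  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
+  for (kmp_index_t i = 0; i < n; ++i)
+    iterations[i] = 0;
+  // The object converts to a plain pointer wherever one is expected.
+}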
+
+//----------Loop canonicalization---------------------------------------------
+
+// For loop nest (any shape):
+// convert != to < or >;
+// switch from using < or > to <= or >=.
+// "bounds" array has to be allocated per thread.
+// All other internal functions will work only with canonicalized loops.
+template <typename T>
+void kmp_canonicalize_one_loop_XX(
+ ident_t *loc,
+ /*in/out*/ bounds_infoXX_template<T> *bounds) {
+
+ if (__kmp_env_consistency_check) {
+ if (bounds->step == 0) {
+ __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
+ loc);
+ }
+ }
+
+ if (bounds->comparison == comparison_t::comp_not_eq) {
+ // We can convert this to < or >, depends on the sign of the step:
+ if (bounds->step > 0) {
+ bounds->comparison = comparison_t::comp_less;
+ } else {
+ bounds->comparison = comparison_t::comp_greater;
+ }
+ }
+
+ if (bounds->comparison == comparison_t::comp_less) {
+ // Note: ub0 can be unsigned. It should be OK to hit overflow here,
+ // because ub0 + ub1*j should still be positive (otherwise the loop was
+ // not well formed)
+ bounds->ub0 -= 1;
+ bounds->comparison = comparison_t::comp_less_or_eq;
+ } else if (bounds->comparison == comparison_t::comp_greater) {
+ bounds->ub0 += 1;
+ bounds->comparison = comparison_t::comp_greater_or_eq;
+ }
+}
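+
+In practice canonicalization only rewrites the comparison and the constant part of the upper bound. Two worked examples of the before/after values (illustrative loops, shown as comments):
+
+// for (int i = 0; i != 10; i += 2)      step > 0, so "!=" becomes "<"
+//   comp_less,          lb0 = 0,  ub0 = 10
+//   comp_less_or_eq,    lb0 = 0,  ub0 = 9     (ub0 -= 1)
+//
+// for (int i = 10; i > 0; i -= 3)
+//   comp_greater_or_eq, lb0 = 10, ub0 = 1     (ub0 += 1)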
+
+// Canonicalize loop nest. original_bounds_nest is an array of length n.
+void kmp_canonicalize_loop_nest(ident_t *loc,
+ /*in/out*/ bounds_info_t *original_bounds_nest,
+ kmp_index_t n) {
+
+ for (kmp_index_t ind = 0; ind < n; ++ind) {
+ auto bounds = &(original_bounds_nest[ind]);
+
+ switch (bounds->loop_type) {
+ case loop_type_t::loop_type_int32:
+ kmp_canonicalize_one_loop_XX<kmp_int32>(
+ loc,
+ /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
+ break;
+ case loop_type_t::loop_type_uint32:
+ kmp_canonicalize_one_loop_XX<kmp_uint32>(
+ loc,
+ /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
+ break;
+ case loop_type_t::loop_type_int64:
+ kmp_canonicalize_one_loop_XX<kmp_int64>(
+ loc,
+ /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
+ break;
+ case loop_type_t::loop_type_uint64:
+ kmp_canonicalize_one_loop_XX<kmp_uint64>(
+ loc,
+ /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
+ break;
+ default:
+ KMP_ASSERT(false);
+ }
+ }
+}
+
+//----------Calculating trip count on one level-------------------------------
+
+// Calculate trip count on this loop level.
+// We do this either for a rectangular loop nest,
+// or after an adjustment bringing the loops to a parallelepiped shape.
+// This number should not depend on the value of outer IV
+// even if the formula has lb1 and ub1.
+// Note: for non-rectangular loops don't use span for this, it's too big.
+
+template <typename T>
+kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
+ /*in/out*/ bounds_infoXX_template<T> *bounds) {
+
+ if (bounds->comparison == comparison_t::comp_less_or_eq) {
+ if (bounds->ub0 < bounds->lb0) {
+ // Note: after this we don't need to calculate inner loops,
+ // but that should be an edge case:
+ bounds->trip_count = 0;
+ } else {
+ // ub - lb may exceed signed type range; we need to cast to
+ // kmp_loop_nest_iv_t anyway
+ bounds->trip_count =
+ static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
+ __kmp_abs(bounds->step) +
+ 1;
+ }
+ } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
+ if (bounds->lb0 < bounds->ub0) {
+ // Note: after this we don't need to calculate inner loops,
+ // but that should be an edge case:
+ bounds->trip_count = 0;
+ } else {
+ // lb - ub may exceed signed type range; we need to cast to
+ // kmp_loop_nest_iv_t anyway
+ bounds->trip_count =
+ static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
+ __kmp_abs(bounds->step) +
+ 1;
+ }
+ } else {
+ KMP_ASSERT(false);
+ }
+ return bounds->trip_count;
+}
+
+// Calculate trip count on this loop level.
+kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {
+
+ kmp_loop_nest_iv_t trip_count = 0;
+
+ switch (bounds->loop_type) {
+ case loop_type_t::loop_type_int32:
+ trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
+ /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
+ break;
+ case loop_type_t::loop_type_uint32:
+ trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
+ /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
+ break;
+ case loop_type_t::loop_type_int64:
+ trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
+ /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
+ break;
+ case loop_type_t::loop_type_uint64:
+ trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
+ /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
+ break;
+ default:
+ KMP_ASSERT(false);
+ }
+
+ return trip_count;
+}
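+
+With canonical <= / >= bounds the per-level trip count is a simple closed form. A worked example with illustrative values, matching the formulas above:
+
+// comp_less_or_eq,     lb0 = 0,  ub0 = 9,  step = 2:
+//   trip_count = (ub0 - lb0) / |step| + 1 = 9 / 2 + 1 = 5    (i = 0,2,4,6,8)
+// comp_greater_or_eq,  lb0 = 10, ub0 = 1,  step = -3:
+//   trip_count = (lb0 - ub0) / |step| + 1 = 9 / 3 + 1 = 4    (i = 10,7,4,1)
+// If the canonical bounds are already crossed (ub0 < lb0 for "<=", or
+// lb0 < ub0 for ">="), the trip count is 0 and the loop never runs.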
+
+//----------Trim original iv according to its type----------------------------
+
+// Trim original iv according to its type.
+// Return a kmp_uint64 value which can be used in all internal calculations
+// and can be statically cast back to the original type in user code.
+kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
+ kmp_uint64 res = 0;
+
+ switch (loop_iv_type) {
+ case loop_type_t::loop_type_int8:
+ res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
+ break;
+ case loop_type_t::loop_type_uint8:
+ res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
+ break;
+ case loop_type_t::loop_type_int16:
+ res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
+ break;
+ case loop_type_t::loop_type_uint16:
+ res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
+ break;
+ case loop_type_t::loop_type_int32:
+ res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
+ break;
+ case loop_type_t::loop_type_uint32:
+ res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
+ break;
+ case loop_type_t::loop_type_int64:
+ res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
+ break;
+ case loop_type_t::loop_type_uint64:
+ res = static_cast<kmp_uint64>(original_iv);
+ break;
+ default:
+ KMP_ASSERT(false);
+ }
+
+ return res;
+}
+
+//----------Compare two IVs (remember they have a type)-----------------------
+
+bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
+ kmp_uint64 original_iv2) {
+ bool res = false;
+
+ switch (loop_iv_type) {
+ case loop_type_t::loop_type_int8:
+ res = static_cast<kmp_int8>(original_iv1) ==
+ static_cast<kmp_int8>(original_iv2);
+ break;
+ case loop_type_t::loop_type_uint8:
+ res = static_cast<kmp_uint8>(original_iv1) ==
+ static_cast<kmp_uint8>(original_iv2);
+ break;
+ case loop_type_t::loop_type_int16:
+ res = static_cast<kmp_int16>(original_iv1) ==
+ static_cast<kmp_int16>(original_iv2);
+ break;
+ case loop_type_t::loop_type_uint16:
+ res = static_cast<kmp_uint16>(original_iv1) ==
+ static_cast<kmp_uint16>(original_iv2);
+ break;
+ case loop_type_t::loop_type_int32:
+ res = static_cast<kmp_int32>(original_iv1) ==
+ static_cast<kmp_int32>(original_iv2);
+ break;
+ case loop_type_t::loop_type_uint32:
+ res = static_cast<kmp_uint32>(original_iv1) ==
+ static_cast<kmp_uint32>(original_iv2);
+ break;
+ case loop_type_t::loop_type_int64:
+ res = static_cast<kmp_int64>(original_iv1) ==
+ static_cast<kmp_int64>(original_iv2);
+ break;
+ case loop_type_t::loop_type_uint64:
+ res = static_cast<kmp_uint64>(original_iv1) ==
+ static_cast<kmp_uint64>(original_iv2);
+ break;
+ default:
+ KMP_ASSERT(false);
+ }
+
+ return res;
+}
+
+//----------Calculate original iv on one level--------------------------------
+
+// Return true if the point fits into upper bounds on this level,
+// false otherwise
+template <typename T>
+bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
+ const kmp_point_t original_ivs,
+ kmp_index_t ind) {
+
+ T iv = static_cast<T>(original_ivs[ind]);
+ T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
+
+ if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
+ (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
+ ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
+ (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
+ // The calculated point is outside of loop upper boundary:
+ return false;
+ }
+
+ return true;
+}
+
+// Calculate one iv corresponding to iteration on the level ind.
+// Return true if it fits into lower-upper bounds on this level
+// (if not, we need to re-calculate)
+template <typename T>
+bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
+ /*in/out*/ kmp_point_t original_ivs,
+ const kmp_iterations_t iterations, kmp_index_t ind,
+ bool start_with_lower_bound, bool checkBounds) {
+
+ kmp_uint64 temp = 0;
+ T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
+
+ if (start_with_lower_bound) {
+ // we moved to the next iteration on one of the outer loops, so we should
+ // start with the lower bound here:
+ temp = bounds->lb0 + bounds->lb1 * outer_iv;
+ } else {
+ auto iteration = iterations[ind];
+ temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
+ }
+
+ // Now trim original iv according to its type:
+ original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
+
+ if (checkBounds) {
+ return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
+ } else {
+ return true;
+ }
+}
+
+bool kmp_calc_one_iv(const bounds_info_t *bounds,
+ /*in/out*/ kmp_point_t original_ivs,
+ const kmp_iterations_t iterations, kmp_index_t ind,
+ bool start_with_lower_bound, bool checkBounds) {
+
+ switch (bounds->loop_type) {
+ case loop_type_t::loop_type_int32:
+ return kmp_calc_one_iv_XX<kmp_int32>(
+ (bounds_infoXX_template<kmp_int32> *)(bounds),
+ /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
+ checkBounds);
+ break;
+ case loop_type_t::loop_type_uint32:
+ return kmp_calc_one_iv_XX<kmp_uint32>(
+ (bounds_infoXX_template<kmp_uint32> *)(bounds),
+ /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
+ checkBounds);
+ break;
+ case loop_type_t::loop_type_int64:
+ return kmp_calc_one_iv_XX<kmp_int64>(
+ (bounds_infoXX_template<kmp_int64> *)(bounds),
+ /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
+ checkBounds);
+ break;
+ case loop_type_t::loop_type_uint64:
+ return kmp_calc_one_iv_XX<kmp_uint64>(
+ (bounds_infoXX_template<kmp_uint64> *)(bounds),
+ /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
+ checkBounds);
+ break;
+ default:
+ KMP_ASSERT(false);
+ return false;
+ }
+}
+
+//----------Calculate original iv on one level for rectangular loop nest------
+
+// Calculate one iv corresponding to iteration on the level ind.
+// For the rectangular case the computed value always fits the bounds, so
+// nothing is returned (unlike kmp_calc_one_iv_XX above).
+template <typename T>
+void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
+ /*in/out*/ kmp_uint64 *original_ivs,
+ const kmp_iterations_t iterations,
+ kmp_index_t ind) {
+
+ auto iteration = iterations[ind];
+
+ kmp_uint64 temp =
+ bounds->lb0 +
+ bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
+ iteration * bounds->step;
+
+ // Now trim original iv according to its type:
+ original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
+}
+
+void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
+ /*in/out*/ kmp_uint64 *original_ivs,
+ const kmp_iterations_t iterations,
+ kmp_index_t ind) {
+
+ switch (bounds->loop_type) {
+ case loop_type_t::loop_type_int32:
+ kmp_calc_one_iv_rectang_XX<kmp_int32>(
+ (bounds_infoXX_template<kmp_int32> *)(bounds),
+ /*in/out*/ original_ivs, iterations, ind);
+ break;
+ case loop_type_t::loop_type_uint32:
+ kmp_calc_one_iv_rectang_XX<kmp_uint32>(
+ (bounds_infoXX_template<kmp_uint32> *)(bounds),
+ /*in/out*/ original_ivs, iterations, ind);
+ break;
+ case loop_type_t::loop_type_int64:
+ kmp_calc_one_iv_rectang_XX<kmp_int64>(
+ (bounds_infoXX_template<kmp_int64> *)(bounds),
+ /*in/out*/ original_ivs, iterations, ind);
+ break;
+ case loop_type_t::loop_type_uint64:
+ kmp_calc_one_iv_rectang_XX<kmp_uint64>(
+ (bounds_infoXX_template<kmp_uint64> *)(bounds),
+ /*in/out*/ original_ivs, iterations, ind);
+ break;
+ default:
+ KMP_ASSERT(false);
+ }
+}
+
+//----------------------------------------------------------------------------
+// Rectangular loop nest
+//----------------------------------------------------------------------------
+
+//----------Canonicalize loop nest and calculate trip count-------------------
+
+// Canonicalize loop nest and calculate overall trip count.
+// "bounds_nest" has to be allocated per thread.
+// API will modify original bounds_nest array to bring it to a canonical form
+// (only <= and >=, no !=, <, >). If the original loop nest was already in a
+// canonical form there will be no changes to bounds in bounds_nest array
+// (only trip counts will be calculated).
+// Returns trip count of overall space.
+extern "C" kmp_loop_nest_iv_t
+__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
+ /*in/out*/ bounds_info_t *original_bounds_nest,
+ kmp_index_t n) {
+
+ kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
+
+ kmp_loop_nest_iv_t total = 1;
+
+ for (kmp_index_t ind = 0; ind < n; ++ind) {
+ auto bounds = &(original_bounds_nest[ind]);
+
+ kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
+ total *= trip_count;
+ }
+
+ return total;
+}
+
+//----------Calculate old induction variables---------------------------------
+
+// Calculate old induction variables corresponding to overall new_iv.
+// Note: original IV will be returned as if it had kmp_uint64 type,
+// will have to be converted to original type in user code.
+// Note: trip counts should be already calculated by
+// __kmpc_process_loop_nest_rectang.
+// OMPTODO: special case 2, 3 nested loops: either do different
+// interface without array or possibly template this over n
+extern "C" void
+__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
+ const bounds_info_t *original_bounds_nest,
+ /*out*/ kmp_uint64 *original_ivs,
+ kmp_index_t n) {
+
+ CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
+
+ // First, calc corresponding iteration in every original loop:
+ for (kmp_index_t ind = n; ind > 0;) {
+ --ind;
+ auto bounds = &(original_bounds_nest[ind]);
+
+ // should be optimized to OPDIVREM:
+ auto temp = new_iv / bounds->trip_count;
+ auto iteration = new_iv % bounds->trip_count;
+ new_iv = temp;
+
+ iterations[ind] = iteration;
+ }
+ KMP_ASSERT(new_iv == 0);
+
+ for (kmp_index_t ind = 0; ind < n; ++ind) {
+ auto bounds = &(original_bounds_nest[ind]);
+
+ kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
+ }
+}
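+
+// Example (for illustration), continuing the 4 x 3 nest above with trip
+// counts {4, 3}: for new_iv = 7 the loop above computes
+//   ind = 1: iteration = 7 % 3 = 1, new_iv = 7 / 3 = 2
+//   ind = 0: iteration = 2 % 4 = 2, new_iv = 2 / 4 = 0
+// and kmp_calc_one_iv_rectang then maps these back to the original IVs:
+//   i = 0 + 2 * 1 = 2, j = 0 + 1 * 3 = 3.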
+
+//----------------------------------------------------------------------------
+// Non-rectangular loop nest
+//----------------------------------------------------------------------------
+
+//----------Calculate maximum possible span of iv values on one level---------
+
+// Calculate span for IV on this loop level for "<=" case.
+// Note: it's for <= on this loop nest level, so lower bound should be smallest
+// value, upper bound should be the biggest value. If the loop won't execute,
+// 'smallest' may be bigger than 'biggest', but we'd better not switch them
+// around.
+template <typename T>
+void kmp_calc_span_lessoreq_XX(
+ /* in/out*/ bounds_info_internalXX_template<T> *bounds,
+ /* in/out*/ bounds_info_internal_t *bounds_nest) {
+
+ typedef typename traits_t<T>::unsigned_t UT;
+ // typedef typename traits_t<T>::signed_t ST;
+
+ // typedef typename big_span_t span_t;
+ typedef T span_t;
+
+ auto &bbounds = bounds->b;
+
+ if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
+    // This dimension depends on one of the previous ones; it can't be the
+    // outermost one.
+ bounds_info_internalXX_template<T> *previous =
+ reinterpret_cast<bounds_info_internalXX_template<T> *>(
+ &(bounds_nest[bbounds.outer_iv]));
+
+ // OMPTODO: assert that T is compatible with loop variable type on
+ // 'previous' loop
+
+ {
+ span_t bound_candidate1 =
+ bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
+ span_t bound_candidate2 =
+ bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
+ if (bound_candidate1 < bound_candidate2) {
+ bounds->span_smallest = bound_candidate1;
+ } else {
+ bounds->span_smallest = bound_candidate2;
+ }
+ }
+
+ {
+ // We can't adjust the upper bound with respect to step, because
+ // lower bound might be off after adjustments
+
+ span_t bound_candidate1 =
+ bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
+ span_t bound_candidate2 =
+ bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
+ if (bound_candidate1 < bound_candidate2) {
+ bounds->span_biggest = bound_candidate2;
+ } else {
+ bounds->span_biggest = bound_candidate1;
+ }
+ }
+ } else {
+ // Rectangular:
+ bounds->span_smallest = bbounds.lb0;
+ bounds->span_biggest = bbounds.ub0;
+ }
+ if (!bounds->loop_bounds_adjusted) {
+    // Here it's safe to reduce the space to a multiple of the step.
+    // OMPTODO: check if the formula is correct.
+    // Also check whether it would be safe to do this if we didn't adjust the
+    // left side.
+ bounds->span_biggest -=
+ (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
+ }
+}
+
+// Calculate span for IV on this loop level for ">=" case.
+template <typename T>
+void kmp_calc_span_greateroreq_XX(
+ /* in/out*/ bounds_info_internalXX_template<T> *bounds,
+ /* in/out*/ bounds_info_internal_t *bounds_nest) {
+
+ typedef typename traits_t<T>::unsigned_t UT;
+ // typedef typename traits_t<T>::signed_t ST;
+
+ // typedef typename big_span_t span_t;
+ typedef T span_t;
+
+ auto &bbounds = bounds->b;
+
+ if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
+    // This dimension depends on one of the previous ones; it can't be the
+    // outermost one.
+ bounds_info_internalXX_template<T> *previous =
+ reinterpret_cast<bounds_info_internalXX_template<T> *>(
+ &(bounds_nest[bbounds.outer_iv]));
+
+ // OMPTODO: assert that T is compatible with loop variable type on
+ // 'previous' loop
+
+ {
+ span_t bound_candidate1 =
+ bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
+ span_t bound_candidate2 =
+ bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
+ if (bound_candidate1 >= bound_candidate2) {
+ bounds->span_smallest = bound_candidate1;
+ } else {
+ bounds->span_smallest = bound_candidate2;
+ }
+ }
+
+ {
+ // We can't adjust the upper bound with respect to step, because
+ // lower bound might be off after adjustments
+
+ span_t bound_candidate1 =
+ bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
+ span_t bound_candidate2 =
+ bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
+ if (bound_candidate1 >= bound_candidate2) {
+ bounds->span_biggest = bound_candidate2;
+ } else {
+ bounds->span_biggest = bound_candidate1;
+ }
+ }
+
+ } else {
+ // Rectangular:
+ bounds->span_biggest = bbounds.lb0;
+ bounds->span_smallest = bbounds.ub0;
+ }
+ if (!bounds->loop_bounds_adjusted) {
+    // Here it's safe to reduce the space to a multiple of the step.
+    // OMPTODO: check if the formula is correct.
+    // Also check whether it would be safe to do this if we didn't adjust the
+    // left side.
+ bounds->span_biggest -=
+ (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
+ }
+}
+
+// Calculate maximum possible span for IV on this loop level.
+template <typename T>
+void kmp_calc_span_XX(
+ /* in/out*/ bounds_info_internalXX_template<T> *bounds,
+ /* in/out*/ bounds_info_internal_t *bounds_nest) {
+
+ if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
+ kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
+ } else {
+ KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
+ kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
+ }
+}
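+
+// Example (for illustration): for the nest i = 0..10, j = i..i+5 (for j:
+// lb0 = 0, lb1 = 1, ub0 = 5, ub1 = 1, outer_iv = 0, step = 1) the "<=" case
+// above gives, using span(i) = [0, 10]:
+//   span_smallest(j) = min(0 + 1 * 0, 0 + 1 * 10) = 0
+//   span_biggest(j)  = max(5 + 1 * 0, 5 + 1 * 10) = 15
+// i.e. the full range of values j can take across all iterations of i.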
+
+//----------All initial processing of the loop nest---------------------------
+
+// Calculate new bounds for this loop level.
+// To be able to work with the nest we need to get it to a parallelepiped shape.
+// We need to stay in the original range of values, so that there will be no
+// overflow, for that we'll adjust both upper and lower bounds as needed.
+template <typename T>
+void kmp_calc_new_bounds_XX(
+ /* in/out*/ bounds_info_internalXX_template<T> *bounds,
+ /* in/out*/ bounds_info_internal_t *bounds_nest) {
+
+ auto &bbounds = bounds->b;
+
+ if (bbounds.lb1 == bbounds.ub1) {
+ // Already parallel, no need to adjust:
+ bounds->loop_bounds_adjusted = false;
+ } else {
+ bounds->loop_bounds_adjusted = true;
+
+ T old_lb1 = bbounds.lb1;
+ T old_ub1 = bbounds.ub1;
+
+ if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
+ // With this shape we can adjust to a rectangle:
+ bbounds.lb1 = 0;
+ bbounds.ub1 = 0;
+ } else {
+      // Make the upper and lower bounds parallel, keeping the values in the
+      // old range.
+      // Note: abs didn't work here.
+ if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
+ ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
+ bbounds.lb1 = old_ub1;
+ } else {
+ bbounds.ub1 = old_lb1;
+ }
+ }
+
+    // Now we need to adjust lb0, ub0, otherwise in some cases the space will
+    // shrink. The idea is that for this IV we now get the same span
+    // irrespective of the previous IV value.
+ bounds_info_internalXX_template<T> *previous =
+ reinterpret_cast<bounds_info_internalXX_template<T> *>(
+ &bounds_nest[bbounds.outer_iv]);
+
+ if (bbounds.comparison == comparison_t::comp_less_or_eq) {
+ if (old_lb1 < bbounds.lb1) {
+ KMP_ASSERT(old_lb1 < 0);
+        // The length is good at the biggest outer_iv value,
+        // so we can use it to find where to move the lower bound:
+
+ T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
+ bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
+ // e.g. it was 0?? (same below)
+ } else if (old_lb1 > bbounds.lb1) {
+ // still need to move lower bound:
+ T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
+ bbounds.lb0 += add;
+ }
+
+ if (old_ub1 > bbounds.ub1) {
+ KMP_ASSERT(old_ub1 > 0);
+        // The length is good at the biggest outer_iv value,
+        // so we can use it to find where to move the upper bound:
+
+ T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
+ bbounds.ub0 += add;
+ } else if (old_ub1 < bbounds.ub1) {
+ // still need to move upper bound:
+ T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
+ bbounds.ub0 -= sub;
+ }
+ } else {
+ KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
+ if (old_lb1 < bbounds.lb1) {
+ KMP_ASSERT(old_lb1 < 0);
+ T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
+ bbounds.lb0 -= sub;
+ } else if (old_lb1 > bbounds.lb1) {
+ T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
+ bbounds.lb0 += add;
+ }
+
+ if (old_ub1 > bbounds.ub1) {
+ KMP_ASSERT(old_ub1 > 0);
+ T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
+ bbounds.ub0 += add;
+ } else if (old_ub1 < bbounds.ub1) {
+ T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
+ bbounds.ub0 -= sub;
+ }
+ }
+ }
+}
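+
+// Example (for illustration): for the triangular nest i = 0..N, j = 0..i
+// (for j: lb1 = 0, ub1 = 1) the adjustment above drops the dependent ub1
+// coefficient (ub1 becomes 0) and compensates by moving the constant part:
+//   ub0 += (old_ub1 - ub1) * span_biggest(i) = 1 * N
+// so the adjusted j loop is the rectangle [0, N] that encloses the original
+// triangle, and loop_bounds_adjusted is set to true.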
+
+// Do all processing for one canonicalized loop in the nest
+// (assuming that outer loops already were processed):
+template <typename T>
+kmp_loop_nest_iv_t kmp_process_one_loop_XX(
+ /* in/out*/ bounds_info_internalXX_template<T> *bounds,
+ /*in/out*/ bounds_info_internal_t *bounds_nest) {
+
+ kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
+ kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
+ return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
+}
+
+// Non-rectangular loop nest, canonicalized to use <= or >=.
+// Process loop nest to have a parallelepiped shape,
+// calculate biggest spans for IV's on all levels and calculate overall trip
+// count. "bounds_nest" has to be allocated per thread.
+// Returns overall trip count (for adjusted space).
+kmp_loop_nest_iv_t kmp_process_loop_nest(
+ /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {
+
+ kmp_loop_nest_iv_t total = 1;
+
+ for (kmp_index_t ind = 0; ind < n; ++ind) {
+ auto bounds = &(bounds_nest[ind]);
+ kmp_loop_nest_iv_t trip_count = 0;
+
+ switch (bounds->b.loop_type) {
+ case loop_type_t::loop_type_int32:
+ trip_count = kmp_process_one_loop_XX<kmp_int32>(
+ /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
+ /*in/out*/ bounds_nest);
+ break;
+ case loop_type_t::loop_type_uint32:
+ trip_count = kmp_process_one_loop_XX<kmp_uint32>(
+ /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
+ /*in/out*/ bounds_nest);
+ break;
+ case loop_type_t::loop_type_int64:
+ trip_count = kmp_process_one_loop_XX<kmp_int64>(
+ /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
+ /*in/out*/ bounds_nest);
+ break;
+ case loop_type_t::loop_type_uint64:
+ trip_count = kmp_process_one_loop_XX<kmp_uint64>(
+ /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
+ /*in/out*/ bounds_nest);
+ break;
+ default:
+ KMP_ASSERT(false);
+ }
+ total *= trip_count;
+ }
+
+ return total;
+}
+
+//----------Calculate iterations (in the original or updated space)-----------
+
+// Calculate number of iterations in original or updated space resulting in
+// original_ivs[ind] (only on this level, non-negative)
+// (not counting initial iteration)
+template <typename T>
+kmp_loop_nest_iv_t
+kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
+ const kmp_point_t original_ivs,
+ kmp_index_t ind) {
+
+ kmp_loop_nest_iv_t iterations = 0;
+
+ if (bounds->comparison == comparison_t::comp_less_or_eq) {
+ iterations =
+ (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
+ bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
+ __kmp_abs(bounds->step);
+ } else {
+ KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
+ iterations = (bounds->lb0 +
+ bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
+ static_cast<T>(original_ivs[ind])) /
+ __kmp_abs(bounds->step);
+ }
+
+ return iterations;
+}
+
+// Calculate number of iterations in the original or updated space resulting in
+// original_ivs[ind] (only on this level, non-negative)
+kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
+ const kmp_point_t original_ivs,
+ kmp_index_t ind) {
+
+ switch (bounds->loop_type) {
+ case loop_type_t::loop_type_int32:
+ return kmp_calc_number_of_iterations_XX<kmp_int32>(
+ (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
+ break;
+ case loop_type_t::loop_type_uint32:
+ return kmp_calc_number_of_iterations_XX<kmp_uint32>(
+ (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
+ break;
+ case loop_type_t::loop_type_int64:
+ return kmp_calc_number_of_iterations_XX<kmp_int64>(
+ (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
+ break;
+ case loop_type_t::loop_type_uint64:
+ return kmp_calc_number_of_iterations_XX<kmp_uint64>(
+ (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
+ break;
+ default:
+ KMP_ASSERT(false);
+ return 0;
+ }
+}
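+
+// Example (for illustration): for the "<=" loop j = i; j <= N; j += 2
+// (lb0 = 0, lb1 = 1, step = 2), the point original_ivs = {i = 3, j = 9}
+// gives iterations = (9 - 0 - 1 * 3) / |2| = 3, i.e. j is reached after
+// three steps from its (i-dependent) lower bound.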
+
+//----------Calculate new iv corresponding to original ivs--------------------
+
+// We got a point in the original loop nest.
+// Take updated bounds and calculate what new_iv will correspond to this point.
+// When we are getting original IVs from new_iv, we have to adjust them to fit
+// into the original loop bounds. Getting new_iv for the adjusted original IVs
+// will help with making more chunks non-empty.
+kmp_loop_nest_iv_t
+kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
+ const kmp_point_t original_ivs,
+ kmp_index_t n) {
+
+ kmp_loop_nest_iv_t new_iv = 0;
+
+ for (kmp_index_t ind = 0; ind < n; ++ind) {
+ auto bounds = &(bounds_nest[ind].b);
+
+ new_iv = new_iv * bounds->trip_count +
+ kmp_calc_number_of_iterations(bounds, original_ivs, ind);
+ }
+
+ return new_iv;
+}
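+
+// Example (for illustration): with two loops whose (updated) trip counts are
+// {4, 3} and per-level iteration numbers {2, 1} for the given point, the loop
+// above accumulates new_iv = (0 * 4 + 2) * 3 + 1 = 7, i.e. the inverse of the
+// division/remainder decomposition used to go from new_iv back to per-level
+// iterations.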
+
+//----------Calculate original ivs for provided iterations--------------------
+
+// Calculate original IVs for provided iterations, assuming iterations are
+// calculated in the original space.
+// Loop nest is in canonical form (with <= / >=).
+bool kmp_calc_original_ivs_from_iterations(
+ const bounds_info_t *original_bounds_nest, kmp_index_t n,
+ /*in/out*/ kmp_point_t original_ivs,
+ /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {
+
+ kmp_index_t lengthened_ind = n;
+
+ for (; ind < n;) {
+ auto bounds = &(original_bounds_nest[ind]);
+ bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
+ ind, (lengthened_ind < ind), true);
+
+ if (!good) {
+ // The calculated iv value is too big (or too small for >=):
+ if (ind == 0) {
+ // Space is empty:
+ return false;
+ } else {
+ // Go to next iteration on the outer loop:
+ --ind;
+ ++iterations[ind];
+ lengthened_ind = ind;
+ for (kmp_index_t i = ind + 1; i < n; ++i) {
+ iterations[i] = 0;
+ }
+ continue;
+ }
+ }
+ ++ind;
+ }
+
+ return true;
+}
+
+//----------Calculate original ivs for the beginning of the loop nest---------
+
+// Calculate IVs for the beginning of the loop nest.
+// Note: simply taking the lower bounds of all loops may not work, because
+// inner loops may be empty on some iterations of the outer loops.
+// Loop nest is in canonical form (with <= / >=).
+bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
+ kmp_index_t n,
+ /*out*/ kmp_point_t original_ivs) {
+
+ // Iterations in the original space, multiplied by step:
+ CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
+ for (kmp_index_t ind = n; ind > 0;) {
+ --ind;
+ iterations[ind] = 0;
+ }
+
+ // Now calculate the point:
+ bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
+ /*in/out*/ original_ivs,
+ /*in/out*/ iterations, 0);
+ return b;
+}
+
+//----------Calculate next point in the original loop space-------------------
+
+// From current set of original IVs calculate next point.
+// Return false if there is no next point in the loop bounds.
+bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
+ kmp_index_t n, const kmp_point_t original_ivs,
+ /*out*/ kmp_point_t next_original_ivs) {
+ // Iterations in the original space, multiplied by step (so can be negative):
+ CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
+ // First, calc corresponding iteration in every original loop:
+ for (kmp_index_t ind = 0; ind < n; ++ind) {
+ auto bounds = &(original_bounds_nest[ind]);
+ iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
+ }
+
+ for (kmp_index_t ind = 0; ind < n; ++ind) {
+ next_original_ivs[ind] = original_ivs[ind];
+ }
+
+ // Next add one step to the iterations on the inner-most level, and see if we
+ // need to move up the nest:
+ kmp_index_t ind = n - 1;
+ ++iterations[ind];
+
+ bool b = kmp_calc_original_ivs_from_iterations(
+ original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);
+
+ return b;
+}
+
+//----------Calculate chunk end in the original loop space--------------------
+
+// For one level calculate old induction variable corresponding to overall
+// new_iv for the chunk end.
+// Return true if it fits into upper bound on this level
+// (if not, we need to re-calculate)
+template <typename T>
+bool kmp_calc_one_iv_for_chunk_end_XX(
+ const bounds_infoXX_template<T> *bounds,
+ const bounds_infoXX_template<T> *updated_bounds,
+ /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
+ kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
+ const kmp_point_t original_ivs_start) {
+
+ // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
+ // big_span_t;
+
+ // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
+ T temp = 0;
+
+ T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
+
+ if (start_with_lower_bound) {
+    // We moved to the next iteration on one of the outer loops, so we may as
+    // well use the lower bound here:
+ temp = bounds->lb0 + bounds->lb1 * outer_iv;
+ } else {
+ // Start in expanded space, but:
+ // - we need to hit original space lower bound, so need to account for
+ // that
+ // - we have to go into original space, even if that means adding more
+ // iterations than was planned
+ // - we have to go past (or equal to) previous point (which is the chunk
+ // starting point)
+
+ auto iteration = iterations[ind];
+
+ auto step = bounds->step;
+
+ // In case of >= it's negative:
+ auto accountForStep =
+ ((bounds->lb0 + bounds->lb1 * outer_iv) -
+ (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
+ step;
+
+ temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
+ accountForStep + iteration * step;
+
+ if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
+ (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
+ ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
+ (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
+ // Too small (or too big), didn't reach the original lower bound. Use
+ // heuristic:
+ temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
+ }
+
+ if (compare_with_start) {
+
+ T start = static_cast<T>(original_ivs_start[ind]);
+
+ temp = kmp_fix_iv(bounds->loop_iv_type, temp);
+
+      // On all previous levels the start of the chunk is the same as its end,
+      // so we need to be really careful here:
+ if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
+ (temp < start)) ||
+ ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
+ (temp > start))) {
+        // The end of the chunk can't be smaller (for >= bigger) than its
+        // start. Use heuristic:
+ temp = start + iteration / 4 * step;
+ }
+ }
+ }
+
+ original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
+
+ if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
+ (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
+ ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
+ (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
+ // Too big (or too small for >=).
+ return false;
+ }
+
+ return true;
+}
+
+// For one level calculate old induction variable corresponding to overall
+// new_iv for the chunk end.
+bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
+ const bounds_info_t *updated_bounds,
+ /*in/out*/ kmp_point_t original_ivs,
+ const kmp_iterations_t iterations,
+ kmp_index_t ind, bool start_with_lower_bound,
+ bool compare_with_start,
+ const kmp_point_t original_ivs_start) {
+
+ switch (bounds->loop_type) {
+ case loop_type_t::loop_type_int32:
+ return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
+ (bounds_infoXX_template<kmp_int32> *)(bounds),
+ (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
+ /*in/out*/
+ original_ivs, iterations, ind, start_with_lower_bound,
+ compare_with_start, original_ivs_start);
+ break;
+ case loop_type_t::loop_type_uint32:
+ return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
+ (bounds_infoXX_template<kmp_uint32> *)(bounds),
+ (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
+ /*in/out*/
+ original_ivs, iterations, ind, start_with_lower_bound,
+ compare_with_start, original_ivs_start);
+ break;
+ case loop_type_t::loop_type_int64:
+ return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
+ (bounds_infoXX_template<kmp_int64> *)(bounds),
+ (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
+ /*in/out*/
+ original_ivs, iterations, ind, start_with_lower_bound,
+ compare_with_start, original_ivs_start);
+ break;
+ case loop_type_t::loop_type_uint64:
+ return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
+ (bounds_infoXX_template<kmp_uint64> *)(bounds),
+ (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
+ /*in/out*/
+ original_ivs, iterations, ind, start_with_lower_bound,
+ compare_with_start, original_ivs_start);
+ break;
+ default:
+ KMP_ASSERT(false);
+ return false;
+ }
+}
+
+// Calculate old induction variables corresponding to overall new_iv for the
+// chunk end. If due to space extension we are getting old IVs outside of the
+// boundaries, bring them into the boundaries. Need to do this in the runtime,
+// esp. on the lower bounds side. When producing the result we need to make
+// sure that the new chunk starts right after the old chunk and does not
+// overlap with it (this is done elsewhere), and that the end of the chunk is
+// further than its beginning. We don't need an exact ending point here, just
+// something more-or-less close to the desired chunk length; bigger is fine
+// (smaller would also work, but then we risk going into an infinite loop, so
+// we only go smaller at the very end of the space).
+// Returns false if the ending point could not be found in the original loop
+// space. In this case the caller can use the original upper bounds as the end
+// of the chunk. The chunk won't be empty, because it has at least the
+// starting point, which is by construction in the original space.
+bool kmp_calc_original_ivs_for_chunk_end(
+ const bounds_info_t *original_bounds_nest, kmp_index_t n,
+ const bounds_info_internal_t *updated_bounds_nest,
+ const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
+ /*out*/ kmp_point_t original_ivs) {
+
+ // Iterations in the expanded space:
+ CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
+ // First, calc corresponding iteration in every modified loop:
+ for (kmp_index_t ind = n; ind > 0;) {
+ --ind;
+ auto &updated_bounds = updated_bounds_nest[ind];
+
+ // should be optimized to OPDIVREM:
+ auto new_ind = new_iv / updated_bounds.b.trip_count;
+ auto iteration = new_iv % updated_bounds.b.trip_count;
+
+ new_iv = new_ind;
+ iterations[ind] = iteration;
+ }
+ KMP_DEBUG_ASSERT(new_iv == 0);
+
+ kmp_index_t lengthened_ind = n;
+ kmp_index_t equal_ind = -1;
+
+ // Next calculate the point, but in original loop nest.
+ for (kmp_index_t ind = 0; ind < n;) {
+ auto bounds = &(original_bounds_nest[ind]);
+ auto updated_bounds = &(updated_bounds_nest[ind].b);
+
+ bool good = kmp_calc_one_iv_for_chunk_end(
+ bounds, updated_bounds,
+ /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
+ (equal_ind >= ind - 1), original_ivs_start);
+
+ if (!good) {
+ // Too big (or too small for >=).
+ if (ind == 0) {
+ // Need to reduce to the end.
+ return false;
+ } else {
+ // Go to next iteration on outer loop:
+ --ind;
+ ++(iterations[ind]);
+ lengthened_ind = ind;
+ if (equal_ind >= lengthened_ind) {
+          // We've changed the number of iterations here,
+          // so it can't be the same anymore:
+ equal_ind = lengthened_ind - 1;
+ }
+ for (kmp_index_t i = ind + 1; i < n; ++i) {
+ iterations[i] = 0;
+ }
+ continue;
+ }
+ }
+
+ if ((equal_ind == ind - 1) &&
+ (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
+ original_ivs_start[ind]))) {
+ equal_ind = ind;
+ } else if ((equal_ind > ind - 1) &&
+ !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
+ original_ivs_start[ind]))) {
+ equal_ind = ind - 1;
+ }
+ ++ind;
+ }
+
+ return true;
+}
+
+//----------Calculate upper bounds for the last chunk-------------------------
+
+// Calculate one upper bound for the end.
+template <typename T>
+void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
+ /*in/out*/ kmp_point_t original_ivs,
+ kmp_index_t ind) {
+
+ T temp = bounds->ub0 +
+ bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
+
+ original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
+}
+
+void kmp_calc_one_iv_end(const bounds_info_t *bounds,
+ /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {
+
+ switch (bounds->loop_type) {
+ default:
+ KMP_ASSERT(false);
+ break;
+ case loop_type_t::loop_type_int32:
+ kmp_calc_one_iv_end_XX<kmp_int32>(
+ (bounds_infoXX_template<kmp_int32> *)(bounds),
+ /*in/out*/ original_ivs, ind);
+ break;
+ case loop_type_t::loop_type_uint32:
+ kmp_calc_one_iv_end_XX<kmp_uint32>(
+ (bounds_infoXX_template<kmp_uint32> *)(bounds),
+ /*in/out*/ original_ivs, ind);
+ break;
+ case loop_type_t::loop_type_int64:
+ kmp_calc_one_iv_end_XX<kmp_int64>(
+ (bounds_infoXX_template<kmp_int64> *)(bounds),
+ /*in/out*/ original_ivs, ind);
+ break;
+ case loop_type_t::loop_type_uint64:
+ kmp_calc_one_iv_end_XX<kmp_uint64>(
+ (bounds_infoXX_template<kmp_uint64> *)(bounds),
+ /*in/out*/ original_ivs, ind);
+ break;
+ }
+}
+
+// Calculate upper bounds for the last loop iteration. Just use original upper
+// bounds (adjusted when canonicalized to use <= / >=). No need to check that
+// this point is in the original space (it's likely not)
+void kmp_calc_original_ivs_for_end(
+ const bounds_info_t *const original_bounds_nest, kmp_index_t n,
+ /*out*/ kmp_point_t original_ivs) {
+ for (kmp_index_t ind = 0; ind < n; ++ind) {
+ auto bounds = &(original_bounds_nest[ind]);
+ kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
+ }
+}
+
+/**************************************************************************
+ * Identify nested loop structure - loops come in the canonical form
+ * Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}
+ * j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
+ * Upper Triangle matrix
+ * i = 0; i <= N; i++ {0,0}:{N,0}
+ * j = 0+1*i; j <= N; j++ {0,1}:{N,0}
+ * ************************************************************************/
+nested_loop_type_t
+kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
+ /*in*/ kmp_index_t n) {
+ // only 2-level nested loops are supported
+ if (n != 2) {
+ return nested_loop_type_unkown;
+ }
+ // loops must be canonical
+ KMP_ASSERT(
+ (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
+ (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
+  // check outer loop bounds: for triangular nests they need to be {0,0}:{N,0}
+ kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].lb0_u64);
+ kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].ub0_u64);
+ kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].lb1_u64);
+ kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].ub1_u64);
+ if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
+ return nested_loop_type_unkown;
+ }
+ // check inner bounds to determine triangle type
+ kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].lb0_u64);
+ kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].ub0_u64);
+ kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].lb1_u64);
+ kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].ub1_u64);
+ // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
+ if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
+ (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
+ return nested_loop_type_lower_triangular_matrix;
+ }
+ // upper triangle loop inner bounds need to be {0,1}:{N,0}
+ if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
+ inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
+ return nested_loop_type_upper_triangular_matrix;
+ }
+ return nested_loop_type_unkown;
+}
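+
+// Example (for illustration): after canonicalization the source nests
+//   for (i = 0; i <= N; i++) for (j = 0; j <= i; j++)  // inner {0,0}:{0,1}
+//   for (i = 0; i <= N; i++) for (j = 0; j <  i; j++)  // inner {0,0}:{-1,1}
+// are both classified as nested_loop_type_lower_triangular_matrix, while
+//   for (i = 0; i <= N; i++) for (j = i; j <= N; j++)  // inner {0,1}:{N,0}
+// is classified as nested_loop_type_upper_triangular_matrix.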
+
+/**************************************************************************
+ * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
+ * Start point is x so the result is always > sqrt(x)
+ * The method has uniform convergence, PRECISION is set to 0.1
+ * ************************************************************************/
+#define level_of_precision 0.1
+double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
+ double sqrt_old = 0.;
+ double sqrt_new = (double)x;
+ do {
+ sqrt_old = sqrt_new;
+ sqrt_new = (sqrt_old + x / sqrt_old) / 2;
+ } while ((sqrt_old - sqrt_new) > level_of_precision);
+ return sqrt_new;
+}
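+
+// Example (for illustration): sqrt_newton_approx(49) iterates
+//   49 -> 25 -> 13.48 -> 8.56 -> 7.14 -> 7.0014 -> 7.0000001
+// and stops once two consecutive estimates differ by no more than 0.1,
+// returning a value slightly above the exact root 7 (starting from x itself
+// keeps every estimate above sqrt(x)).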
+
+/**************************************************************************
+ * Handle lower triangle matrix in the canonical form
+ * i = 0; i <= N; i++ {0,0}:{N,0}
+ * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
+ * ************************************************************************/
+void kmp_handle_lower_triangle_matrix(
+ /*in*/ kmp_uint32 nth,
+ /*in*/ kmp_uint32 tid,
+ /*in */ kmp_index_t n,
+ /*in/out*/ bounds_info_t *original_bounds_nest,
+ /*out*/ bounds_info_t *chunk_bounds_nest) {
+
+ // transfer loop types from the original loop to the chunks
+ for (kmp_index_t i = 0; i < n; ++i) {
+ chunk_bounds_nest[i] = original_bounds_nest[i];
+ }
+ // cleanup iv variables
+ kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].ub0_u64);
+ kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].lb0_u64);
+ kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].ub0_u64);
+  // calculate the chunk's lower and upper bounds
+  // the total number of iterations in the loop is the sum of the arithmetic
+  // progression from the outer lower to the outer upper bound (inclusive,
+  // since the loop is canonical); note that less_than inner loops
+  // (inner_ub0 = -1) effectively make the progression 1-based, turning
+  // N = (outer_ub0 - outer_lb0 + 1) into N - 1
+ kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
+ kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+  // the current thread's number of iterations:
+  // each thread gets an equal share, i.e. the total number of iterations
+  // divided by the number of threads; if there's a remainder, the first
+  // 'remainder' threads get one additional iteration each to cover it
+ kmp_uint64 iter_current =
+ iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with a tid below the remainder have (iter_total/nth+1)
+  // iterations, and so do all threads before them, so the cumulative number
+  // of iterations executed by all previous threads is the current thread's
+  // number of iterations multiplied by the number of previous threads, which
+  // is equal to the current thread's tid; threads with a tid equal to or
+  // above the remainder have (iter_total/nth) iterations, so the cumulative
+  // number of iterations previously executed is their number of iterations
+  // multiplied by the number of previous threads (again the current thread's
+  // tid) PLUS all the remainder iterations already executed by the previous
+  // threads
+ kmp_uint64 iter_before_current =
+ tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+ // cumulative number of iterations executed with the current thread is
+ // the cumulative number executed before it plus its own
+ kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo), which is the max outer iv
+  // value that gives a number of iterations equal to or just below the total
+  // number of iterations executed by the previous threads; for less_than
+  // (1-based) inner loops (inner_ub0 == -1) that means
+  // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0,
+  // and for less_than_equal (0-based) inner loops (inner_ub0 == 0) it means
+  // lbo*(lbo+1)/2<=iter_before_current => lbo^2+lbo-2*iter_before_current<=0;
+  // both cases can be handled similarly using a parameter to control the
+  // equation sign
+ kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
+ kmp_uint64 lower_bound_outer =
+ (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+ 8 * iter_before_current) +
+ inner_adjustment) /
+ 2 -
+ inner_adjustment;
+ // calculate the inner loop lower bound which is the remaining number of
+ // iterations required to hit the total number of iterations executed by the
+ // previous threads giving the starting point of this thread
+ kmp_uint64 lower_bound_inner =
+ iter_before_current -
+ ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // lower bound, except using the total number of iterations executed with
+  // the current thread included
+ kmp_uint64 upper_bound_outer =
+ (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+ 8 * iter_with_current) +
+ inner_adjustment) /
+ 2 -
+ inner_adjustment;
+  // calculate the inner loop upper bound, which is the remaining number of
+  // iterations required to hit the total number of iterations executed with
+  // the current thread, giving the starting point of the next thread
+ kmp_uint64 upper_bound_inner =
+ iter_with_current -
+ ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element to point at the last iteration
+  // of the current thread rather than the first iteration of the next thread
+ if (upper_bound_inner == 0) {
+ // {n,0} => {n-1,n-1}
+ upper_bound_outer -= 1;
+ upper_bound_inner = upper_bound_outer;
+ } else {
+ // {n,m} => {n,m-1} (m!=0)
+ upper_bound_inner -= 1;
+ }
+
+ // assign the values, zeroing out lb1 and ub1 values since the iteration space
+ // is now one-dimensional
+ chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
+ chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
+ chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
+ chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
+ chunk_bounds_nest[0].lb1_u64 = 0;
+ chunk_bounds_nest[0].ub1_u64 = 0;
+ chunk_bounds_nest[1].lb1_u64 = 0;
+ chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+ printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+ tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+ chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
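+
+// Example (for illustration): i = 0..3, j = 0..i gives outer_iters = 4 and
+// iter_total = 10; with nth = 2 each thread gets iter_current = 5:
+//   tid 0: iter_before = 0, iter_with = 5  -> chunk {0,0} .. {2,1}
+//          (iterations (0,0),(1,0),(1,1),(2,0),(2,1))
+//   tid 1: iter_before = 5, iter_with = 10 -> chunk {2,2} .. {3,3}
+//          (iterations (2,2),(3,0),(3,1),(3,2),(3,3))
+// so the triangle is split into two contiguous 5-iteration chunks.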
+
+/**************************************************************************
+ * Handle upper triangle matrix in the canonical form
+ * i = 0; i <= N; i++ {0,0}:{N,0}
+ * j = 0+1*i; j <= N; j++ {0,1}:{N,0}
+ * ************************************************************************/
+void kmp_handle_upper_triangle_matrix(
+ /*in*/ kmp_uint32 nth,
+ /*in*/ kmp_uint32 tid,
+ /*in */ kmp_index_t n,
+ /*in/out*/ bounds_info_t *original_bounds_nest,
+ /*out*/ bounds_info_t *chunk_bounds_nest) {
+
+ // transfer loop types from the original loop to the chunks
+ for (kmp_index_t i = 0; i < n; ++i) {
+ chunk_bounds_nest[i] = original_bounds_nest[i];
+ }
+ // cleanup iv variables
+ kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].ub0_u64);
+ kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].lb0_u64);
+ [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
+ original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
+  // calculate the chunk's lower and upper bounds
+  // the total number of iterations in the loop is the sum of the arithmetic
+  // progression from the outer lower to the outer upper bound (inclusive,
+  // since the loop is canonical); note that less_than inner loops
+  // (inner_ub0 = -1) effectively make the progression 1-based, turning
+  // N = (outer_ub0 - outer_lb0 + 1) into N - 1
+ kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
+ kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+  // the current thread's number of iterations:
+  // each thread gets an equal share, i.e. the total number of iterations
+  // divided by the number of threads; if there's a remainder, the first
+  // 'remainder' threads get one additional iteration each to cover it
+ kmp_uint64 iter_current =
+ iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with a tid below the remainder have (iter_total/nth+1)
+  // iterations, and so do all threads before them, so the cumulative number
+  // of iterations executed by all previous threads is the current thread's
+  // number of iterations multiplied by the number of previous threads, which
+  // is equal to the current thread's tid; threads with a tid equal to or
+  // above the remainder have (iter_total/nth) iterations, so the cumulative
+  // number of iterations previously executed is their number of iterations
+  // multiplied by the number of previous threads (again the current thread's
+  // tid) PLUS all the remainder iterations already executed by the previous
+  // threads
+ kmp_uint64 iter_before_current =
+ tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+ // cumulative number of iterations executed with the current thread is
+ // the cumulative number executed before it plus its own
+ kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo), which is the max outer iv
+  // value that gives a number of iterations equal to or just below the total
+  // number of iterations executed by the previous threads:
+ // lbo*(lbo+1)/2<=iter_before_current =>
+ // lbo^2+lbo-2*iter_before_current<=0
+ kmp_uint64 lower_bound_outer =
+ (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
+ // calculate the inner loop lower bound which is the remaining number of
+ // iterations required to hit the total number of iterations executed by the
+ // previous threads giving the starting point of this thread
+ kmp_uint64 lower_bound_inner =
+ iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // lower bound, except using the total number of iterations executed with
+  // the current thread included
+ kmp_uint64 upper_bound_outer =
+ (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
+  // calculate the inner loop upper bound, which is the remaining number of
+  // iterations required to hit the total number of iterations executed with
+  // the current thread, giving the starting point of the next thread
+ kmp_uint64 upper_bound_inner =
+ iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element to point at the last iteration
+  // of the current thread rather than the first iteration of the next thread
+ if (upper_bound_inner == 0) {
+ // {n,0} => {n-1,n-1}
+ upper_bound_outer -= 1;
+ upper_bound_inner = upper_bound_outer;
+ } else {
+ // {n,m} => {n,m-1} (m!=0)
+ upper_bound_inner -= 1;
+ }
+
+ // assign the values, zeroing out lb1 and ub1 values since the iteration space
+ // is now one-dimensional
+ chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
+ chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
+ chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
+ chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
+ chunk_bounds_nest[0].lb1_u64 = 0;
+ chunk_bounds_nest[0].ub1_u64 = 0;
+ chunk_bounds_nest[1].lb1_u64 = 0;
+ chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+ printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+ tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+ chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
+//----------Init API for non-rectangular loops--------------------------------
+
+// Init API for collapsed loops (static, no chunks defined).
+// "bounds_nest" has to be allocated per thread.
+// API will modify original bounds_nest array to bring it to a canonical form
+// (only <= and >=, no !=, <, >). If the original loop nest was already in a
+// canonical form there will be no changes to bounds in bounds_nest array
+// (only trip counts will be calculated). Internally API will expand the space
+// to parallelogram/parallelepiped, calculate total, calculate bounds for the
+// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
+// important on the left side, to hit the lower bounds and not step over), and
+// pick the correct chunk for this thread (so it will calculate chunks up to the
+// needed one). It could be optimized to calculate just this chunk, potentially
+// a bit less well distributed among threads. It is designed to make sure that
+// threads will receive predictable chunks, deterministically (so that next nest
+// of loops with similar characteristics will get exactly same chunks on same
+// threads).
+// Current contract: chunk_bounds_nest has only lb0 and ub0,
+// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
+extern "C" kmp_int32
+__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
+ /*in/out*/ bounds_info_t *original_bounds_nest,
+ /*out*/ bounds_info_t *chunk_bounds_nest,
+ kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
+
+ KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
+ KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
+
+ if (__kmp_env_consistency_check) {
+ __kmp_push_workshare(gtid, ct_pdo, loc);
+ }
+
+ kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
+
+ CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
+
+ for (kmp_index_t i = 0; i < n; ++i) {
+ updated_bounds_nest[i].b = original_bounds_nest[i];
+ }
+
+ kmp_loop_nest_iv_t total =
+ kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);
+
+ if (plastiter != NULL) {
+ *plastiter = FALSE;
+ }
+
+ if (total == 0) {
+ // Loop won't execute:
+ return FALSE;
+ }
+
+ // OMPTODO: DISTRIBUTE is not supported yet
+ __kmp_assert_valid_gtid(gtid);
+ kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
+
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_uint32 nth = team->t.t_nproc; // Number of threads
+
+ KMP_DEBUG_ASSERT(tid < nth);
+
+ // Handle special cases
+ nested_loop_type_t loop_type =
+ kmp_identify_nested_loop_structure(original_bounds_nest, n);
+ if (loop_type == nested_loop_type_lower_triangular_matrix) {
+ kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
+ chunk_bounds_nest);
+ return TRUE;
+ } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
+ kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
+ chunk_bounds_nest);
+ return TRUE;
+ }
+
+ CollapseAllocator<kmp_uint64> original_ivs_start(n);
+
+ if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
+ /*out*/ original_ivs_start)) {
+ // Loop won't execute:
+ return FALSE;
+ }
+
+  // Not doing this optimization for one thread:
+  // (1) more to test
+  // (2) without it the current contract holds: chunk_bounds_nest has only lb0
+  //     and ub0, while lb1 and ub1 are set to 0 and can be ignored.
+ // if (nth == 1) {
+ // // One thread:
+ // // Copy all info from original_bounds_nest, it'll be good enough.
+
+ // for (kmp_index_t i = 0; i < n; ++i) {
+ // chunk_bounds_nest[i] = original_bounds_nest[i];
+ // }
+
+ // if (plastiter != NULL) {
+ // *plastiter = TRUE;
+ // }
+ // return TRUE;
+ //}
+
+ kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
+ updated_bounds_nest, original_ivs_start, n);
+
+ bool last_iter = false;
+
+ for (; nth > 0;) {
+    // We could calculate the chunk size once, but this is to compensate for
+    // the fact that the original space is not a parallelepiped and some
+    // threads could otherwise be left without work:
+ KMP_DEBUG_ASSERT(total >= new_iv);
+
+ kmp_loop_nest_iv_t total_left = total - new_iv;
+ kmp_loop_nest_iv_t chunk_size = total_left / nth;
+ kmp_loop_nest_iv_t remainder = total_left % nth;
+
+ kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
+
+ if (remainder > 0) {
+ ++curr_chunk_size;
+ --remainder;
+ }
+
+#if defined(KMP_DEBUG)
+ kmp_loop_nest_iv_t new_iv_for_start = new_iv;
+#endif
+
+ if (curr_chunk_size > 1) {
+ new_iv += curr_chunk_size - 1;
+ }
+
+ CollapseAllocator<kmp_uint64> original_ivs_end(n);
+ if ((nth == 1) || (new_iv >= total - 1)) {
+ // Do this one till the end - just in case we miscalculated
+ // and either too much is left to process or new_iv is a bit too big:
+ kmp_calc_original_ivs_for_end(original_bounds_nest, n,
+ /*out*/ original_ivs_end);
+
+ last_iter = true;
+ } else {
+ // Note: here we make sure it's past (or equal to) the previous point.
+ if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
+ updated_bounds_nest,
+ original_ivs_start, new_iv,
+ /*out*/ original_ivs_end)) {
+ // We could not find the ending point, use the original upper bounds:
+ kmp_calc_original_ivs_for_end(original_bounds_nest, n,
+ /*out*/ original_ivs_end);
+
+ last_iter = true;
+ }
+ }
+
+#if defined(KMP_DEBUG)
+ auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
+ updated_bounds_nest, original_ivs_end, n);
+ KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
+#endif
+
+ if (last_iter && (tid != 0)) {
+      // We are done, this was the last chunk, but no chunk for the current
+      // thread was found:
+ return FALSE;
+ }
+
+ if (tid == 0) {
+ // We found the chunk for this thread, now we need to check if it's the
+ // last chunk or not:
+
+ CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
+ if (last_iter ||
+ !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
+ /*out*/ original_ivs_next_start)) {
+        // no more loop iterations left to process,
+        // which means that the chunk we just found is the last chunk:
+ if (plastiter != NULL) {
+ *plastiter = TRUE;
+ }
+ }
+
+ // Fill in chunk bounds:
+ for (kmp_index_t i = 0; i < n; ++i) {
+ chunk_bounds_nest[i] =
+ original_bounds_nest[i]; // To fill in types, etc. - optional
+ chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
+ chunk_bounds_nest[i].lb1_u64 = 0;
+
+ chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
+ chunk_bounds_nest[i].ub1_u64 = 0;
+ }
+
+ return TRUE;
+ }
+
+ --tid;
+ --nth;
+
+ bool next_chunk = kmp_calc_next_original_ivs(
+ original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
+ if (!next_chunk) {
+      // no more loop iterations to process,
+      // the previous chunk was the last chunk
+ break;
+ }
+
+ // original_ivs_start is next to previous chunk original_ivs_end,
+ // we need to start new chunk here, so chunks will be one after another
+ // without any gap or overlap:
+ new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
+ original_ivs_start, n);
+ }
+
+ return FALSE;
+}
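+
+// Illustrative sketch (disabled): one plausible way for generated code to
+// fill bounds_info_t and consume the chunk returned by
+// __kmpc_for_collapsed_init for "collapse(2): i = 0..N, j = 0..i". The field
+// values and the chunk-walking loop below are assumptions made for
+// exposition only; they are not a description of actual compiler codegen.
+#if 0
+static void example_collapsed_for(ident_t *loc, kmp_int32 gtid, kmp_int32 N) {
+  bounds_info_t nest[2];
+  // Outer loop: 0 <= i <= N, independent of other IVs.
+  nest[0].loop_type = loop_type_t::loop_type_int32;
+  nest[0].loop_iv_type = loop_type_t::loop_type_int32;
+  nest[0].comparison = comparison_t::comp_less_or_eq;
+  nest[0].outer_iv = 0;
+  nest[0].lb0_u64 = 0;
+  nest[0].lb1_u64 = 0;
+  nest[0].ub0_u64 = (kmp_uint64)N;
+  nest[0].ub1_u64 = 0;
+  nest[0].step_64 = 1;
+  // Inner loop: 0 <= j <= i, i.e. ub depends on IV 0 with coefficient 1.
+  nest[1] = nest[0];
+  nest[1].ub0_u64 = 0;
+  nest[1].ub1_u64 = 1;
+
+  bounds_info_t chunk[2];
+  kmp_int32 plastiter = 0;
+  if (__kmpc_for_collapsed_init(loc, gtid, nest, chunk, 2, &plastiter)) {
+    // Per the current contract only lb0/ub0 of the chunk are meaningful:
+    // they hold the first and the last point (in original IVs) of this
+    // thread's contiguous chunk. Walk the original nest between them.
+    kmp_int32 i = (kmp_int32)chunk[0].lb0_u64;
+    kmp_int32 j = (kmp_int32)chunk[1].lb0_u64;
+    while (true) {
+      // body(i, j);
+      if (i == (kmp_int32)chunk[0].ub0_u64 &&
+          j == (kmp_int32)chunk[1].ub0_u64)
+        break;
+      if (j < i) {
+        ++j; // next j within the same row of the triangle
+      } else {
+        ++i; // next row of the triangle
+        j = 0;
+      }
+    }
+  }
+}
+#endif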
diff --git a/contrib/libs/cxxsupp/openmp/kmp_collapse.h b/contrib/libs/cxxsupp/openmp/kmp_collapse.h
new file mode 100644
index 00000000000..1044478554a
--- /dev/null
+++ b/contrib/libs/cxxsupp/openmp/kmp_collapse.h
@@ -0,0 +1,247 @@
+/*
+ * kmp_collapse.h -- header for loop collapse feature
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_COLLAPSE_H
+#define KMP_COLLAPSE_H
+
+#include <type_traits>
+
+// Type of the index into the loop nest structures
+// (with values from 0 to less than n from collapse(n))
+typedef kmp_int32 kmp_index_t;
+
+// Type for combined loop nest space IV:
+typedef kmp_uint64 kmp_loop_nest_iv_t;
+
+// Loop has <, <=, etc. as a comparison:
+enum comparison_t : kmp_int32 {
+ comp_less_or_eq = 0,
+ comp_greater_or_eq = 1,
+ comp_not_eq = 2,
+ comp_less = 3,
+ comp_greater = 4
+};
+
+// Type of loop IV.
+// Type of bounds and step, after usual promotions
+// are a subset of these types (32 & 64 only):
+enum loop_type_t : kmp_int32 {
+ loop_type_uint8 = 0,
+ loop_type_int8 = 1,
+ loop_type_uint16 = 2,
+ loop_type_int16 = 3,
+ loop_type_uint32 = 4,
+ loop_type_int32 = 5,
+ loop_type_uint64 = 6,
+ loop_type_int64 = 7
+};
+
+// Defining loop types to handle special cases
+enum nested_loop_type_t : kmp_int32 {
+ nested_loop_type_unkown = 0,
+ nested_loop_type_lower_triangular_matrix = 1,
+ nested_loop_type_upper_triangular_matrix = 2
+};
+
+/*!
+ @ingroup WORK_SHARING
+ * Describes the structure for rectangular nested loops.
+ */
+template <typename T> struct bounds_infoXX_template {
+
+ // typedef typename traits_t<T>::unsigned_t UT;
+ typedef typename traits_t<T>::signed_t ST;
+
+ loop_type_t loop_type; // The differentiator
+ loop_type_t loop_iv_type;
+ comparison_t comparison;
+  // outer_iv should be 0 (or any other value less than the number of
+  // dimensions) if the loop doesn't depend on it (lb1 and ub1 will be 0).
+  // This way we can do the multiplication without a check.
+ kmp_index_t outer_iv;
+
+ // unions to keep the size constant:
+ union {
+ T lb0;
+ kmp_uint64 lb0_u64; // real type can be signed
+ };
+
+ union {
+ T lb1;
+ kmp_uint64 lb1_u64; // real type can be signed
+ };
+
+ union {
+ T ub0;
+ kmp_uint64 ub0_u64; // real type can be signed
+ };
+
+ union {
+ T ub1;
+ kmp_uint64 ub1_u64; // real type can be signed
+ };
+
+ union {
+ ST step; // signed even if bounds type is unsigned
+ kmp_int64 step_64; // signed
+ };
+
+ kmp_loop_nest_iv_t trip_count;
+};
+
+/*!
+ @ingroup WORK_SHARING
+ * Interface struct for rectangular nested loops.
+ * Same size as bounds_infoXX_template.
+ */
+struct bounds_info_t {
+
+ loop_type_t loop_type; // The differentiator
+ loop_type_t loop_iv_type;
+ comparison_t comparison;
+  // outer_iv should be 0 (or any other value less than the number of
+  // dimensions) if the loop doesn't depend on it (lb1 and ub1 will be 0).
+  // This way we can do the multiplication without a check.
+ kmp_index_t outer_iv;
+
+ kmp_uint64 lb0_u64; // real type can be signed
+ kmp_uint64 lb1_u64; // real type can be signed
+ kmp_uint64 ub0_u64; // real type can be signed
+ kmp_uint64 ub1_u64; // real type can be signed
+ kmp_int64 step_64; // signed
+
+ // This is internal, but it's the only internal thing we need
+ // in rectangular case, so let's expose it here:
+ kmp_loop_nest_iv_t trip_count;
+};
+
+//-------------------------------------------------------------------------
+// Additional types for internal representation:
+
+// Array for a point in the loop space, in the original space.
+// It's represented in kmp_uint64, but each dimension is calculated in
+// that loop's IV type. Also dimensions have to be converted to those types
+// when used in generated code.
+typedef kmp_uint64 *kmp_point_t;
+
+// Array: Number of loop iterations on each nesting level to achieve some point,
+// in expanded space or in original space.
+// OMPTODO: move from using iterations to using offsets (iterations multiplied
+// by steps). For those we need to be careful with the types, as step can be
+// negative, but it'll remove multiplications and divisions in several places.
+typedef kmp_loop_nest_iv_t *kmp_iterations_t;
+
+// Internal struct with additional info:
+template <typename T> struct bounds_info_internalXX_template {
+
+  // OMPTODO: should span have type T, or would it be better to use
+  // kmp_uint64/kmp_int64 depending on the sign of T? (if kmp_uint64/kmp_int64,
+  // then updated bounds should probably also be kmp_uint64/kmp_int64). I'd
+  // like to use big_span_t, if it can be resolved at compile time.
+ typedef
+ typename std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
+ big_span_t;
+
+ // typedef typename big_span_t span_t;
+ typedef T span_t;
+
+ bounds_infoXX_template<T> b; // possibly adjusted bounds
+
+ // Leaving this as a union in case we'll switch to span_t with different sizes
+ // (depending on T)
+ union {
+ // Smallest possible value of iv (may be smaller than actually possible)
+ span_t span_smallest;
+ kmp_uint64 span_smallest_u64;
+ };
+
+ // Leaving this as a union in case we'll switch to span_t with different sizes
+ // (depending on T)
+ union {
+ // Biggest possible value of iv (may be bigger than actually possible)
+ span_t span_biggest;
+ kmp_uint64 span_biggest_u64;
+ };
+
+ // Did we adjust loop bounds (not counting canonicalization)?
+ bool loop_bounds_adjusted;
+};
+
+// Internal struct with additional info:
+struct bounds_info_internal_t {
+
+ bounds_info_t b; // possibly adjusted bounds
+
+ // Smallest possible value of iv (may be smaller than actually possible)
+ kmp_uint64 span_smallest_u64;
+
+ // Biggest possible value of iv (may be bigger than actually possible)
+ kmp_uint64 span_biggest_u64;
+
+ // Did we adjust loop bounds (not counting canonicalization)?
+ bool loop_bounds_adjusted;
+};
+
+//----------APIs for rectangular loop nests--------------------------------
+
+// Canonicalize loop nest and calculate overall trip count.
+// "bounds_nest" has to be allocated per thread.
+// API will modify original bounds_nest array to bring it to a canonical form
+// (only <= and >=, no !=, <, >). If the original loop nest was already in a
+// canonical form there will be no changes to bounds in bounds_nest array
+// (only trip counts will be calculated).
+// Returns trip count of overall space.
+extern "C" kmp_loop_nest_iv_t
+__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
+ /*in/out*/ bounds_info_t *original_bounds_nest,
+ kmp_index_t n);
+
+// Calculate old induction variables corresponding to overall new_iv.
+// Note: original IV will be returned as if it had kmp_uint64 type,
+// will have to be converted to original type in user code.
+// Note: trip counts should be already calculated by
+// __kmpc_process_loop_nest_rectang.
+// OMPTODO: special case 2, 3 nested loops - if it'll be possible to inline
+// that into user code.
+extern "C" void
+__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
+ const bounds_info_t *original_bounds_nest,
+ /*out*/ kmp_uint64 *original_ivs,
+ kmp_index_t n);
+
+//----------Init API for non-rectangular loops--------------------------------
+
+// Init API for collapsed loops (static, no chunks defined).
+// "bounds_nest" has to be allocated per thread.
+// API will modify original bounds_nest array to bring it to a canonical form
+// (only <= and >=, no !=, <, >). If the original loop nest was already in a
+// canonical form there will be no changes to bounds in bounds_nest array
+// (only trip counts will be calculated). Internally API will expand the space
+// to parallelogram/parallelepiped, calculate total, calculate bounds for the
+// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
+// important on the left side, to hit the lower bounds and not step over), and
+// pick the correct chunk for this thread (so it will calculate chunks up to the
+// needed one). It could be optimized to calculate just this chunk, potentially
+// a bit less well distributed among threads. It is designed to make sure that
+// threads will receive predictable chunks, deterministically (so that next nest
+// of loops with similar characteristics will get exactly same chunks on same
+// threads).
+// Current contract: chunk_bounds_nest has only lb0 and ub0,
+// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
+extern "C" kmp_int32
+__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
+ /*in/out*/ bounds_info_t *original_bounds_nest,
+ /*out*/ bounds_info_t *chunk_bounds_nest,
+ kmp_index_t n,
+ /*out*/ kmp_int32 *plastiter);
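A hypothetical compiler-generated caller, to show how the pieces fit together; run_collapsed is an invented name and the return value is assumed to be nonzero when this thread was handed a chunk:

#include "kmp_collapse.h"

void run_collapsed(ident_t *loc, kmp_int32 gtid,
                   /*in/out*/ bounds_info_t *orig_nest, kmp_index_t n) {
  bounds_info_t chunk_nest[3]; // assumes n <= 3 for this sketch
  kmp_int32 plastiter = 0;
  if (__kmpc_for_collapsed_init(loc, gtid, orig_nest, chunk_nest, n,
                                &plastiter)) {
    // Per the contract above, only lb0/ub0 of each chunk_nest[k] are
    // meaningful; iterate that sub-box and map back to the original IVs.
  }
  if (plastiter) {
    // presumably this thread ran the chunk containing the last iteration,
    // so lastprivate-style finalization would go here
  }
}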
+
+#endif // KMP_COLLAPSE_H
diff --git a/contrib/libs/cxxsupp/openmp/kmp_config-linux.h b/contrib/libs/cxxsupp/openmp/kmp_config-linux.h
index 2f7a7f9320a..a0289547f10 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_config-linux.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_config-linux.h
@@ -42,6 +42,8 @@
#define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
#define LIBOMP_OMPD_SUPPORT 0
#define OMPD_SUPPORT LIBOMP_OMPD_SUPPORT
+#define LIBOMP_OMPX_TASKGRAPH 0
+#define OMPX_TASKGRAPH LIBOMP_OMPX_TASKGRAPH
#define LIBOMP_PROFILING_SUPPORT 0
#define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT
#define LIBOMP_OMPT_OPTIONAL 1
@@ -90,12 +92,16 @@
#define KMP_HAVE_POSIX_MEMALIGN LIBOMP_HAVE_POSIX_MEMALIGN
#define LIBOMP_HAVE__ALIGNED_MALLOC 0
#define KMP_HAVE__ALIGNED_MALLOC LIBOMP_HAVE__ALIGNED_MALLOC
+#define OPENMP_ENABLE_LIBOMPTARGET 0
+#define ENABLE_LIBOMPTARGET OPENMP_ENABLE_LIBOMPTARGET
// Configured cache line based on architecture
-#if KMP_ARCH_PPC64
+#if KMP_ARCH_PPC64 || KMP_ARCH_PPC
# define CACHE_LINE 128
#elif KMP_ARCH_AARCH64_A64FX
# define CACHE_LINE 256
+#elif KMP_ARCH_S390X
+# define CACHE_LINE 256
#else
# define CACHE_LINE 64
#endif
diff --git a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp
index c932d450c84..fdbf9ff45e3 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp
@@ -18,6 +18,7 @@
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"
+#include "kmp_utils.h"
#include "ompt-specific.h"
#define MAX_MESSAGE 512
@@ -236,6 +237,50 @@ void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
__kmp_push_num_threads(loc, global_tid, num_threads);
}
+void __kmpc_push_num_threads_strict(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 num_threads, int severity,
+ const char *message) {
+ __kmp_push_num_threads(loc, global_tid, num_threads);
+ __kmp_set_strict_num_threads(loc, global_tid, severity, message);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param list_length number of entries in the num_threads_list array
+@param num_threads_list array of numbers of threads requested for this parallel
+construct and subsequent nested parallel constructs
+
+Set the number of threads to be used by the next fork spawned by this thread,
+and some nested forks as well.
+This call is only required if the parallel construct has a `num_threads` clause
+that has a list of integers as the argument.
+*/
+void __kmpc_push_num_threads_list(ident_t *loc, kmp_int32 global_tid,
+ kmp_uint32 list_length,
+ kmp_int32 *num_threads_list) {
+ KA_TRACE(20, ("__kmpc_push_num_threads_list: enter T#%d num_threads_list=",
+ global_tid));
+ KA_TRACE(20, ("%d", num_threads_list[0]));
+#ifdef KMP_DEBUG
+ for (kmp_uint32 i = 1; i < list_length; ++i)
+ KA_TRACE(20, (", %d", num_threads_list[i]));
+#endif
+  KA_TRACE(20, ("\n"));
+
+ __kmp_assert_valid_gtid(global_tid);
+ __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
+}
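For example, a num_threads clause written with a list might be lowered along these lines; outlined_fn is an invented name for the compiler-outlined region:

// Hypothetical lowering of "#pragma omp parallel num_threads(4, 2)":
// 4 threads for this region, 2 for the next nested parallel region.
void emit_parallel_with_nt_list(ident_t *loc, kmp_int32 gtid,
                                kmpc_micro outlined_fn) {
  kmp_int32 nt_list[2] = {4, 2};
  __kmpc_push_num_threads_list(loc, gtid, /*list_length=*/2, nt_list);
  __kmpc_fork_call(loc, /*argc=*/0, outlined_fn);
}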
+
+void __kmpc_push_num_threads_list_strict(ident_t *loc, kmp_int32 global_tid,
+ kmp_uint32 list_length,
+ kmp_int32 *num_threads_list,
+ int severity, const char *message) {
+ __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
+ __kmp_set_strict_num_threads(loc, global_tid, severity, message);
+}
+
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
/* the num_threads are automatically popped */
@@ -332,6 +377,55 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
/*!
@ingroup PARALLEL
+@param loc source location information
+@param microtask pointer to callback routine consisting of outlined parallel
+construct
+@param cond condition for running in parallel
+@param args struct of pointers to shared variables that aren't global
+
+Perform a fork only if the condition is true.
+*/
+void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
+ kmp_int32 cond, void *args) {
+ int gtid = __kmp_entry_gtid();
+ if (cond) {
+ if (args)
+ __kmpc_fork_call(loc, argc, microtask, args);
+ else
+ __kmpc_fork_call(loc, argc, microtask);
+ } else {
+ __kmpc_serialized_parallel(loc, gtid);
+
+#if OMPT_SUPPORT
+ void *exit_frame_ptr;
+#endif
+
+ if (args)
+ __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
+ /*npr=*/0,
+ /*argc=*/1, &args
+#if OMPT_SUPPORT
+ ,
+ &exit_frame_ptr
+#endif
+ );
+ else
+ __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
+ /*npr=*/0,
+ /*argc=*/0,
+ /*args=*/nullptr
+#if OMPT_SUPPORT
+ ,
+ &exit_frame_ptr
+#endif
+ );
+
+ __kmpc_end_serialized_parallel(loc, gtid);
+ }
+}
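In other words, this is the runtime half of "#pragma omp parallel if(cond)": fork when the condition holds, otherwise run the outlined body once on the encountering thread via the serialized path. A hypothetical call site; the capture struct and function names are invented:

struct shared_frame { int *x; }; // invented capture struct for shared(x)

void emit_parallel_if(ident_t *loc, kmpc_micro outlined_fn, int cond,
                      struct shared_frame *frame) {
  __kmpc_fork_call_if(loc, /*argc=*/1, outlined_fn, cond, frame);
}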
+
+/*!
+@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@@ -354,6 +448,24 @@ void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
+@param thread_limit limit on number of threads which can be created within the
+current task
+
+Set the thread_limit for the current task.
+This call exists to support the `thread_limit` clause on the `target` construct.
+*/
+void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 thread_limit) {
+ __kmp_assert_valid_gtid(global_tid);
+ kmp_info_t *thread = __kmp_threads[global_tid];
+ if (thread_limit > 0)
+ thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit;
+}
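A sketch of the intended use, assuming the host-side lowering of a target region with a thread_limit(8) clause calls this before running the outlined body; enter_target_region is an invented name:

void enter_target_region(ident_t *loc, kmp_int32 gtid) {
  __kmpc_set_thread_limit(loc, gtid, /*thread_limit=*/8);
  // From here on, omp_get_thread_limit() in this task reports 8, because
  // kmp_ftn_entry.h now prefers td_icvs.task_thread_limit when it is set.
}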
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
@param num_teams_lb lower bound on number of teams requested for the teams
construct
@param num_teams_ub upper bound on number of teams requested for the teams
@@ -586,6 +698,12 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
serial_team->t.t_dispatch->th_disp_buffer->next;
__kmp_free(disp_buffer);
}
+
+ /* pop the task team stack */
+ if (serial_team->t.t_serialized > 1) {
+ __kmp_pop_task_team_node(this_thr, serial_team);
+ }
+
this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
--serial_team->t.t_serialized;
@@ -624,6 +742,11 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
this_thr->th.th_current_task->td_flags.executing = 1;
if (__kmp_tasking_mode != tskm_immediate_exec) {
+ // Restore task state from serial team structure
+ KMP_DEBUG_ASSERT(serial_team->t.t_primary_task_state == 0 ||
+ serial_team->t.t_primary_task_state == 1);
+ this_thr->th.th_task_state =
+ (kmp_uint8)serial_team->t.t_primary_task_state;
// Copy the task team from the new child / old parent team to the thread.
this_thr->th.th_task_team =
this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
@@ -633,7 +756,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
}
#if KMP_AFFINITY_SUPPORTED
- if (this_thr->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
+ if (this_thr->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
__kmp_reset_root_init_mask(global_tid);
}
#endif
@@ -668,45 +791,7 @@ void __kmpc_flush(ident_t *loc) {
KC_TRACE(10, ("__kmpc_flush: called\n"));
/* need explicit __mf() here since use volatile instead in library */
- KMP_MB(); /* Flush all pending memory write invalidates. */
-
-#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
-#if KMP_MIC
-// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
-// We shouldn't need it, though, since the ABI rules require that
-// * If the compiler generates NGO stores it also generates the fence
-// * If users hand-code NGO stores they should insert the fence
-// therefore no incomplete unordered stores should be visible.
-#else
- // C74404
- // This is to address non-temporal store instructions (sfence needed).
- // The clflush instruction is addressed either (mfence needed).
- // Probably the non-temporal load monvtdqa instruction should also be
- // addressed.
- // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
- if (!__kmp_cpuinfo.initialized) {
- __kmp_query_cpuid(&__kmp_cpuinfo);
- }
- if (!__kmp_cpuinfo.flags.sse2) {
- // CPU cannot execute SSE2 instructions.
- } else {
-#if KMP_COMPILER_ICC || KMP_COMPILER_ICX
- _mm_mfence();
-#elif KMP_COMPILER_MSVC
- MemoryBarrier();
-#else
- __sync_synchronize();
-#endif // KMP_COMPILER_ICC || KMP_COMPILER_ICX
- }
-#endif // KMP_MIC
-#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
- KMP_ARCH_RISCV64)
-// Nothing to see here move along
-#elif KMP_ARCH_PPC64
-// Nothing needed here (we have a real MB above).
-#else
-#error Unknown or unsupported architecture
-#endif
+ KMP_MFENCE(); /* Flush all pending memory write invalidates. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_flush) {
@@ -1504,8 +1589,9 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
if (*lk == 0) {
if (KMP_IS_D_LOCK(lockseq)) {
- KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
- KMP_GET_D_TAG(lockseq));
+ (void)KMP_COMPARE_AND_STORE_ACQ32(
+ (volatile kmp_int32 *)&((kmp_base_tas_lock_t *)crit)->poll, 0,
+ KMP_GET_D_TAG(lockseq));
} else {
__kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
}
@@ -1920,13 +2006,13 @@ void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_work) {
- ompt_work_t ompt_work_type = ompt_work_loop;
+ ompt_work_t ompt_work_type = ompt_work_loop_static;
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
// Determine workshare type
if (loc != NULL) {
if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
- ompt_work_type = ompt_work_loop;
+ ompt_work_type = ompt_work_loop_static;
} else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
ompt_work_type = ompt_work_sections;
} else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
@@ -2027,7 +2113,8 @@ void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
__kmp_assign_root_init_mask();
gtid = __kmp_get_gtid();
#if KMP_AFFINITY_SUPPORTED
- if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
+ if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
+ __kmp_affinity.flags.reset) {
__kmp_reset_root_init_mask(gtid);
}
#endif
@@ -2045,7 +2132,8 @@ size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
__kmp_assign_root_init_mask();
gtid = __kmp_get_gtid();
#if KMP_AFFINITY_SUPPORTED
- if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
+ if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
+ __kmp_affinity.flags.reset) {
__kmp_reset_root_init_mask(gtid);
}
#endif
@@ -2070,14 +2158,15 @@ void kmpc_set_stacksize_s(size_t arg) {
}
void kmpc_set_blocktime(int arg) {
- int gtid, tid;
+ int gtid, tid, bt = arg;
kmp_info_t *thread;
gtid = __kmp_entry_gtid();
tid = __kmp_tid_from_gtid(gtid);
thread = __kmp_thread_from_gtid(gtid);
- __kmp_aux_set_blocktime(arg, thread, tid);
+ __kmp_aux_convert_blocktime(&bt);
+ __kmp_aux_set_blocktime(bt, thread, tid);
}
void kmpc_set_library(int arg) {
@@ -3155,7 +3244,7 @@ int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
codeptr = OMPT_GET_RETURN_ADDRESS(0);
if (ompt_enabled.ompt_callback_mutex_acquire) {
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
- ompt_mutex_lock, omp_lock_hint_none,
+ ompt_mutex_test_lock, omp_lock_hint_none,
__ompt_get_mutex_impl_type(user_lock),
(ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
}
@@ -3179,7 +3268,7 @@ int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_mutex_acquired) {
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
- ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+ ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
}
#endif
return FTN_TRUE;
@@ -3219,7 +3308,7 @@ int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
codeptr = OMPT_GET_RETURN_ADDRESS(0);
if (ompt_enabled.ompt_callback_mutex_acquire) {
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
- ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+ ompt_mutex_test_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
(ompt_wait_id_t)(uintptr_t)lck, codeptr);
}
#endif
@@ -3235,7 +3324,7 @@ int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
- ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
+ ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
}
#endif
@@ -3260,7 +3349,7 @@ int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
codeptr = OMPT_GET_RETURN_ADDRESS(0);
if (ompt_enabled.ompt_callback_mutex_acquire) {
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
- ompt_mutex_nest_lock, omp_lock_hint_none,
+ ompt_mutex_test_nest_lock, omp_lock_hint_none,
__ompt_get_mutex_impl_type(user_lock),
(ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
}
@@ -3279,7 +3368,7 @@ int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
if (ompt_enabled.ompt_callback_mutex_acquired) {
// lock_first
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
- ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
+ ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
codeptr);
}
} else {
@@ -3326,7 +3415,7 @@ int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
if (ompt_enabled.enabled) &&
ompt_enabled.ompt_callback_mutex_acquire) {
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
- ompt_mutex_nest_lock, omp_lock_hint_none,
+ ompt_mutex_test_nest_lock, omp_lock_hint_none,
__ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
codeptr);
}
@@ -3346,7 +3435,7 @@ int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
if (ompt_enabled.ompt_callback_mutex_acquired) {
// lock_first
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
- ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
+ ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
}
} else {
if (ompt_enabled.ompt_callback_nest_lock) {
@@ -3397,8 +3486,8 @@ __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
// Check if it is initialized.
if (*lk == 0) {
if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
- KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
- KMP_GET_D_TAG(__kmp_user_lock_seq));
+ (void)KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
+ KMP_GET_D_TAG(__kmp_user_lock_seq));
} else {
__kmp_init_indirect_csptr(crit, loc, global_tid,
KMP_GET_I_TAG(__kmp_user_lock_seq));
@@ -4200,7 +4289,7 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
up = pr_buf->th_doacross_info[3];
st = pr_buf->th_doacross_info[4];
#if OMPT_SUPPORT && OMPT_OPTIONAL
- ompt_dependence_t deps[num_dims];
+ SimpleVLA<ompt_dependence_t> deps(num_dims);
#endif
if (st == 1) { // most common case
if (vec[0] < lo || vec[0] > up) {
@@ -4312,7 +4401,7 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
lo = pr_buf->th_doacross_info[2];
st = pr_buf->th_doacross_info[4];
#if OMPT_SUPPORT && OMPT_OPTIONAL
- ompt_dependence_t deps[num_dims];
+ SimpleVLA<ompt_dependence_t> deps(num_dims);
#endif
if (st == 1) { // most common case
iter_number = vec[0] - lo;
@@ -4458,7 +4547,7 @@ void __kmpc_error(ident_t *loc, int severity, const char *message) {
if (loc && loc->psource) {
kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
src_loc =
- __kmp_str_format("%s:%s:%s", str_loc.file, str_loc.line, str_loc.col);
+ __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
__kmp_str_loc_free(&str_loc);
} else {
src_loc = __kmp_str_format("unknown");
diff --git a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp
index 8acf3d429e6..2431e3a5cab 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp
@@ -90,6 +90,70 @@ static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
return monotonicity;
}
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+// Return floating point number rounded to two decimal points
+static inline float __kmp_round_2decimal_val(float num) {
+ return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
+}
+static inline int __kmp_get_round_val(float num) {
+ return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
+}
+#endif
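A few illustrative values for the two helpers (inputs chosen arbitrarily, only to pin down the rounding behaviour):

//   __kmp_get_round_val(3.6f)  == 4    (3.6 + 0.5 -> 4.1, truncated to 4)
//   __kmp_get_round_val(-2.6f) == -3   (-2.6 - 0.5 -> -3.1, truncated to -3)
//   __kmp_round_2decimal_val(3.14159f) ~= 3.14f  (314.659 -> 314 -> 3.14)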
+
+template <typename T>
+inline void
+__kmp_initialize_self_buffer(kmp_team_t *team, T id,
+ dispatch_private_info_template<T> *pr,
+ typename traits_t<T>::unsigned_t nchunks, T nproc,
+ typename traits_t<T>::unsigned_t &init,
+ T &small_chunk, T &extras, T &p_extra) {
+
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ if (pr->flags.use_hybrid) {
+ kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
+ kmp_hw_core_type_t type =
+ (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
+ T pchunks = pr->u.p.pchunks;
+ T echunks = nchunks - pchunks;
+ T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
+ T num_procs_with_ecore = nproc - num_procs_with_pcore;
+ T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
+ T big_chunk =
+ pchunks / num_procs_with_pcore; // chunks per thread with p-core
+ small_chunk =
+ echunks / num_procs_with_ecore; // chunks per thread with e-core
+
+ extras =
+ (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
+
+ p_extra = (big_chunk - small_chunk);
+
+ if (type == KMP_HW_CORE_TYPE_CORE) {
+ if (id < first_thread_with_ecore) {
+ init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
+ (id < extras ? id : extras);
+ }
+ } else {
+ if (id == first_thread_with_ecore) {
+ init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + first_thread_with_ecore * p_extra +
+ (id < extras ? id : extras);
+ }
+ }
+ p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
+ return;
+ }
+#endif
+
+ small_chunk = nchunks / nproc; // chunks per thread
+ extras = nchunks % nproc;
+ p_extra = 0;
+ init = id * small_chunk + (id < extras ? id : extras);
+}
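To make the fallback (non-hybrid) arithmetic concrete, a small worked case with numbers chosen for illustration:

// Non-hybrid path, nchunks = 10, nproc = 4:
//   small_chunk = 10 / 4 = 2,  extras = 10 % 4 = 2,  p_extra = 0
//   init(id) = id * 2 + min(id, 2)                      -> 0, 3, 6, 8
//   ub(id)   = init + small_chunk + p_extra + (id < 2)  -> 3, 6, 8, 10  (at the call sites)
// i.e. threads 0..3 start from chunk 0, 3, 6, 8 and own 3, 3, 2, 2 chunks.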
+
#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)
UNUSED = 0,
@@ -366,7 +430,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_steal: {
- T ntc, init;
+ T ntc, init = 0;
KD_TRACE(100,
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
@@ -376,7 +440,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
if (nproc > 1 && ntc >= nproc) {
KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
T id = tid;
- T small_chunk, extras;
+ T small_chunk, extras, p_extra = 0;
kmp_uint32 old = UNUSED;
int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
if (traits_t<T>::type_size > 4) {
@@ -388,13 +452,110 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
__kmp_init_lock(pr->u.p.steal_lock);
}
- small_chunk = ntc / nproc;
- extras = ntc % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ // Iterations are divided in a 60/40 skewed distribution among CORE and
+ // ATOM processors for hybrid systems
+ bool use_hybrid = false;
+ kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
+ T first_thread_with_ecore = 0;
+ T num_procs_with_pcore = 0;
+ T num_procs_with_ecore = 0;
+ T p_ntc = 0, e_ntc = 0;
+ if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
+ __kmp_affinity.type != affinity_explicit) {
+ use_hybrid = true;
+ core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
+ if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
+ __kmp_first_osid_with_ecore > -1) {
+ for (int i = 0; i < team->t.t_nproc; ++i) {
+ kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
+ ->th.th_topology_attrs.core_type;
+ int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
+ if (id == __kmp_first_osid_with_ecore) {
+ first_thread_with_ecore =
+ team->t.t_threads[i]->th.th_info.ds.ds_tid;
+ }
+ if (type == KMP_HW_CORE_TYPE_CORE) {
+ num_procs_with_pcore++;
+ } else if (type == KMP_HW_CORE_TYPE_ATOM) {
+ num_procs_with_ecore++;
+ } else {
+ use_hybrid = false;
+ break;
+ }
+ }
+ }
+ if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
+ float multiplier = 60.0 / 40.0;
+ float p_ratio = (float)num_procs_with_pcore / nproc;
+ float e_ratio = (float)num_procs_with_ecore / nproc;
+ float e_multiplier =
+ (float)1 /
+ (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
+ float p_multiplier = multiplier * e_multiplier;
+ p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
+ if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
+ e_ntc =
+ (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
+ else
+ e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
+ KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
+
+ // Use regular static steal if not enough chunks for skewed
+ // distribution
+ use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
+ e_ntc >= num_procs_with_ecore)
+ ? true
+ : false);
+ } else {
+ use_hybrid = false;
+ }
+ }
+ pr->flags.use_hybrid = use_hybrid;
+ pr->u.p.pchunks = p_ntc;
+ pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
+ pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
+
+ if (use_hybrid) {
+ KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
+ T big_chunk = p_ntc / num_procs_with_pcore;
+ small_chunk = e_ntc / num_procs_with_ecore;
+
+ extras =
+ (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
+
+ p_extra = (big_chunk - small_chunk);
+
+ if (core_type == KMP_HW_CORE_TYPE_CORE) {
+ if (id < first_thread_with_ecore) {
+ init =
+ id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
+ (id < extras ? id : extras);
+ }
+ } else {
+ if (id == first_thread_with_ecore) {
+ init =
+ id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + first_thread_with_ecore * p_extra +
+ (id < extras ? id : extras);
+ }
+ }
+ p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
+ } else
+#endif
+ {
+ small_chunk = ntc / nproc;
+ extras = ntc % nproc;
+ init = id * small_chunk + (id < extras ? id : extras);
+ p_extra = 0;
+ }
pr->u.p.count = init;
if (claimed) { // are we succeeded in claiming own buffer?
- pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
// Other threads will inspect steal_flag when searching for a victim.
// READY means other threads may steal from this thread from now on.
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
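A worked instance of the 60/40 skew above, with purely illustrative numbers: 48 chunks on a 12-thread team, 4 threads bound to P-cores and 8 to E-cores.

// ntc = 48, nproc = 12, num_procs_with_pcore = 4, num_procs_with_ecore = 8:
//   multiplier   = 60/40                    = 1.5
//   e_multiplier = 1 / (1.5 * 4/12 + 8/12)  = 6/7
//   p_multiplier = 1.5 * 6/7                = 9/7
//   p_ntc = round(48 * 4/12 * 9/7)            = round(20.57) = 21
//   e_ntc = (int)round2dec(48 * 8/12 * 6/7)   = (int)27.43   = 27   (21 + 27 == 48)
//   big_chunk = 21/4 = 5, small_chunk = 27/8 = 3, extras = 1 + 3 = 4, p_extra = 2
// so P-core threads average 5.25 chunks each vs. 3.375 for E-core threads,
// close to the intended 1.5x skew.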
@@ -1003,8 +1164,9 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
ompt_callbacks.ompt_callback(ompt_callback_work)(
- ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
- &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
+ ompt_get_work_schedule(pr->schedule), ompt_scope_begin,
+ &(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,
+ OMPT_LOAD_RETURN_ADDRESS(gtid));
}
#endif
KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
@@ -1261,13 +1423,13 @@ int __kmp_dispatch_next_algorithm(int gtid,
if (status) {
// initialize self buffer with victim's whole range of chunks
T id = victimId;
- T small_chunk, extras;
- small_chunk = nchunks / nproc; // chunks per thread
- extras = nchunks % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
+ T small_chunk = 0, extras = 0, p_extra = 0;
+ __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
+ init, small_chunk, extras,
+ p_extra);
__kmp_acquire_lock(lck, gtid);
pr->u.p.count = init + 1; // exclude one we execute immediately
- pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
__kmp_release_lock(lck, gtid);
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
// no need to reinitialize other thread invariants: lb, st, etc.
@@ -1275,10 +1437,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
{
char *buff;
// create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<UT>::spec, traits_t<T>::spec);
+ buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+ "stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
@@ -1289,7 +1451,7 @@ int __kmp_dispatch_next_algorithm(int gtid,
break;
}
}
- if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
+ if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
v->u.p.count >= (UT)v->u.p.ub) {
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
continue; // no chunks to steal, try next victim
@@ -1404,12 +1566,12 @@ int __kmp_dispatch_next_algorithm(int gtid,
if (status) {
// initialize self buffer with victim's whole range of chunks
T id = victimId;
- T small_chunk, extras;
- small_chunk = nchunks / nproc; // chunks per thread
- extras = nchunks % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
+ T small_chunk = 0, extras = 0, p_extra = 0;
+ __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
+ init, small_chunk, extras,
+ p_extra);
vnew.p.count = init + 1;
- vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
// write pair (count, ub) at once atomically
#if KMP_ARCH_X86
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
@@ -1422,10 +1584,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
{
char *buff;
// create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<UT>::spec, traits_t<T>::spec);
+ buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+ "stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
@@ -1960,8 +2122,8 @@ int __kmp_dispatch_next_algorithm(int gtid,
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
ompt_callbacks.ompt_callback(ompt_callback_work)( \
- ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
- &(task_info->task_data), 0, codeptr); \
+ ompt_get_work_schedule(pr->schedule), ompt_scope_end, \
+ &(team_info->parallel_data), &(task_info->task_data), 0, codeptr); \
} \
}
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \
@@ -2236,6 +2398,8 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
sh->u.s.ordered_iteration = 0;
}
+ KMP_MB(); /* Flush all pending memory write invalidates. */
+
sh->buffer_index += __kmp_dispatch_num_buffers;
KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
gtid, sh->buffer_index));
@@ -2387,7 +2551,7 @@ thread
kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
kmp_int32 numberOfSections) {
- KMP_TIME_PARTITIONED_BLOCK(OMP_sections);
+ KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
kmp_info_t *th = __kmp_threads[gtid];
#ifdef KMP_DEBUG
@@ -2460,7 +2624,6 @@ kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
ompt_dispatch_section, instance);
}
#endif
- KMP_POP_PARTITIONED_TIMER();
}
return sectionIndex;
@@ -2492,9 +2655,9 @@ void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
&(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
}
#endif
- KMP_POP_PARTITIONED_TIMER();
}
+ KMP_POP_PARTITIONED_TIMER();
KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
}
@@ -2847,6 +3010,11 @@ See @ref __kmpc_dispatch_fini_4
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
__kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
+
+/*!
+See @ref __kmpc_dispatch_deinit
+*/
+void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid) {}
/*! @} */
//-----------------------------------------------------------------------------
diff --git a/contrib/libs/cxxsupp/openmp/kmp_dispatch.h b/contrib/libs/cxxsupp/openmp/kmp_dispatch.h
index 154db174613..cf19eb52662 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_dispatch.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_dispatch.h
@@ -75,14 +75,17 @@ template <typename T> struct dispatch_private_infoXX_template {
ST st; // signed
UT tc; // unsigned
kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+ UT ordered_lower; // unsigned
+ UT ordered_upper; // unsigned
+
/* parm[1-4] are used in different ways by different scheduling algorithms */
- // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+ // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.
// Because of parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).
-
struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
T parm1;
T parm2;
@@ -90,8 +93,11 @@ template <typename T> struct dispatch_private_infoXX_template {
T parm4;
};
- UT ordered_lower; // unsigned
- UT ordered_upper; // unsigned
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ UT pchunks; // total number of chunks for processes with p-core
+ UT num_procs_with_pcore; // number of threads with p-core
+ T first_thread_with_ecore;
+#endif
#if KMP_OS_WINDOWS
T last_upper;
#endif /* KMP_OS_WINDOWS */
diff --git a/contrib/libs/cxxsupp/openmp/kmp_environment.cpp b/contrib/libs/cxxsupp/openmp/kmp_environment.cpp
index b35027b57f0..4def6ea9ac2 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_environment.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_environment.cpp
@@ -407,9 +407,11 @@ ___kmp_env_blk_parse_unix(kmp_env_blk_t *block, // M: Env block to fill.
int i;
var = bulk;
for (i = 0; i < count; ++i) {
+ KMP_ASSERT(var < bulk + size);
+ [[maybe_unused]] size_t ssize = size - (var - bulk);
// Copy variable to bulk.
len = KMP_STRLEN(env[i]);
- KMP_MEMCPY_S(var, size, env[i], len + 1);
+ KMP_MEMCPY_S(var, ssize, env[i], len + 1);
// Save found variable in vars array.
__kmp_str_split(var, '=', &name, &value);
vars[i].name = name;
diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h
index 6b332244c6d..8882899c483 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h
@@ -112,17 +112,19 @@ void FTN_STDCALL FTN_SET_BLOCKTIME(int KMP_DEREF arg) {
#ifdef KMP_STUB
__kmps_set_blocktime(KMP_DEREF arg);
#else
- int gtid, tid;
+ int gtid, tid, bt = (KMP_DEREF arg);
kmp_info_t *thread;
gtid = __kmp_entry_gtid();
tid = __kmp_tid_from_gtid(gtid);
thread = __kmp_thread_from_gtid(gtid);
- __kmp_aux_set_blocktime(KMP_DEREF arg, thread, tid);
+ __kmp_aux_convert_blocktime(&bt);
+ __kmp_aux_set_blocktime(bt, thread, tid);
#endif
}
+// Gets blocktime in units used for KMP_BLOCKTIME, ms otherwise
int FTN_STDCALL FTN_GET_BLOCKTIME(void) {
#ifdef KMP_STUB
return __kmps_get_blocktime();
@@ -136,21 +138,24 @@ int FTN_STDCALL FTN_GET_BLOCKTIME(void) {
/* These must match the settings used in __kmp_wait_sleep() */
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
- KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
- team->t.t_id, tid, KMP_MAX_BLOCKTIME));
+ KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid,
+ team->t.t_id, tid, KMP_MAX_BLOCKTIME, __kmp_blocktime_units));
return KMP_MAX_BLOCKTIME;
}
#ifdef KMP_ADJUST_BLOCKTIME
else if (__kmp_zero_bt && !get__bt_set(team, tid)) {
- KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
- team->t.t_id, tid, 0));
+ KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid,
+ team->t.t_id, tid, 0, __kmp_blocktime_units));
return 0;
}
#endif /* KMP_ADJUST_BLOCKTIME */
else {
- KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
- team->t.t_id, tid, get__blocktime(team, tid)));
- return get__blocktime(team, tid);
+ int bt = get__blocktime(team, tid);
+ if (__kmp_blocktime_units == 'm')
+ bt = bt / 1000;
+ KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid,
+ team->t.t_id, tid, bt, __kmp_blocktime_units));
+ return bt;
}
#endif
}
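A small round-trip sketch, assuming the default millisecond units and that __kmp_aux_convert_blocktime() scales the user value to the internal microsecond representation noted in kmp_global.cpp:

#include <omp.h> // declares the kmp_set_blocktime / kmp_get_blocktime extensions

void blocktime_round_trip(void) {
  kmp_set_blocktime(200);       // user units: 200 ms; assumed stored as 200000 us
  int bt = kmp_get_blocktime(); // the 'm' units path divides by 1000 -> 200
  (void)bt;
}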
@@ -239,7 +244,8 @@ int FTN_STDCALL FTN_GET_AFFINITY(void **mask) {
}
__kmp_assign_root_init_mask();
int gtid = __kmp_get_gtid();
- if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
+ if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
+ __kmp_affinity.flags.reset) {
__kmp_reset_root_init_mask(gtid);
}
return __kmp_aux_get_affinity(mask);
@@ -365,7 +371,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_THREADS)(void) {
gtid = __kmp_entry_gtid();
thread = __kmp_threads[gtid];
#if KMP_AFFINITY_SUPPORTED
- if (thread->th.th_team->t.t_level == 0 && !__kmp_affin_reset) {
+ if (thread->th.th_team->t.t_level == 0 && !__kmp_affinity.flags.reset) {
__kmp_assign_root_init_mask();
}
#endif
@@ -518,7 +524,8 @@ void FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_DISPLAY_AFFINITY)(
__kmp_assign_root_init_mask();
gtid = __kmp_get_gtid();
#if KMP_AFFINITY_SUPPORTED
- if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
+ if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
+ __kmp_affinity.flags.reset) {
__kmp_reset_root_init_mask(gtid);
}
#endif
@@ -551,7 +558,8 @@ size_t FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_CAPTURE_AFFINITY)(
__kmp_assign_root_init_mask();
gtid = __kmp_get_gtid();
#if KMP_AFFINITY_SUPPORTED
- if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
+ if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
+ __kmp_affinity.flags.reset) {
__kmp_reset_root_init_mask(gtid);
}
#endif
@@ -574,7 +582,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) {
int gtid;
#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_HURD || KMP_OS_OPENBSD
+ KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX
gtid = __kmp_entry_gtid();
#elif KMP_OS_WINDOWS
if (!__kmp_init_parallel ||
@@ -585,7 +593,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) {
return 0;
}
--gtid; // We keep (gtid+1) in TLS
-#elif KMP_OS_LINUX
+#elif KMP_OS_LINUX || KMP_OS_WASI
#ifdef KMP_TDATA_GTID
if (__kmp_gtid_mode >= 3) {
if ((gtid = __kmp_gtid) == KMP_GTID_DNE) {
@@ -631,7 +639,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PROCS)(void) {
__kmp_middle_initialize();
}
#if KMP_AFFINITY_SUPPORTED
- if (!__kmp_affin_reset) {
+ if (!__kmp_affinity.flags.reset) {
// only bind root here if its affinity reset is not requested
int gtid = __kmp_entry_gtid();
kmp_info_t *thread = __kmp_threads[gtid];
@@ -799,6 +807,10 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_LIMIT)(void) {
gtid = __kmp_entry_gtid();
thread = __kmp_threads[gtid];
+ // If thread_limit for the target task is defined, return that instead of the
+ // regular task thread_limit
+ if (int thread_limit = thread->th.th_current_task->td_icvs.task_thread_limit)
+ return thread_limit;
return thread->th.th_current_task->td_icvs.thread_limit;
#endif
}
@@ -831,7 +843,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PLACES)(void) {
}
if (!KMP_AFFINITY_CAPABLE())
return 0;
- if (!__kmp_affin_reset) {
+ if (!__kmp_affinity.flags.reset) {
// only bind root here if its affinity reset is not requested
int gtid = __kmp_entry_gtid();
kmp_info_t *thread = __kmp_threads[gtid];
@@ -839,7 +851,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PLACES)(void) {
__kmp_assign_root_init_mask();
}
}
- return __kmp_affinity_num_masks;
+ return __kmp_affinity.num_masks;
#endif
}
@@ -854,7 +866,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM_PROCS)(int place_num) {
}
if (!KMP_AFFINITY_CAPABLE())
return 0;
- if (!__kmp_affin_reset) {
+ if (!__kmp_affinity.flags.reset) {
// only bind root here if its affinity reset is not requested
int gtid = __kmp_entry_gtid();
kmp_info_t *thread = __kmp_threads[gtid];
@@ -862,9 +874,9 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM_PROCS)(int place_num) {
__kmp_assign_root_init_mask();
}
}
- if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
+ if (place_num < 0 || place_num >= (int)__kmp_affinity.num_masks)
return 0;
- kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
+ kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity.masks, place_num);
KMP_CPU_SET_ITERATE(i, mask) {
if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
(!KMP_CPU_ISSET(i, mask))) {
@@ -887,7 +899,7 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_PROC_IDS)(int place_num,
}
if (!KMP_AFFINITY_CAPABLE())
return;
- if (!__kmp_affin_reset) {
+ if (!__kmp_affinity.flags.reset) {
// only bind root here if its affinity reset is not requested
int gtid = __kmp_entry_gtid();
kmp_info_t *thread = __kmp_threads[gtid];
@@ -895,9 +907,9 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_PROC_IDS)(int place_num,
__kmp_assign_root_init_mask();
}
}
- if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
+ if (place_num < 0 || place_num >= (int)__kmp_affinity.num_masks)
return;
- kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
+ kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity.masks, place_num);
j = 0;
KMP_CPU_SET_ITERATE(i, mask) {
if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
@@ -922,7 +934,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM)(void) {
return -1;
gtid = __kmp_entry_gtid();
thread = __kmp_thread_from_gtid(gtid);
- if (thread->th.th_team->t.t_level == 0 && !__kmp_affin_reset) {
+ if (thread->th.th_team->t.t_level == 0 && !__kmp_affinity.flags.reset) {
__kmp_assign_root_init_mask();
}
if (thread->th.th_current_place < 0)
@@ -944,7 +956,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_NUM_PLACES)(void) {
return 0;
gtid = __kmp_entry_gtid();
thread = __kmp_thread_from_gtid(gtid);
- if (thread->th.th_team->t.t_level == 0 && !__kmp_affin_reset) {
+ if (thread->th.th_team->t.t_level == 0 && !__kmp_affinity.flags.reset) {
__kmp_assign_root_init_mask();
}
first_place = thread->th.th_first_place;
@@ -954,7 +966,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_NUM_PLACES)(void) {
if (first_place <= last_place)
num_places = last_place - first_place + 1;
else
- num_places = __kmp_affinity_num_masks - first_place + last_place + 1;
+ num_places = __kmp_affinity.num_masks - first_place + last_place + 1;
return num_places;
#endif
}
@@ -973,7 +985,7 @@ KMP_EXPAND_NAME(FTN_GET_PARTITION_PLACE_NUMS)(int *place_nums) {
return;
gtid = __kmp_entry_gtid();
thread = __kmp_thread_from_gtid(gtid);
- if (thread->th.th_team->t.t_level == 0 && !__kmp_affin_reset) {
+ if (thread->th.th_team->t.t_level == 0 && !__kmp_affinity.flags.reset) {
__kmp_assign_root_init_mask();
}
first_place = thread->th.th_first_place;
@@ -1031,7 +1043,7 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DEFAULT_DEVICE)(int KMP_DEREF arg) {
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void)
KMP_WEAK_ATTRIBUTE_EXTERNAL;
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) {
-#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
+#if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
return 0;
#else
int (*fptr)();
@@ -1415,6 +1427,8 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_PAUSE_RESOURCE)(kmp_pause_status_t kind,
#ifdef KMP_STUB
return 1; // just fail
#else
+ if (kind == kmp_stop_tool_paused)
+ return 1; // stop_tool must not be specified
if (device_num == KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)())
return __kmpc_pause_resource(kind);
else {
@@ -1546,14 +1560,14 @@ typedef void *omp_interop_t;
// libomptarget, if loaded, provides this function
int FTN_STDCALL FTN_GET_NUM_INTEROP_PROPERTIES(const omp_interop_t interop) {
-#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
+#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
return 0;
#else
int (*fptr)(const omp_interop_t);
if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_num_interop_properties")))
return (*fptr)(interop);
return 0;
-#endif // KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
+#endif
}
/// TODO Convert FTN_GET_INTEROP_XXX functions into a macro like interop.cpp
@@ -1561,57 +1575,81 @@ int FTN_STDCALL FTN_GET_NUM_INTEROP_PROPERTIES(const omp_interop_t interop) {
intptr_t FTN_STDCALL FTN_GET_INTEROP_INT(const omp_interop_t interop,
omp_interop_property_t property_id,
int *err) {
+#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
+ return 0;
+#else
intptr_t (*fptr)(const omp_interop_t, omp_interop_property_t, int *);
if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_int")))
return (*fptr)(interop, property_id, err);
return 0;
+#endif
}
// libomptarget, if loaded, provides this function
void *FTN_STDCALL FTN_GET_INTEROP_PTR(const omp_interop_t interop,
omp_interop_property_t property_id,
int *err) {
+#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
+ return nullptr;
+#else
void *(*fptr)(const omp_interop_t, omp_interop_property_t, int *);
if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_ptr")))
return (*fptr)(interop, property_id, err);
return nullptr;
+#endif
}
// libomptarget, if loaded, provides this function
const char *FTN_STDCALL FTN_GET_INTEROP_STR(const omp_interop_t interop,
omp_interop_property_t property_id,
int *err) {
+#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
+ return nullptr;
+#else
const char *(*fptr)(const omp_interop_t, omp_interop_property_t, int *);
if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_str")))
return (*fptr)(interop, property_id, err);
return nullptr;
+#endif
}
// libomptarget, if loaded, provides this function
const char *FTN_STDCALL FTN_GET_INTEROP_NAME(
const omp_interop_t interop, omp_interop_property_t property_id) {
+#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
+ return nullptr;
+#else
const char *(*fptr)(const omp_interop_t, omp_interop_property_t);
if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_name")))
return (*fptr)(interop, property_id);
return nullptr;
+#endif
}
// libomptarget, if loaded, provides this function
const char *FTN_STDCALL FTN_GET_INTEROP_TYPE_DESC(
const omp_interop_t interop, omp_interop_property_t property_id) {
+#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
+ return nullptr;
+#else
const char *(*fptr)(const omp_interop_t, omp_interop_property_t);
if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_type_desc")))
return (*fptr)(interop, property_id);
return nullptr;
+#endif
}
// libomptarget, if loaded, provides this function
const char *FTN_STDCALL FTN_GET_INTEROP_RC_DESC(
const omp_interop_t interop, omp_interop_property_t property_id) {
+#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
+ return nullptr;
+#else
const char *(*fptr)(const omp_interop_t, omp_interop_property_t);
if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_rec_desc")))
return (*fptr)(interop, property_id);
return nullptr;
+#endif
}
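All of these entry points follow the same guarded-forwarding idiom: on platforms with dynamic loading they look up the symbol that libomptarget would provide and call it, otherwise they return a neutral value. Assuming KMP_DLSYM_NEXT amounts to dlsym(RTLD_NEXT, name), the pattern is roughly:

#include <dlfcn.h> // glibc needs _GNU_SOURCE for RTLD_NEXT

typedef int (*get_num_props_fn)(const omp_interop_t);
static int forward_get_num_interop_properties(const omp_interop_t interop) {
  get_num_props_fn fptr =
      (get_num_props_fn)dlsym(RTLD_NEXT, "omp_get_num_interop_properties");
  return fptr ? fptr(interop) : 0; // neutral value when libomptarget is absent
}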
// display environment variables when requested
diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h
index d37c9c86028..7d595b947f4 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h
@@ -116,6 +116,8 @@
#define FTN_TARGET_IS_PRESENT omp_target_is_present
#define FTN_TARGET_MEMCPY omp_target_memcpy
#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect
+#define FTN_TARGET_MEMSET omp_target_memset
+#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async
#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr
#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr
#endif
diff --git a/contrib/libs/cxxsupp/openmp/kmp_global.cpp b/contrib/libs/cxxsupp/openmp/kmp_global.cpp
index 04b63c72d6e..30fb65163cd 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_global.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_global.cpp
@@ -63,8 +63,8 @@ int __kmp_init_counter = 0;
int __kmp_root_counter = 0;
int __kmp_version = 0;
-std::atomic<kmp_int32> __kmp_team_counter = ATOMIC_VAR_INIT(0);
-std::atomic<kmp_int32> __kmp_task_counter = ATOMIC_VAR_INIT(0);
+std::atomic<kmp_int32> __kmp_team_counter = 0;
+std::atomic<kmp_int32> __kmp_task_counter = 0;
size_t __kmp_stksize = KMP_DEFAULT_STKSIZE;
#if KMP_USE_MONITOR
@@ -125,6 +125,7 @@ size_t __kmp_sys_min_stksize = KMP_MIN_STKSIZE;
int __kmp_sys_max_nth = KMP_MAX_NTH;
int __kmp_max_nth = 0;
int __kmp_cg_max_nth = 0;
+int __kmp_task_max_nth = 0;
int __kmp_teams_max_nth = 0;
int __kmp_threads_capacity = 0;
int __kmp_dflt_team_nth = 0;
@@ -154,7 +155,8 @@ int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
kmp_hier_sched_env_t __kmp_hier_scheds = {0, 0, NULL, NULL, NULL};
#endif
-int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; // in microseconds
+char __kmp_blocktime_units = 'm'; // Units specified in KMP_BLOCKTIME
bool __kmp_wpolicy_passive = false;
#if KMP_USE_MONITOR
int __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS;
@@ -170,7 +172,7 @@ int __kmp_ncores = 0;
int __kmp_chunk = 0;
int __kmp_force_monotonic = 0;
int __kmp_abort_delay = 0;
-#if KMP_OS_LINUX && defined(KMP_TDATA_GTID)
+#if (KMP_OS_LINUX || KMP_OS_AIX) && defined(KMP_TDATA_GTID)
int __kmp_gtid_mode = 3; /* use __declspec(thread) TLS to store gtid */
int __kmp_adjust_gtid_mode = FALSE;
#elif KMP_OS_WINDOWS
@@ -269,23 +271,20 @@ kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity = NULL;
#endif /* KMP_OS_WINDOWS */
size_t __kmp_affin_mask_size = 0;
-enum affinity_type __kmp_affinity_type = affinity_default;
-kmp_hw_t __kmp_affinity_gran = KMP_HW_UNKNOWN;
-int __kmp_affinity_gran_levels = -1;
-int __kmp_affinity_dups = TRUE;
enum affinity_top_method __kmp_affinity_top_method =
affinity_top_method_default;
-int __kmp_affinity_compact = 0;
-int __kmp_affinity_offset = 0;
-int __kmp_affinity_verbose = FALSE;
-int __kmp_affinity_warnings = TRUE;
-int __kmp_affinity_respect_mask = affinity_respect_mask_default;
-char *__kmp_affinity_proclist = NULL;
-kmp_affin_mask_t *__kmp_affinity_masks = NULL;
-unsigned __kmp_affinity_num_masks = 0;
+
+// Regular thread affinity settings from KMP_AFFINITY
+kmp_affinity_t __kmp_affinity = KMP_AFFINITY_INIT("KMP_AFFINITY");
+// Hidden helper thread affinity settings from KMP_HIDDEN_HELPER_AFFINITY
+kmp_affinity_t __kmp_hh_affinity =
+ KMP_AFFINITY_INIT("KMP_HIDDEN_HELPER_AFFINITY");
+kmp_affinity_t *__kmp_affinities[] = {&__kmp_affinity, &__kmp_hh_affinity};
char *__kmp_cpuinfo_file = NULL;
-bool __kmp_affin_reset = 0;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+int __kmp_first_osid_with_ecore = -1;
+#endif
#endif /* KMP_AFFINITY_SUPPORTED */
@@ -392,7 +391,7 @@ int __kmp_debug_buf_atomic =
char *__kmp_debug_buffer = NULL; /* Debug buffer itself */
std::atomic<int> __kmp_debug_count =
- ATOMIC_VAR_INIT(0); /* number of lines printed in buffer so far */
+ 0; /* number of lines printed in buffer so far */
int __kmp_debug_buf_warn_chars =
0; /* Keep track of char increase recommended in warnings */
/* end rotating debug buffer */
@@ -460,7 +459,7 @@ volatile kmp_info_t *__kmp_thread_pool = NULL;
volatile kmp_team_t *__kmp_team_pool = NULL;
KMP_ALIGN_CACHE
-std::atomic<int> __kmp_thread_pool_active_nth = ATOMIC_VAR_INIT(0);
+std::atomic<int> __kmp_thread_pool_active_nth = 0;
/* -------------------------------------------------
* GLOBAL/ROOT STATE */
@@ -553,13 +552,6 @@ int get_suspend_count_(void) {
void set_suspend_count_(int *value) { __kmp_suspend_count = *value; }
#endif
-// Symbols for MS mutual detection.
-int _You_must_link_with_exactly_one_OpenMP_library = 1;
-int _You_must_link_with_Intel_OpenMP_library = 1;
-#if KMP_OS_WINDOWS && (KMP_VERSION_MAJOR > 4)
-int _You_must_link_with_Microsoft_OpenMP_library = 1;
-#endif
-
kmp_target_offload_kind_t __kmp_target_offload = tgt_default;
// OMP Pause Resources
@@ -570,4 +562,17 @@ int __kmp_nesting_mode = 0;
int __kmp_nesting_mode_nlevels = 1;
int *__kmp_nesting_nth_level;
+#if OMPX_TASKGRAPH
+// TDG record & replay
+int __kmp_tdg_dot = 0;
+kmp_int32 __kmp_max_tdgs = 100;
+kmp_tdg_info_t **__kmp_global_tdgs = NULL;
+kmp_int32 __kmp_curr_tdg_idx =
+ 0; // Id of the current TDG being recorded or executed
+kmp_int32 __kmp_num_tdg = 0;
+kmp_int32 __kmp_successors_size = 10; // Initial successor list size for
+                                      // recording
+std::atomic<kmp_int32> __kmp_tdg_task_id = 0;
+#endif
// end of file //
+
diff --git a/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp b/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp
index d77d4809a7e..86cf16470e1 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_gsupport.cpp
@@ -12,6 +12,7 @@
#include "kmp.h"
#include "kmp_atomic.h"
+#include "kmp_utils.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
@@ -143,7 +144,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER)(void) {
// Mutual exclusion
-// The symbol that icc/ifort generates for unnamed for unnamed critical sections
+// The symbol that icc/ifort generates for unnamed critical sections
// - .gomp_critical_user_ - is defined using .comm in any objects reference it.
// We can't reference it directly here in C code, as the symbol contains a ".".
//
@@ -356,7 +357,8 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_END)(void) {
// They come in two flavors: 64-bit unsigned, and either 32-bit signed
// (IA-32 architecture) or 64-bit signed (Intel(R) 64).
-#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS
+#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \
+ KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
#define KMP_DISPATCH_INIT __kmp_aux_dispatch_init_4
#define KMP_DISPATCH_FINI_CHUNK __kmp_aux_dispatch_fini_chunk_4
#define KMP_DISPATCH_NEXT __kmpc_dispatch_next_4
@@ -1280,7 +1282,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data,
KMP_ASSERT(depend);
kmp_gomp_depends_info_t gomp_depends(depend);
kmp_int32 ndeps = gomp_depends.get_num_deps();
- kmp_depend_info_t dep_list[ndeps];
+ SimpleVLA<kmp_depend_info_t> dep_list(ndeps);
for (kmp_int32 i = 0; i < ndeps; i++)
dep_list[i] = gomp_depends.get_kmp_depend(i);
kmp_int32 ndeps_cnv;
@@ -1309,7 +1311,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data,
KMP_ASSERT(depend);
kmp_gomp_depends_info_t gomp_depends(depend);
kmp_int32 ndeps = gomp_depends.get_num_deps();
- kmp_depend_info_t dep_list[ndeps];
+ SimpleVLA<kmp_depend_info_t> dep_list(ndeps);
for (kmp_int32 i = 0; i < ndeps; i++)
dep_list[i] = gomp_depends.get_kmp_depend(i);
__kmpc_omp_wait_deps(&loc, gtid, ndeps, dep_list, 0, NULL);
@@ -1993,7 +1995,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKWAIT_DEPEND)(void **depend) {
KA_TRACE(20, ("GOMP_taskwait_depend: T#%d\n", gtid));
kmp_gomp_depends_info_t gomp_depends(depend);
kmp_int32 ndeps = gomp_depends.get_num_deps();
- kmp_depend_info_t dep_list[ndeps];
+ SimpleVLA<kmp_depend_info_t> dep_list(ndeps);
for (kmp_int32 i = 0; i < ndeps; i++)
dep_list[i] = gomp_depends.get_kmp_depend(i);
#if OMPT_SUPPORT
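The SimpleVLA used in these hunks comes from the kmp_utils.h header added elsewhere in this commit; its real definition is not reproduced here, but a minimal stand-in with the same usage shape (construct with a runtime length, index it, pass it where a T* is expected) would look like:

#include <cstddef>

// Hedged sketch only: a heap-backed RAII replacement for the C VLA
// "kmp_depend_info_t dep_list[ndeps]" that the old code relied on.
template <typename T> class SimpleVLA {
  T *data_;

public:
  explicit SimpleVLA(std::size_t n) : data_(new T[n]()) {}
  ~SimpleVLA() { delete[] data_; }
  SimpleVLA(const SimpleVLA &) = delete;
  SimpleVLA &operator=(const SimpleVLA &) = delete;
  T &operator[](std::size_t i) { return data_[i]; }
  operator T *() { return data_; } // decays like an array at call sites
};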
diff --git a/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc b/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc
index 776cca2b66c..ec0f81d9cf5 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc
+++ b/contrib/libs/cxxsupp/openmp/kmp_i18n_default.inc
@@ -1,5 +1,5 @@
// Do not edit this file! //
-// The file was generated from en_US.txt by message-converter.pl. //
+// The file was generated from en_US.txt by message-converter.py on Fri Jul 11 21:54:37 2025 (fixed date by patch) //
static char const *
__kmp_i18n_default_meta[] =
@@ -414,6 +414,9 @@ __kmp_i18n_default_messages[] =
"KMP_HW_SUBSET ignored: all hardware resources would be filtered, please reduce the filter.",
"KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecutre.",
"KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecutre.",
+ "Target memory not available, will use default allocator.",
+ "%1$s ignored: This machine is not a hybrid architecutre. Using \"%2$s\" instead.",
+ "%1$s ignored: %2$s is not available. Using \"%3$s\" instead.",
NULL
};
@@ -421,7 +424,7 @@ static char const *
__kmp_i18n_default_hints[] =
{
NULL,
- "Please submit a bug report with this message, compile and run commands used, and machine configuration info including native compiler and operating system versions. Faster response will be obtained by including all program sources. For information on submitting this issue, please see https://bugs.llvm.org/.",
+ "Please submit a bug report with this message, compile and run commands used, and machine configuration info including native compiler and operating system versions. Faster response will be obtained by including all program sources. For information on submitting this issue, please see https://github.com/llvm/llvm-project/issues/.",
"Check NLSPATH environment variable, its value is \"%1$s\".",
"Please try changing the shell stack limit or adjusting the OMP_STACKSIZE environment variable.",
"Consider unsetting KMP_DEVICE_THREAD_LIMIT (KMP_ALL_THREADS), KMP_TEAMS_THREAD_LIMIT, and OMP_THREAD_LIMIT (if any are set).",
@@ -466,7 +469,7 @@ __kmp_i18n_sections[] =
{ 5, __kmp_i18n_default_meta },
{ 79, __kmp_i18n_default_strings },
{ 6, __kmp_i18n_default_formats },
- { 298, __kmp_i18n_default_messages },
+ { 301, __kmp_i18n_default_messages },
{ 29, __kmp_i18n_default_hints },
{ 0, NULL }
};
diff --git a/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc b/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc
index a66f8117c2d..07872f02904 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc
+++ b/contrib/libs/cxxsupp/openmp/kmp_i18n_id.inc
@@ -1,5 +1,5 @@
// Do not edit this file! //
-// The file was generated from en_US.txt by message-converter.pl. //
+// The file was generated from en_US.txt by message-converter.py on Fri Jul 11 21:54:37 2025 (fixed date by patch) //
enum kmp_i18n_id {
@@ -408,6 +408,9 @@ enum kmp_i18n_id {
kmp_i18n_msg_AffHWSubsetAllFiltered,
kmp_i18n_msg_AffHWSubsetAttrsNonHybrid,
kmp_i18n_msg_AffHWSubsetIgnoringAttr,
+ kmp_i18n_msg_TargetMemNotAvailable,
+ kmp_i18n_msg_AffIgnoringNonHybrid,
+ kmp_i18n_msg_AffIgnoringNotAvailable,
kmp_i18n_msg_last,
// Set #5, hints.
diff --git a/contrib/libs/cxxsupp/openmp/kmp_io.cpp b/contrib/libs/cxxsupp/openmp/kmp_io.cpp
index 578e6e671cd..0c52662bc23 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_io.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_io.cpp
@@ -50,24 +50,6 @@ static HANDLE __kmp_stderr = NULL;
static int __kmp_console_exists = FALSE;
static kmp_str_buf_t __kmp_console_buf;
-static int is_console(void) {
- char buffer[128];
- DWORD rc = 0;
- DWORD err = 0;
- // Try to get console title.
- SetLastError(0);
- // GetConsoleTitle does not reset last error in case of success or short
- // buffer, so we need to clear it explicitly.
- rc = GetConsoleTitle(buffer, sizeof(buffer));
- if (rc == 0) {
- // rc == 0 means getting console title failed. Let us find out why.
- err = GetLastError();
- // err == 0 means buffer too short (we suppose console exists).
- // In Window applications we usually have err == 6 (invalid handle).
- }
- return rc > 0 || err == 0;
-}
-
void __kmp_close_console(void) {
/* wait until user presses return before closing window */
/* TODO only close if a window was opened */
@@ -84,7 +66,6 @@ void __kmp_close_console(void) {
static void __kmp_redirect_output(void) {
__kmp_acquire_bootstrap_lock(&__kmp_console_lock);
- (void)is_console;
if (!__kmp_console_exists) {
HANDLE ho;
HANDLE he;
diff --git a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp
index 8fcddc71086..0ad14f862bc 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp
@@ -2689,7 +2689,7 @@ void __kmp_spin_backoff(kmp_backoff_t *boff) {
// lock word.
static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck,
kmp_dyna_lockseq_t seq) {
- TCW_4(*lck, KMP_GET_D_TAG(seq));
+ TCW_4(((kmp_base_tas_lock_t *)lck)->poll, KMP_GET_D_TAG(seq));
KA_TRACE(
20,
("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq));
@@ -3180,8 +3180,8 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
lck->type = tag;
if (OMP_LOCK_T_SIZE < sizeof(void *)) {
- *((kmp_lock_index_t *)user_lock) = idx
- << 1; // indirect lock word must be even
+ *(kmp_lock_index_t *)&(((kmp_base_tas_lock_t *)user_lock)->poll) =
+ idx << 1; // indirect lock word must be even
} else {
*((kmp_indirect_lock_t **)user_lock) = lck;
}
@@ -3809,7 +3809,7 @@ static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) {
sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1));
table[0] = (kmp_user_lock_p)__kmp_user_lock_table.table;
// We cannot free the previous table now, since it may be in use by other
- // threads. So save the pointer to the previous table in in the first
+ // threads. So save the pointer to the previous table in the first
// element of the new table. All the tables will be organized into a list,
// and could be freed when library shutting down.
__kmp_user_lock_table.table = table;
diff --git a/contrib/libs/cxxsupp/openmp/kmp_lock.h b/contrib/libs/cxxsupp/openmp/kmp_lock.h
index a19f4ca323b..6202f3d617c 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_lock.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_lock.h
@@ -50,7 +50,7 @@ typedef struct ident ident_t;
// recent versions), but we are bounded by the pointer-sized chunks that
// the Intel compiler allocates.
-#if KMP_OS_LINUX && defined(KMP_GOMP_COMPAT)
+#if (KMP_OS_LINUX || KMP_OS_AIX) && defined(KMP_GOMP_COMPAT)
#define OMP_LOCK_T_SIZE sizeof(int)
#define OMP_NEST_LOCK_T_SIZE sizeof(void *)
#else
@@ -120,8 +120,16 @@ extern void __kmp_validate_locks(void);
struct kmp_base_tas_lock {
// KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && \
+ __LP64__
+ // Flip the ordering of the high and low 32-bit member to be consistent
+ // with the memory layout of the address in 64-bit big-endian.
+ kmp_int32 depth_locked; // depth locked, for nested locks only
+ std::atomic<kmp_int32> poll;
+#else
std::atomic<kmp_int32> poll;
kmp_int32 depth_locked; // depth locked, for nested locks only
+#endif
};
typedef struct kmp_base_tas_lock kmp_base_tas_lock_t;
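The member swap above keeps poll overlapping the least-significant 32 bits of the pointer-sized lock word on 64-bit big-endian targets, so code that stores a tag or an index across the whole word can still read it back through poll. A standalone sketch of the idea (type and field names are illustrative, and the real struct uses std::atomic<kmp_int32> for poll):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Stand-in for kmp_base_tas_lock's member ordering.
    struct demo_tas_lock {
    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && __LP64__
      std::int32_t depth_locked; // on big-endian the low-order bytes sit at the
      std::int32_t poll;         // higher offset, so poll must come second
    #else
      std::int32_t poll;         // on little-endian the low-order bytes come first
      std::int32_t depth_locked;
    #endif
    };

    int main() {
      static_assert(sizeof(demo_tas_lock) == sizeof(std::uint64_t),
                    "lock fits in one 64-bit word");
      demo_tas_lock lck{};
      std::uint64_t word = 21u << 1;         // an even "indirect" lock word (index 21)
      std::memcpy(&lck, &word, sizeof word); // store across the whole lock word
      // On either endianness the low-order 32 bits are what poll observes.
      std::printf("poll = %d\n", (int)lck.poll);
    }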
@@ -138,7 +146,7 @@ typedef union kmp_tas_lock kmp_tas_lock_t;
// kmp_tas_lock_t xlock = KMP_TAS_LOCK_INITIALIZER( xlock );
#define KMP_TAS_LOCK_INITIALIZER(lock) \
{ \
- { ATOMIC_VAR_INIT(KMP_LOCK_FREE(tas)), 0 } \
+ { KMP_LOCK_FREE(tas), 0 } \
}
extern int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
@@ -276,11 +284,7 @@ typedef union kmp_ticket_lock kmp_ticket_lock_t;
// Note the macro argument. It is important to make var properly initialized.
#define KMP_TICKET_LOCK_INITIALIZER(lock) \
{ \
- { \
- ATOMIC_VAR_INIT(true) \
- , &(lock), NULL, ATOMIC_VAR_INIT(0U), ATOMIC_VAR_INIT(0U), \
- ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(-1) \
- } \
+ { true, &(lock), NULL, 0U, 0U, 0, -1 } \
}
extern int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid);
@@ -1142,11 +1146,13 @@ extern int (**__kmp_indirect_test)(kmp_user_lock_p, kmp_int32);
// Extracts direct lock tag from a user lock pointer
#define KMP_EXTRACT_D_TAG(l) \
- (*((kmp_dyna_lock_t *)(l)) & ((1 << KMP_LOCK_SHIFT) - 1) & \
- -(*((kmp_dyna_lock_t *)(l)) & 1))
+ ((kmp_dyna_lock_t)((kmp_base_tas_lock_t *)(l))->poll & \
+ ((1 << KMP_LOCK_SHIFT) - 1) & \
+ -((kmp_dyna_lock_t)((kmp_tas_lock_t *)(l))->lk.poll & 1))
// Extracts indirect lock index from a user lock pointer
-#define KMP_EXTRACT_I_INDEX(l) (*(kmp_lock_index_t *)(l) >> 1)
+#define KMP_EXTRACT_I_INDEX(l) \
+ ((kmp_lock_index_t)((kmp_base_tas_lock_t *)(l))->poll >> 1)
// Returns function pointer to the direct lock function with l (kmp_dyna_lock_t
// *) and op (operation type).
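As the hunks above and the kmp_lock.cpp change imply, the dynamic lock word holds either a small odd tag for a direct lock or an index shifted left by one (hence even) for an indirect lock; masking with -(word & 1) therefore yields the tag for direct locks and 0 for indirect ones. A minimal sketch of that arithmetic (kLockShift is a stand-in constant, not the runtime's KMP_LOCK_SHIFT value):

    #include <cstdint>
    #include <cstdio>

    constexpr std::uint32_t kLockShift = 8; // illustrative stand-in

    constexpr std::uint32_t extract_direct_tag(std::uint32_t word) {
      // Keep the low tag bits, then zero everything if the word is even
      // (i.e. it actually encodes an indirect-lock index).
      return word & ((1u << kLockShift) - 1) & (0u - (word & 1u));
    }

    constexpr std::uint32_t extract_indirect_index(std::uint32_t word) {
      return word >> 1; // undo the "must be even" shift
    }

    int main() {
      std::uint32_t direct = (5u << 1) | 1u; // odd: a direct-lock tag
      std::uint32_t indirect = 42u << 1;     // even: an indirect-lock index
      std::printf("direct word: tag=%u\n", extract_direct_tag(direct));
      std::printf("indirect word: tag=%u index=%u\n",
                  extract_direct_tag(indirect), extract_indirect_index(indirect));
    }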
diff --git a/contrib/libs/cxxsupp/openmp/kmp_os.h b/contrib/libs/cxxsupp/openmp/kmp_os.h
index 02efaa1b261..29a281f0968 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_os.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_os.h
@@ -75,7 +75,9 @@
#error Unknown compiler
#endif
-#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD)
+#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
+ KMP_OS_DRAGONFLY || KMP_OS_AIX) && \
+ !KMP_OS_WASI && !KMP_OS_EMSCRIPTEN
#define KMP_AFFINITY_SUPPORTED 1
#if KMP_OS_WINDOWS && KMP_ARCH_X86_64
#define KMP_GROUP_AFFINITY 1
@@ -105,8 +107,9 @@
128-bit extended precision type yet */
typedef long double _Quad;
#elif KMP_COMPILER_GCC
-/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad */
-#if !KMP_OS_NETBSD
+/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad until
+ NetBSD 10.0 which ships with GCC 10.5 */
+#if (!KMP_OS_NETBSD || __GNUC__ >= 10)
typedef __float128 _Quad;
#undef KMP_HAVE_QUAD
#define KMP_HAVE_QUAD 1
@@ -175,16 +178,18 @@ typedef unsigned long long kmp_uint64;
#define KMP_UINT64_SPEC "llu"
#endif /* KMP_OS_UNIX */
-#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS
+#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \
+ KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
#define KMP_SIZE_T_SPEC KMP_UINT32_SPEC
#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
- KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
+ KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_VE || KMP_ARCH_S390X
#define KMP_SIZE_T_SPEC KMP_UINT64_SPEC
#else
#error "Can't determine size_t printf format specifier."
#endif
-#if KMP_ARCH_X86
+#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_WASM || KMP_ARCH_PPC
#define KMP_SIZE_T_MAX (0xFFFFFFFF)
#else
#define KMP_SIZE_T_MAX (0xFFFFFFFFFFFFFFFF)
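The *_SPEC macros are bare conversion fragments without the leading '%' (e.g. "llu" above), selected per architecture and spliced into format strings by the caller. Illustrative usage with a portable stand-in:

    #include <cstddef>
    #include <cstdio>

    #define DEMO_SIZE_T_SPEC "zu" // stand-in for the per-arch KMP_SIZE_T_SPEC

    int main() {
      std::size_t n = 1024;
      // The '%' is supplied at the call site; the macro only carries the
      // length/conversion suffix.
      std::printf("allocating %" DEMO_SIZE_T_SPEC " bytes\n", n);
    }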
@@ -213,8 +218,9 @@ typedef kmp_uint32 kmp_uint;
#define KMP_INT_MIN ((kmp_int32)0x80000000)
// stdarg handling
-#if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && \
- (KMP_OS_FREEBSD || KMP_OS_LINUX)
+#if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 || KMP_ARCH_WASM) && \
+ (KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_LINUX || KMP_OS_WASI)
typedef va_list *kmp_va_list;
#define kmp_va_deref(ap) (*(ap))
#define kmp_va_addr_of(ap) (&(ap))
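On the ABIs listed above, va_list is an array or structure type that cannot portably be passed around by value, so kmp_va_list becomes va_list * and the deref/addr_of macros hide the difference from callers. A minimal sketch of the same forwarding pattern (function names are illustrative):

    #include <cstdarg>
    #include <cstdio>

    // Forward a va_list by address, as the kmp_va_* macros do on ABIs where
    // passing it by value is not portable.
    static int sum_ints(int count, va_list *ap) {
      int total = 0;
      for (int i = 0; i < count; ++i)
        total += va_arg(*ap, int); // dereference, then consume one argument
      return total;
    }

    static int sum_wrapper(int count, ...) {
      va_list ap;
      va_start(ap, count);
      int total = sum_ints(count, &ap); // mirrors kmp_va_addr_of(ap)
      va_end(ap);
      return total;
    }

    int main() { std::printf("%d\n", sum_wrapper(3, 1, 2, 3)); }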
@@ -303,6 +309,8 @@ template <> struct traits_t<unsigned long long> {
!KMP_MIC)
#if KMP_OS_WINDOWS
+// Don't include everything related to NT status code, we'll do that explicitly
+#define WIN32_NO_STATUS
#include <windows.h>
static inline int KMP_GET_PAGE_SIZE(void) {
@@ -456,13 +464,13 @@ enum kmp_mem_fence_type {
// Synchronization primitives
-#if KMP_ASM_INTRINS && KMP_OS_WINDOWS
+#if KMP_ASM_INTRINS && KMP_OS_WINDOWS && !((KMP_ARCH_AARCH64 || KMP_ARCH_ARM) && (KMP_COMPILER_CLANG || KMP_COMPILER_GCC))
#if KMP_MSVC_COMPAT && !KMP_COMPILER_CLANG
#pragma intrinsic(InterlockedExchangeAdd)
#pragma intrinsic(InterlockedCompareExchange)
#pragma intrinsic(InterlockedExchange)
-#if !(KMP_COMPILER_ICX && KMP_32_BIT_ARCH)
+#if !KMP_32_BIT_ARCH
#pragma intrinsic(InterlockedExchange64)
#endif
#endif
@@ -596,27 +604,26 @@ inline kmp_int32 __kmp_compare_and_store_ptr(void *volatile *p, void *cv,
}
// The _RET versions return the value instead of a bool
-/*
+
#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \
_InterlockedCompareExchange8((p), (sv), (cv))
#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \
_InterlockedCompareExchange16((p), (sv), (cv))
-*/
+
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
_InterlockedCompareExchange64((volatile kmp_int64 *)(p), (kmp_int64)(sv), \
(kmp_int64)(cv))
-/*
+
#define KMP_XCHG_FIXED8(p, v) \
_InterlockedExchange8((volatile kmp_int8 *)(p), (kmp_int8)(v));
-*/
-// #define KMP_XCHG_FIXED16(p, v) _InterlockedExchange16((p), (v));
-// #define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v)));
+#define KMP_XCHG_FIXED16(p, v) _InterlockedExchange16((p), (v));
+#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v));
-// inline kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v) {
-// kmp_int64 tmp = _InterlockedExchange64((volatile kmp_int64 *)p, *(kmp_int64
-// *)&v); return *(kmp_real64 *)&tmp;
-// }
+inline kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v) {
+ kmp_int64 tmp =
+ _InterlockedExchange64((volatile kmp_int64 *)p, *(kmp_int64 *)&v);
+ return *(kmp_real64 *)&tmp;
+}
#else // !KMP_ARCH_AARCH64
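The newly enabled __kmp_xchg_real64 swaps a double atomically by exchanging its 64-bit pattern with _InterlockedExchange64 and reinterpreting the previous pattern back as a double. A portable sketch of the same idea (using memcpy for the bit reinterpretation instead of the pointer casts in the patch; this is not the runtime's code):

    #include <atomic>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Atomically exchange a double by exchanging its 64-bit bit pattern, then
    // reinterpreting the previous pattern back as a double.
    static double xchg_real64(std::atomic<std::uint64_t> *p, double v) {
      std::uint64_t bits;
      std::memcpy(&bits, &v, sizeof bits);       // double -> bit pattern
      std::uint64_t old = p->exchange(bits);     // atomic 64-bit exchange
      double result;
      std::memcpy(&result, &old, sizeof result); // bit pattern -> double
      return result;
    }

    int main() {
      std::atomic<std::uint64_t> cell{0};
      double init = 1.5;
      std::uint64_t init_bits;
      std::memcpy(&init_bits, &init, sizeof init_bits);
      cell.store(init_bits);
      double old = xchg_real64(&cell, 2.5);
      std::printf("old = %g\n", old); // prints 1.5
    }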
@@ -1044,7 +1051,8 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
#endif /* KMP_OS_WINDOWS */
#if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \
- KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
+ KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
#if KMP_OS_WINDOWS
#undef KMP_MB
#define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst)
@@ -1058,6 +1066,15 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#if KMP_MIC
+// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
+// We shouldn't need it, though, since the ABI rules require that
+// * If the compiler generates NGO stores it also generates the fence
+// * If users hand-code NGO stores they should insert the fence
+// therefore no incomplete unordered stores should be visible.
+#define KMP_MFENCE() /* Nothing */
+#define KMP_SFENCE() /* Nothing */
+#else
#if KMP_COMPILER_ICC || KMP_COMPILER_ICX
#define KMP_MFENCE_() _mm_mfence()
#define KMP_SFENCE_() _mm_sfence()
@@ -1076,6 +1093,7 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
KMP_MFENCE_(); \
}
#define KMP_SFENCE() KMP_SFENCE_()
+#endif
#else
#define KMP_MFENCE() KMP_MB()
#define KMP_SFENCE() KMP_MB()
@@ -1134,7 +1152,7 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)(volatile void *)&(a), \
(kmp_int64)(b), (kmp_int64)(c))
-#if KMP_ARCH_X86 || KMP_ARCH_MIPS
+#if KMP_ARCH_X86 || KMP_ARCH_MIPS || KMP_ARCH_WASM || KMP_ARCH_PPC
// What about ARM?
#define TCR_PTR(a) ((void *)TCR_4(a))
#define TCW_PTR(a, b) TCW_4((a), (b))
@@ -1273,12 +1291,29 @@ bool __kmp_atomic_compare_store_rel(std::atomic<T> *p, T expected, T desired) {
// Symbol lookup on Linux/Windows
#if KMP_OS_WINDOWS
-extern void *__kmp_lookup_symbol(const char *name);
+extern void *__kmp_lookup_symbol(const char *name, bool next = false);
#define KMP_DLSYM(name) __kmp_lookup_symbol(name)
+#define KMP_DLSYM_NEXT(name) __kmp_lookup_symbol(name, true)
+#elif KMP_OS_WASI || KMP_OS_EMSCRIPTEN
+#define KMP_DLSYM(name) nullptr
#define KMP_DLSYM_NEXT(name) nullptr
#else
#define KMP_DLSYM(name) dlsym(RTLD_DEFAULT, name)
#define KMP_DLSYM_NEXT(name) dlsym(RTLD_NEXT, name)
#endif
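KMP_DLSYM_NEXT resolves to dlsym(RTLD_NEXT, name) on ELF platforms so an interposing definition can still reach the next definition in lookup order, maps to __kmp_lookup_symbol(name, true) on Windows, and collapses to nullptr on WASI/Emscripten where no dynamic lookup exists. A minimal RTLD_NEXT interposition sketch for the ELF case (illustrative only; built as a shared object and preloaded):

    // Build: g++ -shared -fPIC interpose_puts.cpp -o libinterpose.so -ldl
    // Run:   LD_PRELOAD=./libinterpose.so ./some_program
    #ifndef _GNU_SOURCE
    #define _GNU_SOURCE 1
    #endif
    #include <dlfcn.h>
    #include <cstdio>

    extern "C" int puts(const char *s) {
      using puts_fn = int (*)(const char *);
      // RTLD_NEXT skips this object and finds the libc definition, which is
      // what KMP_DLSYM_NEXT relies on when the runtime interposes a symbol it
      // still needs to forward to.
      static puts_fn real_puts =
          reinterpret_cast<puts_fn>(dlsym(RTLD_NEXT, "puts"));
      if (!real_puts)
        return EOF;
      std::fprintf(stderr, "[interposed puts]\n");
      return real_puts(s);
    }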
+// MSVC doesn't have this, but clang/clang-cl does.
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+// Same as LLVM_BUILTIN_UNREACHABLE. States that it is UB to reach this point.
+#if __has_builtin(__builtin_unreachable) || defined(__GNUC__)
+#define KMP_BUILTIN_UNREACHABLE __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define KMP_BUILTIN_UNREACHABLE __assume(false)
+#else
+#define KMP_BUILTIN_UNREACHABLE
+#endif
+
#endif /* KMP_OS_H */
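KMP_BUILTIN_UNREACHABLE tells the optimizer a point cannot be reached, degrading to a no-op when neither __builtin_unreachable nor __assume is available. A typical use is after an exhaustive switch; a small illustrative sketch (not taken from the runtime):

    #include <cstdio>

    // Same selection logic as KMP_BUILTIN_UNREACHABLE in the hunk above.
    #ifndef __has_builtin
    #define __has_builtin(x) 0
    #endif
    #if __has_builtin(__builtin_unreachable) || defined(__GNUC__)
    #define DEMO_UNREACHABLE __builtin_unreachable()
    #elif defined(_MSC_VER)
    #define DEMO_UNREACHABLE __assume(false)
    #else
    #define DEMO_UNREACHABLE
    #endif

    enum class barrier_kind { linear, tree, dist };

    static const char *name(barrier_kind k) {
      switch (k) {
      case barrier_kind::linear: return "linear";
      case barrier_kind::tree:   return "tree";
      case barrier_kind::dist:   return "dist";
      }
      DEMO_UNREACHABLE; // all enumerators handled; reaching here is UB by contract
    }

    int main() { std::printf("%s\n", name(barrier_kind::dist)); }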
diff --git a/contrib/libs/cxxsupp/openmp/kmp_platform.h b/contrib/libs/cxxsupp/openmp/kmp_platform.h
index bbbd72dd695..9c221514046 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_platform.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_platform.h
@@ -23,6 +23,9 @@
#define KMP_OS_DARWIN 0
#define KMP_OS_WINDOWS 0
#define KMP_OS_HURD 0
+#define KMP_OS_SOLARIS 0
+#define KMP_OS_WASI 0
+#define KMP_OS_EMSCRIPTEN 0
#define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. */
#ifdef _WIN32
@@ -42,6 +45,11 @@
#elif (defined __linux__)
#undef KMP_OS_LINUX
#define KMP_OS_LINUX 1
+#elif defined(__EMSCRIPTEN__)
+#undef KMP_OS_LINUX
+#undef KMP_OS_EMSCRIPTEN
+#define KMP_OS_LINUX 1
+#define KMP_OS_EMSCRIPTEN 1
#else
#endif
@@ -70,13 +78,30 @@
#define KMP_OS_HURD 1
#endif
+#if (defined __sun__ && defined __svr4__)
+#undef KMP_OS_SOLARIS
+#define KMP_OS_SOLARIS 1
+#endif
+
+#if (defined __wasi__)
+#undef KMP_OS_WASI
+#define KMP_OS_WASI 1
+#endif
+
+#if (defined _AIX)
+#undef KMP_OS_AIX
+#define KMP_OS_AIX 1
+#endif
+
#if (1 != KMP_OS_LINUX + KMP_OS_DRAGONFLY + KMP_OS_FREEBSD + KMP_OS_NETBSD + \
- KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD)
+ KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD + \
+ KMP_OS_SOLARIS + KMP_OS_WASI + KMP_OS_AIX)
#error Unknown OS
#endif
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_SOLARIS || \
+ KMP_OS_WASI || KMP_OS_AIX
#undef KMP_OS_UNIX
#define KMP_OS_UNIX 1
#endif
@@ -86,12 +111,17 @@
#define KMP_ARCH_X86 0
#define KMP_ARCH_X86_64 0
#define KMP_ARCH_AARCH64 0
+#define KMP_ARCH_AARCH64_32 0
#define KMP_ARCH_PPC64_ELFv1 0
#define KMP_ARCH_PPC64_ELFv2 0
-#define KMP_ARCH_PPC64 (KMP_ARCH_PPC64_ELFv2 || KMP_ARCH_PPC64_ELFv1)
+#define KMP_ARCH_PPC64_XCOFF 0
+#define KMP_ARCH_PPC_XCOFF 0
#define KMP_ARCH_MIPS 0
#define KMP_ARCH_MIPS64 0
#define KMP_ARCH_RISCV64 0
+#define KMP_ARCH_LOONGARCH64 0
+#define KMP_ARCH_VE 0
+#define KMP_ARCH_S390X 0
#if KMP_OS_WINDOWS
#if defined(_M_AMD64) || defined(__x86_64)
@@ -100,6 +130,9 @@
#elif defined(__aarch64__) || defined(_M_ARM64)
#undef KMP_ARCH_AARCH64
#define KMP_ARCH_AARCH64 1
+#elif defined(__arm__) || defined(_M_ARM)
+#undef KMP_ARCH_ARMV7
+#define KMP_ARCH_ARMV7 1
#else
#undef KMP_ARCH_X86
#define KMP_ARCH_X86 1
@@ -114,13 +147,26 @@
#undef KMP_ARCH_X86
#define KMP_ARCH_X86 1
#elif defined __powerpc64__
-#if defined(_CALL_ELF) && _CALL_ELF == 2
+#if defined(_CALL_ELF)
+#if _CALL_ELF == 2
#undef KMP_ARCH_PPC64_ELFv2
#define KMP_ARCH_PPC64_ELFv2 1
#else
#undef KMP_ARCH_PPC64_ELFv1
#define KMP_ARCH_PPC64_ELFv1 1
#endif
+#elif defined KMP_OS_AIX
+#undef KMP_ARCH_PPC64_XCOFF
+#define KMP_ARCH_PPC64_XCOFF 1
+#endif
+#elif defined(__powerpc__) && defined(KMP_OS_AIX)
+#undef KMP_ARCH_PPC_XCOFF
+#define KMP_ARCH_PPC_XCOFF 1
+#undef KMP_ARCH_PPC
+#define KMP_ARCH_PPC 1
+#elif defined __ARM64_ARCH_8_32__
+#undef KMP_ARCH_AARCH64_32
+#define KMP_ARCH_AARCH64_32 1
#elif defined __aarch64__
#undef KMP_ARCH_AARCH64
#define KMP_ARCH_AARCH64 1
@@ -135,6 +181,15 @@
#elif defined __riscv && __riscv_xlen == 64
#undef KMP_ARCH_RISCV64
#define KMP_ARCH_RISCV64 1
+#elif defined __loongarch__ && __loongarch_grlen == 64
+#undef KMP_ARCH_LOONGARCH64
+#define KMP_ARCH_LOONGARCH64 1
+#elif defined __ve__
+#undef KMP_ARCH_VE
+#define KMP_ARCH_VE 1
+#elif defined __s390x__
+#undef KMP_ARCH_S390X
+#define KMP_ARCH_S390X 1
#endif
#endif
@@ -174,6 +229,13 @@
#define KMP_ARCH_ARM 1
#endif
+#if defined(__wasm32__)
+#define KMP_ARCH_WASM 1
+#endif
+
+#define KMP_ARCH_PPC64 \
+ (KMP_ARCH_PPC64_ELFv2 || KMP_ARCH_PPC64_ELFv1 || KMP_ARCH_PPC64_XCOFF)
+
#if defined(__MIC__) || defined(__MIC2__)
#define KMP_MIC 1
#if __MIC2__ || __KNC__
@@ -190,7 +252,9 @@
#endif
/* Specify 32 bit architectures here */
-#define KMP_32_BIT_ARCH (KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS)
+#define KMP_32_BIT_ARCH \
+ (KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \
+ KMP_ARCH_PPC || KMP_ARCH_AARCH64_32)
// Platforms which support Intel(R) Many Integrated Core Architecture
#define KMP_MIC_SUPPORTED \
@@ -199,7 +263,9 @@
// TODO: Fixme - This is clever, but really fugly
#if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \
KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + \
- KMP_ARCH_RISCV64)
+ KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE + \
+ KMP_ARCH_S390X + KMP_ARCH_WASM + KMP_ARCH_PPC + \
+ KMP_ARCH_AARCH64_32)
#error Unknown or unsupported architecture
#endif
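Every KMP_OS_* / KMP_ARCH_* macro above is first defined to 0 and then flipped to 1 for exactly one detected platform, so the "1 != A + B + ..." preprocessor sums catch both an unknown platform and a double detection at compile time. A stripped-down sketch of the pattern:

    // Minimal version of the detection pattern used in kmp_platform.h.
    #define DEMO_OS_LINUX 0
    #define DEMO_OS_WINDOWS 0
    #define DEMO_OS_OTHER 0

    #if defined(_WIN32)
    #undef DEMO_OS_WINDOWS
    #define DEMO_OS_WINDOWS 1
    #elif defined(__linux__)
    #undef DEMO_OS_LINUX
    #define DEMO_OS_LINUX 1
    #else
    #undef DEMO_OS_OTHER
    #define DEMO_OS_OTHER 1
    #endif

    // Exactly one macro may be 1; the sum turns that into a build-time check.
    #if (1 != DEMO_OS_LINUX + DEMO_OS_WINDOWS + DEMO_OS_OTHER)
    #error Unknown or ambiguous OS
    #endif

    int main() { return 0; }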
diff --git a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
index bfbff03bd62..c26992ab98b 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
@@ -24,6 +24,7 @@
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
+#include "kmp_utils.h"
#if KMP_USE_HIER_SCHED
#error #include "kmp_dispatch_hier.h"
#endif
@@ -47,8 +48,9 @@ static char *ProfileTraceFile = nullptr;
#include <process.h>
#endif
-#if KMP_OS_WINDOWS
-// windows does not need include files as it doesn't use shared memory
+#ifndef KMP_USE_SHM
+// Windows and WASI do not need these include files as they don't use shared
+// memory.
#else
#include <sys/mman.h>
#include <sys/stat.h>
@@ -111,6 +113,21 @@ void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
+static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
+ int level) {
+ kmp_nested_nthreads_t *new_nested_nth =
+ (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
+ sizeof(kmp_nested_nthreads_t));
+ int new_size = level + thr->th.th_set_nested_nth_sz;
+ new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
+ for (int i = 0; i < level + 1; ++i)
+ new_nested_nth->nth[i] = 0;
+ for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
+ new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
+ new_nested_nth->size = new_nested_nth->used = new_size;
+ return new_nested_nth;
+}
+
/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
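The new __kmp_override_nested_nth helper above builds a nesting-level-indexed copy of the thread's pending nested-nthreads list: entries 0..level are zeroed and the tail of th_set_nested_nth (from index 1 on) is shifted to start at level + 1. A small reproduction of that index arithmetic with plain vectors makes the resulting layout concrete (the skipped first list entry is presumably consumed by the current fork and handled by the caller):

    #include <cstdio>
    #include <vector>

    // Mirrors __kmp_override_nested_nth's index arithmetic (see the hunk above).
    static std::vector<int> override_nested_nth(const std::vector<int> &set_nested_nth,
                                                int level) {
      int new_size = level + static_cast<int>(set_nested_nth.size());
      std::vector<int> out(new_size, 0);  // levels 0..level stay 0
      for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
        out[i] = set_nested_nth[j];       // shift the rest past `level`
      return out;
    }

    int main() {
      // A per-thread list {8, 4, 2} applied at nesting level 2 becomes
      // {0, 0, 0, 4, 2}: the tail governs the deeper nesting levels.
      for (int v : override_nested_nth({8, 4, 2}, 2))
        std::printf("%d ", v);
      std::printf("\n");
    }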
@@ -178,7 +195,12 @@ int __kmp_get_global_thread_id() {
if (stack_diff <= stack_size) {
/* The only way we can be closer than the allocated */
/* stack size is if we are running on this thread. */
- KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
+ // __kmp_gtid_get_specific can return negative value because this
+ // function can be called by thread destructor. However, before the
+ // thread destructor is called, the value of the corresponding
+ // thread-specific data will be reset to NULL.
+ KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
+ __kmp_gtid_get_specific() == i);
return i;
}
}
@@ -196,6 +218,12 @@ int __kmp_get_global_thread_id() {
if (i < 0)
return i;
+ // other_threads[i] can be nullptr at this point because the corresponding
+ // thread could have already been destroyed. This can happen when this
+ // function is called from the library shutdown routine.
+ if (!TCR_SYNC_PTR(other_threads[i]))
+ return i;
+
/* dynamically updated stack window for uber threads to avoid get_specific
call */
if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
@@ -405,6 +433,8 @@ void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
}
#endif /* KMP_PRINT_DATA_PLACEMENT */
__kmp_release_bootstrap_lock(&__kmp_stdio_lock);
+
+ va_end(ap);
}
void __kmp_warn(char const *format, ...) {
@@ -433,26 +463,26 @@ void __kmp_abort_process() {
__kmp_dump_debug_buffer();
}
- if (KMP_OS_WINDOWS) {
- // Let other threads know of abnormal termination and prevent deadlock
- // if abort happened during library initialization or shutdown
- __kmp_global.g.g_abort = SIGABRT;
-
- /* On Windows* OS by default abort() causes pop-up error box, which stalls
- nightly testing. Unfortunately, we cannot reliably suppress pop-up error
- boxes. _set_abort_behavior() works well, but this function is not
- available in VS7 (this is not problem for DLL, but it is a problem for
- static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
- help, at least in some versions of MS C RTL.
-
- It seems following sequence is the only way to simulate abort() and
- avoid pop-up error box. */
- raise(SIGABRT);
- _exit(3); // Just in case, if signal ignored, exit anyway.
- } else {
- __kmp_unregister_library();
- abort();
- }
+#if KMP_OS_WINDOWS
+ // Let other threads know of abnormal termination and prevent deadlock
+ // if abort happened during library initialization or shutdown
+ __kmp_global.g.g_abort = SIGABRT;
+
+ /* On Windows* OS by default abort() causes pop-up error box, which stalls
+ nightly testing. Unfortunately, we cannot reliably suppress pop-up error
+ boxes. _set_abort_behavior() works well, but this function is not
+ available in VS7 (this is not problem for DLL, but it is a problem for
+ static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
+ help, at least in some versions of MS C RTL.
+
+ It seems following sequence is the only way to simulate abort() and
+ avoid pop-up error box. */
+ raise(SIGABRT);
+ _exit(3); // Just in case, if signal ignored, exit anyway.
+#else
+ __kmp_unregister_library();
+ abort();
+#endif
__kmp_infinite_loop();
__kmp_release_bootstrap_lock(&__kmp_exit_lock);
@@ -553,6 +583,14 @@ static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
/* ------------------------------------------------------------------------ */
+#if ENABLE_LIBOMPTARGET
+static void __kmp_init_omptarget() {
+ __kmp_init_target_task();
+}
+#endif
+
+/* ------------------------------------------------------------------------ */
+
#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS
@@ -907,6 +945,11 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
__kmp_get_gtid(), new_nthreads, set_nthreads));
}
#endif // KMP_DEBUG
+
+ if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
+ __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
+ this_thr->th.th_nt_msg);
+ }
return new_nthreads;
}
@@ -1011,6 +1054,47 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
__kmp_partition_places(team);
}
#endif
+
+ if (team->t.t_nproc > 1 &&
+ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ team->t.b->update_num_threads(team->t.t_nproc);
+ __kmp_add_threads_to_team(team, team->t.t_nproc);
+ }
+ }
+
+ // Take care of primary thread's task state
+ if (__kmp_tasking_mode != tskm_immediate_exec) {
+ if (use_hot_team) {
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
+ KA_TRACE(
+ 20,
+ ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
+ "%p, new task_team %p / team %p\n",
+ __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
+ team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
+ team));
+
+ // Store primary thread's current task state on new team
+ KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+ master_th->th.th_task_state);
+
+ // Restore primary thread's task state to hot team's state
+ // by using thread 1's task state
+ if (team->t.t_nproc > 1) {
+ KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
+ team->t.t_threads[1]->th.th_task_state == 1);
+ KMP_CHECK_UPDATE(master_th->th.th_task_state,
+ team->t.t_threads[1]->th.th_task_state);
+ } else {
+ master_th->th.th_task_state = 0;
+ }
+ } else {
+ // Store primary thread's current task_state on new team
+ KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+ master_th->th.th_task_state);
+ // Are not using hot team, so set task state to 0.
+ master_th->th.th_task_state = 0;
+ }
}
if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
@@ -1116,18 +1200,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
KMP_DEBUG_ASSERT(serial_team);
KMP_MB();
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- KMP_DEBUG_ASSERT(
- this_thr->th.th_task_team ==
- this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
- KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
- NULL);
- KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
- "team %p, new task_team = NULL\n",
- global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
- this_thr->th.th_task_team = NULL;
- }
-
kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
@@ -1139,6 +1211,9 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// Reset for next parallel region
this_thr->th.th_set_proc_bind = proc_bind_default;
+ // Reset num_threads for next parallel region
+ this_thr->th.th_set_nproc = 0;
+
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
@@ -1210,6 +1285,12 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
serial_team->t.t_serialized = 1;
serial_team->t.t_nproc = 1;
serial_team->t.t_parent = this_thr->th.th_team;
+ if (this_thr->th.th_team->t.t_nested_nth)
+ serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
+ else
+ serial_team->t.t_nested_nth = &__kmp_nested_nth;
+ // Save previous team's task state on serial team structure
+ serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
this_thr->th.th_team = serial_team;
serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
@@ -1229,9 +1310,11 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// Thread value exists in the nested nthreads array for the next nested
// level
- if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
- this_thr->th.th_current_task->td_icvs.nproc =
- __kmp_nested_nth.nth[level + 1];
+ kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
+ if (this_thr->th.th_team->t.t_nested_nth)
+ nested_nth = this_thr->th.th_team->t.t_nested_nth;
+ if (nested_nth->used && (level + 1 < nested_nth->used)) {
+ this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
}
if (__kmp_nested_proc_bind.used &&
@@ -1249,6 +1332,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
this_thr->th.th_team_nproc = 1;
this_thr->th.th_team_master = this_thr;
this_thr->th.th_team_serialized = 1;
+ this_thr->th.th_task_team = NULL;
+ this_thr->th.th_task_state = 0;
serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
@@ -1280,10 +1365,14 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
int level = this_thr->th.th_team->t.t_level;
// Thread value exists in the nested nthreads array for the next nested
// level
- if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
- this_thr->th.th_current_task->td_icvs.nproc =
- __kmp_nested_nth.nth[level + 1];
+
+ kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
+ if (serial_team->t.t_nested_nth)
+ nested_nth = serial_team->t.t_nested_nth;
+ if (nested_nth->used && (level + 1 < nested_nth->used)) {
+ this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
}
+
serial_team->t.t_level++;
KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
"of serial team %p to %d\n",
@@ -1300,6 +1389,9 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
}
this_thr->th.th_dispatch = serial_team->t.t_dispatch;
+ /* allocate/push task team stack */
+ __kmp_push_task_team_node(this_thr, serial_team);
+
KMP_MB();
}
KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
@@ -1350,6 +1442,486 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
#endif
}
+// Test if this fork is for a team closely nested in a teams construct
+static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
+ microtask_t microtask, int level,
+ int teams_level, kmp_va_list ap) {
+ return (master_th->th.th_teams_microtask && ap &&
+ microtask != (microtask_t)__kmp_teams_master && level == teams_level);
+}
+
+// Test if this fork is for the teams construct, i.e. to form the outer league
+// of teams
+static inline bool __kmp_is_entering_teams(int active_level, int level,
+ int teams_level, kmp_va_list ap) {
+ return ((ap == NULL && active_level == 0) ||
+ (ap && teams_level > 0 && teams_level == level));
+}
+
+// AC: This is start of parallel that is nested inside teams construct.
+// The team is actual (hot), all workers are ready at the fork barrier.
+// No lock needed to initialize the team a bit, then free workers.
+static inline int
+__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
+ kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
+ enum fork_context_e call_context, microtask_t microtask,
+ launch_t invoker, int master_set_numthreads, int level,
+#if OMPT_SUPPORT
+ ompt_data_t ompt_parallel_data, void *return_address,
+#endif
+ kmp_va_list ap) {
+ void **argv;
+ int i;
+
+ parent_team->t.t_ident = loc;
+ __kmp_alloc_argv_entries(argc, parent_team, TRUE);
+ parent_team->t.t_argc = argc;
+ argv = (void **)parent_team->t.t_argv;
+ for (i = argc - 1; i >= 0; --i) {
+ *argv++ = va_arg(kmp_va_deref(ap), void *);
+ }
+ // Increment our nested depth levels, but not increase the serialization
+ if (parent_team == master_th->th.th_serial_team) {
+ // AC: we are in serialized parallel
+ __kmpc_serialized_parallel(loc, gtid);
+ KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
+
+ if (call_context == fork_context_gnu) {
+ // AC: need to decrement t_serialized for enquiry functions to work
+ // correctly, will restore at join time
+ parent_team->t.t_serialized--;
+ return TRUE;
+ }
+
+#if OMPD_SUPPORT
+ parent_team->t.t_pkfn = microtask;
+#endif
+
+#if OMPT_SUPPORT
+ void *dummy;
+ void **exit_frame_p;
+ ompt_data_t *implicit_task_data;
+ ompt_lw_taskteam_t lw_taskteam;
+
+ if (ompt_enabled.enabled) {
+ __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+ &ompt_parallel_data, return_address);
+ exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
+
+ __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
+ // Don't use lw_taskteam after linking. Content was swapped.
+
+ /* OMPT implicit task begin */
+ implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
+ if (ompt_enabled.ompt_callback_implicit_task) {
+ OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
+ ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+ ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
+ 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
+ }
+
+ /* OMPT state */
+ master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+ } else {
+ exit_frame_p = &dummy;
+ }
+#endif
+
+ // AC: need to decrement t_serialized for enquiry functions to work
+ // correctly, will restore at join time
+ parent_team->t.t_serialized--;
+
+ {
+ KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+ KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
+ __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
+#if OMPT_SUPPORT
+ ,
+ exit_frame_p
+#endif
+ );
+ }
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ *exit_frame_p = NULL;
+ OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
+ if (ompt_enabled.ompt_callback_implicit_task) {
+ ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+ ompt_scope_end, NULL, implicit_task_data, 1,
+ OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
+ }
+ ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
+ __ompt_lw_taskteam_unlink(master_th);
+ if (ompt_enabled.ompt_callback_parallel_end) {
+ ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+ &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
+ OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
+ }
+ master_th->th.ompt_thread_info.state = ompt_state_overhead;
+ }
+#endif
+ return TRUE;
+ }
+
+ parent_team->t.t_pkfn = microtask;
+ parent_team->t.t_invoke = invoker;
+ KMP_ATOMIC_INC(&root->r.r_in_parallel);
+ parent_team->t.t_active_level++;
+ parent_team->t.t_level++;
+ parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
+
+ // If the threads allocated to the team are less than the thread limit, update
+ // the thread limit here. th_teams_size.nth is specific to this team nested
+ // in a teams construct, the team is fully created, and we're about to do
+ // the actual fork. Best to do this here so that the subsequent uses below
+ // and in the join have the correct value.
+ master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ ompt_lw_taskteam_t lw_taskteam;
+ __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
+ return_address);
+ __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
+ }
+#endif
+
+ /* Change number of threads in the team if requested */
+ if (master_set_numthreads) { // The parallel has num_threads clause
+ if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
+ // AC: only can reduce number of threads dynamically, can't increase
+ kmp_info_t **other_threads = parent_team->t.t_threads;
+ // NOTE: if using distributed barrier, we need to run this code block
+ // even when the team size appears not to have changed from the max.
+ int old_proc = master_th->th.th_teams_size.nth;
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
+ __kmp_add_threads_to_team(parent_team, master_set_numthreads);
+ }
+ parent_team->t.t_nproc = master_set_numthreads;
+ for (i = 0; i < master_set_numthreads; ++i) {
+ other_threads[i]->th.th_team_nproc = master_set_numthreads;
+ }
+ }
+ // Keep extra threads hot in the team for possible next parallels
+ master_th->th.th_set_nproc = 0;
+ }
+
+#if USE_DEBUGGER
+ if (__kmp_debugging) { // Let debugger override number of threads.
+ int nth = __kmp_omp_num_threads(loc);
+ if (nth > 0) { // 0 means debugger doesn't want to change num threads
+ master_set_numthreads = nth;
+ }
+ }
+#endif
+
+ // Figure out the proc_bind policy for the nested parallel within teams
+ kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
+ // proc_bind_default means don't update
+ kmp_proc_bind_t proc_bind_icv = proc_bind_default;
+ if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
+ proc_bind = proc_bind_false;
+ } else {
+ // No proc_bind clause specified; use current proc-bind-var
+ if (proc_bind == proc_bind_default) {
+ proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
+ }
+ /* else: The proc_bind policy was specified explicitly on parallel clause.
+ This overrides proc-bind-var for this parallel region, but does not
+ change proc-bind-var. */
+ // Figure the value of proc-bind-var for the child threads.
+ if ((level + 1 < __kmp_nested_proc_bind.used) &&
+ (__kmp_nested_proc_bind.bind_types[level + 1] !=
+ master_th->th.th_current_task->td_icvs.proc_bind)) {
+ proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+ }
+ }
+ KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
+ // Need to change the bind-var ICV to correct value for each implicit task
+ if (proc_bind_icv != proc_bind_default &&
+ master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
+ kmp_info_t **other_threads = parent_team->t.t_threads;
+ for (i = 0; i < master_th->th.th_team_nproc; ++i) {
+ other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
+ }
+ }
+ // Reset for next parallel region
+ master_th->th.th_set_proc_bind = proc_bind_default;
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+ if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
+ KMP_ITT_DEBUG) &&
+ __kmp_forkjoin_frames_mode == 3 &&
+ parent_team->t.t_active_level == 1 // only report frames at level 1
+ && master_th->th.th_teams_size.nteams == 1) {
+ kmp_uint64 tmp_time = __itt_get_timestamp();
+ master_th->th.th_frame_time = tmp_time;
+ parent_team->t.t_region_time = tmp_time;
+ }
+ if (__itt_stack_caller_create_ptr) {
+ KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
+ // create new stack stitching id before entering fork barrier
+ parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
+ }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+#if KMP_AFFINITY_SUPPORTED
+ __kmp_partition_places(parent_team);
+#endif
+
+ KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
+ "master_th=%p, gtid=%d\n",
+ root, parent_team, master_th, gtid));
+ __kmp_internal_fork(loc, gtid, parent_team);
+ KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
+ "master_th=%p, gtid=%d\n",
+ root, parent_team, master_th, gtid));
+
+ if (call_context == fork_context_gnu)
+ return TRUE;
+
+ /* Invoke microtask for PRIMARY thread */
+ KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
+ parent_team->t.t_id, parent_team->t.t_pkfn));
+
+ if (!parent_team->t.t_invoke(gtid)) {
+ KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
+ }
+ KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
+ parent_team->t.t_id, parent_team->t.t_pkfn));
+ KMP_MB(); /* Flush all pending memory write invalidates. */
+
+ KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
+
+ return TRUE;
+}
+
+// Create a serialized parallel region
+static inline int
+__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
+ kmp_int32 argc, microtask_t microtask, launch_t invoker,
+ kmp_info_t *master_th, kmp_team_t *parent_team,
+#if OMPT_SUPPORT
+ ompt_data_t *ompt_parallel_data, void **return_address,
+ ompt_data_t **parent_task_data,
+#endif
+ kmp_va_list ap) {
+ kmp_team_t *team;
+ int i;
+ void **argv;
+
+/* josh todo: hypothetical question: what do we do for OS X*? */
+#if KMP_OS_LINUX && \
+ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+ SimpleVLA<void *> args(argc);
+#else
+ void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
+#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
+ KMP_ARCH_AARCH64) */
+
+ KA_TRACE(
+ 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
+
+ __kmpc_serialized_parallel(loc, gtid);
+
+#if OMPD_SUPPORT
+ master_th->th.th_serial_team->t.t_pkfn = microtask;
+#endif
+
+ if (call_context == fork_context_intel) {
+ /* TODO this sucks, use the compiler itself to pass args! :) */
+ master_th->th.th_serial_team->t.t_ident = loc;
+ if (!ap) {
+ // revert change made in __kmpc_serialized_parallel()
+ master_th->th.th_serial_team->t.t_level--;
+// Get args from parent team for teams construct
+
+#if OMPT_SUPPORT
+ void *dummy;
+ void **exit_frame_p;
+ ompt_task_info_t *task_info;
+ ompt_lw_taskteam_t lw_taskteam;
+
+ if (ompt_enabled.enabled) {
+ __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+ ompt_parallel_data, *return_address);
+
+ __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
+ // don't use lw_taskteam after linking. content was swapped
+ task_info = OMPT_CUR_TASK_INFO(master_th);
+ exit_frame_p = &(task_info->frame.exit_frame.ptr);
+ if (ompt_enabled.ompt_callback_implicit_task) {
+ OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
+ ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+ ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
+ &(task_info->task_data), 1,
+ OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
+ }
+
+ /* OMPT state */
+ master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+ } else {
+ exit_frame_p = &dummy;
+ }
+#endif
+
+ {
+ KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+ KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
+ __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
+#if OMPT_SUPPORT
+ ,
+ exit_frame_p
+#endif
+ );
+ }
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ *exit_frame_p = NULL;
+ if (ompt_enabled.ompt_callback_implicit_task) {
+ ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+ ompt_scope_end, NULL, &(task_info->task_data), 1,
+ OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
+ }
+ *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
+ __ompt_lw_taskteam_unlink(master_th);
+ if (ompt_enabled.ompt_callback_parallel_end) {
+ ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+ ompt_parallel_data, *parent_task_data,
+ OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
+ }
+ master_th->th.ompt_thread_info.state = ompt_state_overhead;
+ }
+#endif
+ } else if (microtask == (microtask_t)__kmp_teams_master) {
+ KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
+ team = master_th->th.th_team;
+ // team->t.t_pkfn = microtask;
+ team->t.t_invoke = invoker;
+ __kmp_alloc_argv_entries(argc, team, TRUE);
+ team->t.t_argc = argc;
+ argv = (void **)team->t.t_argv;
+ for (i = argc - 1; i >= 0; --i)
+ *argv++ = va_arg(kmp_va_deref(ap), void *);
+ // AC: revert change made in __kmpc_serialized_parallel()
+ // because initial code in teams should have level=0
+ team->t.t_level--;
+ // AC: call special invoker for outer "parallel" of teams construct
+ invoker(gtid);
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
+ if (ompt_enabled.ompt_callback_implicit_task) {
+ ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+ ompt_scope_end, NULL, &(task_info->task_data), 0,
+ OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
+ }
+ if (ompt_enabled.ompt_callback_parallel_end) {
+ ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+ ompt_parallel_data, *parent_task_data,
+ OMPT_INVOKER(call_context) | ompt_parallel_league,
+ *return_address);
+ }
+ master_th->th.ompt_thread_info.state = ompt_state_overhead;
+ }
+#endif
+ } else {
+ argv = args;
+ for (i = argc - 1; i >= 0; --i)
+ *argv++ = va_arg(kmp_va_deref(ap), void *);
+ KMP_MB();
+
+#if OMPT_SUPPORT
+ void *dummy;
+ void **exit_frame_p;
+ ompt_task_info_t *task_info;
+ ompt_lw_taskteam_t lw_taskteam;
+ ompt_data_t *implicit_task_data;
+
+ if (ompt_enabled.enabled) {
+ __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+ ompt_parallel_data, *return_address);
+ __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
+ // don't use lw_taskteam after linking. content was swapped
+ task_info = OMPT_CUR_TASK_INFO(master_th);
+ exit_frame_p = &(task_info->frame.exit_frame.ptr);
+
+ /* OMPT implicit task begin */
+ implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
+ if (ompt_enabled.ompt_callback_implicit_task) {
+ ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+ ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
+ implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
+ ompt_task_implicit);
+ OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
+ }
+
+ /* OMPT state */
+ master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+ } else {
+ exit_frame_p = &dummy;
+ }
+#endif
+
+ {
+ KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+ KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
+ __kmp_invoke_microtask(microtask, gtid, 0, argc, args
+#if OMPT_SUPPORT
+ ,
+ exit_frame_p
+#endif
+ );
+ }
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ *exit_frame_p = NULL;
+ if (ompt_enabled.ompt_callback_implicit_task) {
+ ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+ ompt_scope_end, NULL, &(task_info->task_data), 1,
+ OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
+ }
+
+ *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
+ __ompt_lw_taskteam_unlink(master_th);
+ if (ompt_enabled.ompt_callback_parallel_end) {
+ ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+ ompt_parallel_data, *parent_task_data,
+ OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
+ }
+ master_th->th.ompt_thread_info.state = ompt_state_overhead;
+ }
+#endif
+ }
+ } else if (call_context == fork_context_gnu) {
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ ompt_lw_taskteam_t lwt;
+ __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
+ *return_address);
+
+ lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
+ __ompt_lw_taskteam_link(&lwt, master_th, 1);
+ }
+// don't use lw_taskteam after linking. content was swapped
+#endif
+
+ // we were called from GNU native code
+ KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
+ return FALSE;
+ } else {
+ KMP_ASSERT2(call_context < fork_context_last,
+ "__kmp_serial_fork_call: unknown fork_context parameter");
+ }
+
+ KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
+ KMP_MB();
+ return FALSE;
+}
+
/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
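__kmp_serial_fork_call above sizes its argument buffer with SimpleVLA<void *> from the newly added kmp_utils.h rather than a C-style VLA, which is not valid C++. The class itself is not shown in this section of the diff; a plausible minimal stand-in is sketched below (an assumption about its shape, not the vendored implementation):

    #include <cstddef>
    #include <memory>

    // Hypothetical stand-in for kmp_utils.h's SimpleVLA: a runtime-sized,
    // heap-backed array with array-like access, usable where a C VLA is not
    // available in C++.
    template <typename T> class SimpleVLADemo {
      std::unique_ptr<T[]> storage_;

    public:
      explicit SimpleVLADemo(std::size_t n) : storage_(new T[n]()) {}
      T &operator[](std::size_t i) { return storage_[i]; }
      operator T *() { return storage_.get(); } // decay like an array would
    };

    int main() {
      SimpleVLADemo<void *> args(4); // mirrors: SimpleVLA<void *> args(argc);
      args[0] = nullptr;
      void **argv = args; // usable wherever a void ** buffer is expected
      (void)argv;
    }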
@@ -1367,6 +1939,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
int nthreads;
int master_active;
int master_set_numthreads;
+ int task_thread_limit = 0;
int level;
int active_level;
int teams_level;
@@ -1395,20 +1968,23 @@ int __kmp_fork_call(ident_t *loc, int gtid,
__kmp_resume_if_soft_paused();
/* setup current data */
- master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
- // shutdown
+ // AC: potentially unsafe, not in sync with library shutdown,
+ // __kmp_threads can be freed
+ master_th = __kmp_threads[gtid];
+
parent_team = master_th->th.th_team;
master_tid = master_th->th.th_info.ds.ds_tid;
master_this_cons = master_th->th.th_local.this_construct;
root = master_th->th.th_root;
master_active = root->r.r_active;
master_set_numthreads = master_th->th.th_set_nproc;
+ task_thread_limit =
+ master_th->th.th_current_task->td_icvs.task_thread_limit;
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
- ompt_data_t *parent_task_data;
- ompt_frame_t *ompt_frame;
- ompt_data_t *implicit_task_data;
+ ompt_data_t *parent_task_data = NULL;
+ ompt_frame_t *ompt_frame = NULL;
void *return_address = NULL;
if (ompt_enabled.enabled) {
@@ -1458,267 +2034,44 @@ int __kmp_fork_call(ident_t *loc, int gtid,
master_th->th.th_ident = loc;
- if (master_th->th.th_teams_microtask && ap &&
- microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
- // AC: This is start of parallel that is nested inside teams construct.
- // The team is actual (hot), all workers are ready at the fork barrier.
- // No lock needed to initialize the team a bit, then free workers.
- parent_team->t.t_ident = loc;
- __kmp_alloc_argv_entries(argc, parent_team, TRUE);
- parent_team->t.t_argc = argc;
- argv = (void **)parent_team->t.t_argv;
- for (i = argc - 1; i >= 0; --i)
- *argv++ = va_arg(kmp_va_deref(ap), void *);
- // Increment our nested depth levels, but not increase the serialization
- if (parent_team == master_th->th.th_serial_team) {
- // AC: we are in serialized parallel
- __kmpc_serialized_parallel(loc, gtid);
- KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
-
- if (call_context == fork_context_gnu) {
- // AC: need to decrement t_serialized for enquiry functions to work
- // correctly, will restore at join time
- parent_team->t.t_serialized--;
- return TRUE;
- }
-
-#if OMPD_SUPPORT
- parent_team->t.t_pkfn = microtask;
-#endif
-
+ // Parallel closely nested in teams construct:
+ if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
+ return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
+ call_context, microtask, invoker,
+ master_set_numthreads, level,
#if OMPT_SUPPORT
- void *dummy;
- void **exit_frame_p;
-
- ompt_lw_taskteam_t lw_taskteam;
-
- if (ompt_enabled.enabled) {
- __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
- &ompt_parallel_data, return_address);
- exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
-
- __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
- // don't use lw_taskteam after linking. content was swaped
-
- /* OMPT implicit task begin */
- implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
- if (ompt_enabled.ompt_callback_implicit_task) {
- OMPT_CUR_TASK_INFO(master_th)->thread_num =
- __kmp_tid_from_gtid(gtid);
- ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
- implicit_task_data, 1,
- OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
- }
-
- /* OMPT state */
- master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
- } else {
- exit_frame_p = &dummy;
- }
-#endif
- // AC: need to decrement t_serialized for enquiry functions to work
- // correctly, will restore at join time
- parent_team->t.t_serialized--;
-
- {
- KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
- KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
- __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
-#if OMPT_SUPPORT
- ,
- exit_frame_p
-#endif
- );
- }
-
-#if OMPT_SUPPORT
- if (ompt_enabled.enabled) {
- *exit_frame_p = NULL;
- OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
- if (ompt_enabled.ompt_callback_implicit_task) {
- ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_end, NULL, implicit_task_data, 1,
- OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
- }
- ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
- __ompt_lw_taskteam_unlink(master_th);
- if (ompt_enabled.ompt_callback_parallel_end) {
- ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
- &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
- OMPT_INVOKER(call_context) | ompt_parallel_team,
- return_address);
- }
- master_th->th.ompt_thread_info.state = ompt_state_overhead;
- }
-#endif
- return TRUE;
- }
-
- parent_team->t.t_pkfn = microtask;
- parent_team->t.t_invoke = invoker;
- KMP_ATOMIC_INC(&root->r.r_in_parallel);
- parent_team->t.t_active_level++;
- parent_team->t.t_level++;
- parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
-
-#if OMPT_SUPPORT
- if (ompt_enabled.enabled) {
- ompt_lw_taskteam_t lw_taskteam;
- __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
- &ompt_parallel_data, return_address);
- __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
- }
-#endif
-
- /* Change number of threads in the team if requested */
- if (master_set_numthreads) { // The parallel has num_threads clause
- if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
- // AC: only can reduce number of threads dynamically, can't increase
- kmp_info_t **other_threads = parent_team->t.t_threads;
- // NOTE: if using distributed barrier, we need to run this code block
- // even when the team size appears not to have changed from the max.
- int old_proc = master_th->th.th_teams_size.nth;
- if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
- bp_dist_bar) {
- __kmp_resize_dist_barrier(parent_team, old_proc,
- master_set_numthreads);
- __kmp_add_threads_to_team(parent_team, master_set_numthreads);
- }
- parent_team->t.t_nproc = master_set_numthreads;
- for (i = 0; i < master_set_numthreads; ++i) {
- other_threads[i]->th.th_team_nproc = master_set_numthreads;
- }
- }
- // Keep extra threads hot in the team for possible next parallels
- master_th->th.th_set_nproc = 0;
- }
-
-#if USE_DEBUGGER
- if (__kmp_debugging) { // Let debugger override number of threads.
- int nth = __kmp_omp_num_threads(loc);
- if (nth > 0) { // 0 means debugger doesn't want to change num threads
- master_set_numthreads = nth;
- }
- }
-#endif
-
- // Figure out the proc_bind policy for the nested parallel within teams
- kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
- // proc_bind_default means don't update
- kmp_proc_bind_t proc_bind_icv = proc_bind_default;
- if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
- proc_bind = proc_bind_false;
- } else {
- // No proc_bind clause specified; use current proc-bind-var
- if (proc_bind == proc_bind_default) {
- proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
- }
- /* else: The proc_bind policy was specified explicitly on parallel
- clause.
- This overrides proc-bind-var for this parallel region, but does not
- change proc-bind-var. */
- // Figure the value of proc-bind-var for the child threads.
- if ((level + 1 < __kmp_nested_proc_bind.used) &&
- (__kmp_nested_proc_bind.bind_types[level + 1] !=
- master_th->th.th_current_task->td_icvs.proc_bind)) {
- proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
- }
- }
- KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
- // Need to change the bind-var ICV to correct value for each implicit task
- if (proc_bind_icv != proc_bind_default &&
- master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
- kmp_info_t **other_threads = parent_team->t.t_threads;
- for (i = 0; i < master_th->th.th_team_nproc; ++i) {
- other_threads[i]->th.th_current_task->td_icvs.proc_bind =
- proc_bind_icv;
- }
- }
- // Reset for next parallel region
- master_th->th.th_set_proc_bind = proc_bind_default;
-
-#if USE_ITT_BUILD && USE_ITT_NOTIFY
- if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
- KMP_ITT_DEBUG) &&
- __kmp_forkjoin_frames_mode == 3 &&
- parent_team->t.t_active_level == 1 // only report frames at level 1
- && master_th->th.th_teams_size.nteams == 1) {
- kmp_uint64 tmp_time = __itt_get_timestamp();
- master_th->th.th_frame_time = tmp_time;
- parent_team->t.t_region_time = tmp_time;
- }
- if (__itt_stack_caller_create_ptr) {
- KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
- // create new stack stitching id before entering fork barrier
- parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
- }
-#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
-#if KMP_AFFINITY_SUPPORTED
- __kmp_partition_places(parent_team);
-#endif
-
- KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
- "master_th=%p, gtid=%d\n",
- root, parent_team, master_th, gtid));
- __kmp_internal_fork(loc, gtid, parent_team);
- KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
- "master_th=%p, gtid=%d\n",
- root, parent_team, master_th, gtid));
-
- if (call_context == fork_context_gnu)
- return TRUE;
-
- /* Invoke microtask for PRIMARY thread */
- KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
- parent_team->t.t_id, parent_team->t.t_pkfn));
-
- if (!parent_team->t.t_invoke(gtid)) {
- KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
- }
- KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
- parent_team->t.t_id, parent_team->t.t_pkfn));
- KMP_MB(); /* Flush all pending memory write invalidates. */
-
- KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
-
- return TRUE;
- } // Parallel closely nested in teams construct
-
-#if KMP_DEBUG
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
- parent_team->t.t_task_team[master_th->th.th_task_state]);
- }
+ ompt_parallel_data, return_address,
#endif
+ ap);
+ } // End parallel closely nested in teams construct
// Need this to happen before we determine the number of threads, not while
// we are allocating the team
//__kmp_push_current_task_to_thread(master_th, parent_team, 0);
- int enter_teams = 0;
- if (parent_team->t.t_active_level >=
- master_th->th.th_current_task->td_icvs.max_active_levels) {
+
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
+
+ // Determine the number of threads
+ int enter_teams =
+ __kmp_is_entering_teams(active_level, level, teams_level, ap);
+ if ((!enter_teams &&
+ (parent_team->t.t_active_level >=
+ master_th->th.th_current_task->td_icvs.max_active_levels)) ||
+ (__kmp_library == library_serial)) {
+ KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
nthreads = 1;
} else {
- enter_teams = ((ap == NULL && active_level == 0) ||
- (ap && teams_level > 0 && teams_level == level));
nthreads = master_set_numthreads
? master_set_numthreads
// TODO: get nproc directly from current task
: get__nproc_2(parent_team, master_tid);
+ // Use the thread_limit set for the current target task if one exists;
+ // otherwise go with the deduced nthreads.
+ nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
+ ? task_thread_limit
+ : nthreads;
// Check if we need to take forkjoin lock? (no need for serialized
- // parallel out of teams construct). This code moved here from
- // __kmp_reserve_threads() to speedup nested serialized parallels.
- if (nthreads > 1) {
- if ((get__max_active_levels(master_th) == 1 &&
- (root->r.r_in_parallel && !enter_teams)) ||
- (__kmp_library == library_serial)) {
- KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
- " threads\n",
- gtid, nthreads));
- nthreads = 1;
- }
- }
+ // parallel out of teams construct).
if (nthreads > 1) {
/* determine how many new threads we can use */
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
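The hunk above first serializes up front when the max-active-levels limit is hit outside a teams entry (or when the library runs in serial mode), then derives the requested team size from the num_threads clause or the nproc ICV and caps it by the target task's thread_limit when one is set. The cap is a conditional minimum; an equivalent standalone form for reference:

    #include <cassert>

    static int clamp_nthreads(int nthreads, int task_thread_limit) {
      // A positive task_thread_limit caps the deduced team size; zero or a
      // larger limit leaves it unchanged (mirrors the ternary in the hunk above).
      return (task_thread_limit > 0 && task_thread_limit < nthreads)
                 ? task_thread_limit
                 : nthreads;
    }

    int main() {
      assert(clamp_nthreads(16, 4) == 4);  // limit wins when smaller
      assert(clamp_nthreads(16, 0) == 16); // 0 means "no target-task limit"
      assert(clamp_nthreads(2, 8) == 2);   // limit larger than request: no effect
      return 0;
    }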
@@ -1741,232 +2094,14 @@ int __kmp_fork_call(ident_t *loc, int gtid,
// If we temporarily changed the set number of threads then restore it now
master_th->th.th_set_nproc = 0;
- /* create a serialized parallel region? */
if (nthreads == 1) {
-/* josh todo: hypothetical question: what do we do for OS X*? */
-#if KMP_OS_LINUX && \
- (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
- void *args[argc];
-#else
- void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
-#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
- KMP_ARCH_AARCH64) */
-
- KA_TRACE(20,
- ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
-
- __kmpc_serialized_parallel(loc, gtid);
-
-#if OMPD_SUPPORT
- master_th->th.th_serial_team->t.t_pkfn = microtask;
-#endif
-
- if (call_context == fork_context_intel) {
- /* TODO this sucks, use the compiler itself to pass args! :) */
- master_th->th.th_serial_team->t.t_ident = loc;
- if (!ap) {
- // revert change made in __kmpc_serialized_parallel()
- master_th->th.th_serial_team->t.t_level--;
- // Get args from parent team for teams construct
-
-#if OMPT_SUPPORT
- void *dummy;
- void **exit_frame_p;
- ompt_task_info_t *task_info;
-
- ompt_lw_taskteam_t lw_taskteam;
-
- if (ompt_enabled.enabled) {
- __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
- &ompt_parallel_data, return_address);
-
- __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
- // don't use lw_taskteam after linking. content was swaped
-
- task_info = OMPT_CUR_TASK_INFO(master_th);
- exit_frame_p = &(task_info->frame.exit_frame.ptr);
- if (ompt_enabled.ompt_callback_implicit_task) {
- OMPT_CUR_TASK_INFO(master_th)->thread_num =
- __kmp_tid_from_gtid(gtid);
- ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
- &(task_info->task_data), 1,
- OMPT_CUR_TASK_INFO(master_th)->thread_num,
- ompt_task_implicit);
- }
-
- /* OMPT state */
- master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
- } else {
- exit_frame_p = &dummy;
- }
-#endif
-
- {
- KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
- KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
- __kmp_invoke_microtask(microtask, gtid, 0, argc,
- parent_team->t.t_argv
-#if OMPT_SUPPORT
- ,
- exit_frame_p
-#endif
- );
- }
-
-#if OMPT_SUPPORT
- if (ompt_enabled.enabled) {
- *exit_frame_p = NULL;
- if (ompt_enabled.ompt_callback_implicit_task) {
- ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_end, NULL, &(task_info->task_data), 1,
- OMPT_CUR_TASK_INFO(master_th)->thread_num,
- ompt_task_implicit);
- }
- ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
- __ompt_lw_taskteam_unlink(master_th);
- if (ompt_enabled.ompt_callback_parallel_end) {
- ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
- &ompt_parallel_data, parent_task_data,
- OMPT_INVOKER(call_context) | ompt_parallel_team,
- return_address);
- }
- master_th->th.ompt_thread_info.state = ompt_state_overhead;
- }
-#endif
- } else if (microtask == (microtask_t)__kmp_teams_master) {
- KMP_DEBUG_ASSERT(master_th->th.th_team ==
- master_th->th.th_serial_team);
- team = master_th->th.th_team;
- // team->t.t_pkfn = microtask;
- team->t.t_invoke = invoker;
- __kmp_alloc_argv_entries(argc, team, TRUE);
- team->t.t_argc = argc;
- argv = (void **)team->t.t_argv;
- if (ap) {
- for (i = argc - 1; i >= 0; --i)
- *argv++ = va_arg(kmp_va_deref(ap), void *);
- } else {
- for (i = 0; i < argc; ++i)
- // Get args from parent team for teams construct
- argv[i] = parent_team->t.t_argv[i];
- }
- // AC: revert change made in __kmpc_serialized_parallel()
- // because initial code in teams should have level=0
- team->t.t_level--;
- // AC: call special invoker for outer "parallel" of teams construct
- invoker(gtid);
-#if OMPT_SUPPORT
- if (ompt_enabled.enabled) {
- ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
- if (ompt_enabled.ompt_callback_implicit_task) {
- ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_end, NULL, &(task_info->task_data), 0,
- OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
- }
- if (ompt_enabled.ompt_callback_parallel_end) {
- ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
- &ompt_parallel_data, parent_task_data,
- OMPT_INVOKER(call_context) | ompt_parallel_league,
- return_address);
- }
- master_th->th.ompt_thread_info.state = ompt_state_overhead;
- }
-#endif
- } else {
- argv = args;
- for (i = argc - 1; i >= 0; --i)
- *argv++ = va_arg(kmp_va_deref(ap), void *);
- KMP_MB();
-
-#if OMPT_SUPPORT
- void *dummy;
- void **exit_frame_p;
- ompt_task_info_t *task_info;
-
- ompt_lw_taskteam_t lw_taskteam;
-
- if (ompt_enabled.enabled) {
- __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
- &ompt_parallel_data, return_address);
- __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
- // don't use lw_taskteam after linking. content was swaped
- task_info = OMPT_CUR_TASK_INFO(master_th);
- exit_frame_p = &(task_info->frame.exit_frame.ptr);
-
- /* OMPT implicit task begin */
- implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
- if (ompt_enabled.ompt_callback_implicit_task) {
- ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
- implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
- ompt_task_implicit);
- OMPT_CUR_TASK_INFO(master_th)->thread_num =
- __kmp_tid_from_gtid(gtid);
- }
-
- /* OMPT state */
- master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
- } else {
- exit_frame_p = &dummy;
- }
-#endif
-
- {
- KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
- KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
- __kmp_invoke_microtask(microtask, gtid, 0, argc, args
+ return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
+ invoker, master_th, parent_team,
#if OMPT_SUPPORT
- ,
- exit_frame_p
+ &ompt_parallel_data, &return_address,
+ &parent_task_data,
#endif
- );
- }
-
-#if OMPT_SUPPORT
- if (ompt_enabled.enabled) {
- *exit_frame_p = NULL;
- if (ompt_enabled.ompt_callback_implicit_task) {
- ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_end, NULL, &(task_info->task_data), 1,
- OMPT_CUR_TASK_INFO(master_th)->thread_num,
- ompt_task_implicit);
- }
-
- ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
- __ompt_lw_taskteam_unlink(master_th);
- if (ompt_enabled.ompt_callback_parallel_end) {
- ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
- &ompt_parallel_data, parent_task_data,
- OMPT_INVOKER(call_context) | ompt_parallel_team,
- return_address);
- }
- master_th->th.ompt_thread_info.state = ompt_state_overhead;
- }
-#endif
- }
- } else if (call_context == fork_context_gnu) {
-#if OMPT_SUPPORT
- ompt_lw_taskteam_t lwt;
- __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
- return_address);
-
- lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
- __ompt_lw_taskteam_link(&lwt, master_th, 1);
-// don't use lw_taskteam after linking. content was swaped
-#endif
-
- // we were called from GNU native code
- KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
- return FALSE;
- } else {
- KMP_ASSERT2(call_context < fork_context_last,
- "__kmp_fork_call: unknown fork_context parameter");
- }
-
- KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
- KMP_MB();
- return FALSE;
+ ap);
} // if (nthreads == 1)
// GEH: only modify the executing flag in the case when not serialized
@@ -1988,9 +2123,18 @@ int __kmp_fork_call(ident_t *loc, int gtid,
// See if we need to make a copy of the ICVs.
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
- if ((level + 1 < __kmp_nested_nth.used) &&
- (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
- nthreads_icv = __kmp_nested_nth.nth[level + 1];
+ kmp_nested_nthreads_t *nested_nth = NULL;
+ if (!master_th->th.th_set_nested_nth &&
+ (level + 1 < parent_team->t.t_nested_nth->used) &&
+ (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
+ nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
+ } else if (master_th->th.th_set_nested_nth) {
+ nested_nth = __kmp_override_nested_nth(master_th, level);
+ if ((level + 1 < nested_nth->used) &&
+ (nested_nth->nth[level + 1] != nthreads_icv))
+ nthreads_icv = nested_nth->nth[level + 1];
+ else
+ nthreads_icv = 0; // don't update
} else {
nthreads_icv = 0; // don't update
}
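
Editor's note: the hunk above swaps the global __kmp_nested_nth lookup for the parent team's t_nested_nth list, with a per-thread override. A hedged sketch of the selection rule is below, using a plain container and hypothetical names rather than the runtime's structures.

#include <cassert>
#include <vector>

// Sketch: given the nesting level and a nested-nthreads list such as
// OMP_NUM_THREADS=4,2,1 would produce, decide whether the nproc ICV for the
// next level should be overridden. Returns 0 for "don't update".
static int next_level_nproc(const std::vector<int> &nested_nth, int level,
                            int current_icv) {
  if (level + 1 < (int)nested_nth.size() &&
      nested_nth[level + 1] != current_icv)
    return nested_nth[level + 1];
  return 0; // keep the inherited value
}

int main() {
  std::vector<int> nth = {4, 2, 1}; // e.g. OMP_NUM_THREADS=4,2,1
  assert(next_level_nproc(nth, 0, 4) == 2); // level 1 wants 2 threads
  assert(next_level_nproc(nth, 1, 1) == 0); // already matches, no update
  assert(next_level_nproc(nth, 2, 1) == 0); // list exhausted
  return 0;
}
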
@@ -2099,6 +2243,24 @@ int __kmp_fork_call(ident_t *loc, int gtid,
KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
+ // Check if hot team has potentially outdated list, and if so, free it
+ if (team->t.t_nested_nth &&
+ team->t.t_nested_nth != parent_team->t.t_nested_nth) {
+ KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
+ KMP_INTERNAL_FREE(team->t.t_nested_nth);
+ team->t.t_nested_nth = NULL;
+ }
+ team->t.t_nested_nth = parent_team->t.t_nested_nth;
+ if (master_th->th.th_set_nested_nth) {
+ if (!nested_nth)
+ nested_nth = __kmp_override_nested_nth(master_th, level);
+ team->t.t_nested_nth = nested_nth;
+ KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
+ master_th->th.th_set_nested_nth = NULL;
+ master_th->th.th_set_nested_nth_sz = 0;
+ master_th->th.th_nt_strict = false;
+ }
+
// Update the floating point rounding in the team if required.
propagateFPControl(team);
#if OMPD_SUPPORT
@@ -2106,64 +2268,6 @@ int __kmp_fork_call(ident_t *loc, int gtid,
ompd_bp_parallel_begin();
#endif
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- // Set primary thread's task team to team's task team. Unless this is hot
- // team, it should be NULL.
- KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
- parent_team->t.t_task_team[master_th->th.th_task_state]);
- KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
- "%p, new task_team %p / team %p\n",
- __kmp_gtid_from_thread(master_th),
- master_th->th.th_task_team, parent_team,
- team->t.t_task_team[master_th->th.th_task_state], team));
-
- if (active_level || master_th->th.th_task_team) {
- // Take a memo of primary thread's task_state
- KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
- if (master_th->th.th_task_state_top >=
- master_th->th.th_task_state_stack_sz) { // increase size
- kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
- kmp_uint8 *old_stack, *new_stack;
- kmp_uint32 i;
- new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
- for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
- new_stack[i] = master_th->th.th_task_state_memo_stack[i];
- }
- for (i = master_th->th.th_task_state_stack_sz; i < new_size;
- ++i) { // zero-init rest of stack
- new_stack[i] = 0;
- }
- old_stack = master_th->th.th_task_state_memo_stack;
- master_th->th.th_task_state_memo_stack = new_stack;
- master_th->th.th_task_state_stack_sz = new_size;
- __kmp_free(old_stack);
- }
- // Store primary thread's task_state on stack
- master_th->th
- .th_task_state_memo_stack[master_th->th.th_task_state_top] =
- master_th->th.th_task_state;
- master_th->th.th_task_state_top++;
-#if KMP_NESTED_HOT_TEAMS
- if (master_th->th.th_hot_teams &&
- active_level < __kmp_hot_teams_max_level &&
- team == master_th->th.th_hot_teams[active_level].hot_team) {
- // Restore primary thread's nested state if nested hot team
- master_th->th.th_task_state =
- master_th->th
- .th_task_state_memo_stack[master_th->th.th_task_state_top];
- } else {
-#endif
- master_th->th.th_task_state = 0;
-#if KMP_NESTED_HOT_TEAMS
- }
-#endif
- }
-#if !KMP_NESTED_HOT_TEAMS
- KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
- (team == root->r.r_hot_team));
-#endif
- }
-
KA_TRACE(
20,
("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
@@ -2371,8 +2475,7 @@ void __kmp_join_call(ident_t *loc, int gtid
__kmp_gtid_from_thread(master_th), team,
team->t.t_task_team[master_th->th.th_task_state],
master_th->th.th_task_team));
- KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
- team->t.t_task_team[master_th->th.th_task_state]);
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
}
#endif
@@ -2396,6 +2499,9 @@ void __kmp_join_call(ident_t *loc, int gtid
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
+ if (fork_context == fork_context_gnu) {
+ __ompt_lw_taskteam_unlink(master_th);
+ }
__kmp_join_restore_state(master_th, parent_team);
}
#endif
@@ -2430,12 +2536,6 @@ void __kmp_join_call(ident_t *loc, int gtid
parent_team->t.t_stack_id = NULL;
}
#endif
-
- if (team->t.t_nproc > 1 &&
- __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
- team->t.b->update_num_threads(team->t.t_nproc);
- __kmp_add_threads_to_team(team, team->t.t_nproc);
- }
}
KMP_MB();
@@ -2613,18 +2713,11 @@ void __kmp_join_call(ident_t *loc, int gtid
}
if (__kmp_tasking_mode != tskm_immediate_exec) {
- if (master_th->th.th_task_state_top >
- 0) { // Restore task state from memo stack
- KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
- // Remember primary thread's state if we re-use this nested hot team
- master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
- master_th->th.th_task_state;
- --master_th->th.th_task_state_top; // pop
- // Now restore state at this level
- master_th->th.th_task_state =
- master_th->th
- .th_task_state_memo_stack[master_th->th.th_task_state_top];
- }
+ // Restore primary thread's task state from team structure
+ KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
+ team->t.t_primary_task_state == 1);
+ master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
+
// Copy the task team from the parent team to the primary thread
master_th->th.th_task_team =
parent_team->t.t_task_team[master_th->th.th_task_state];
@@ -2642,7 +2735,7 @@ void __kmp_join_call(ident_t *loc, int gtid
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
#if KMP_AFFINITY_SUPPORTED
- if (master_th->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
+ if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
__kmp_reset_root_init_mask(gtid);
}
#endif
@@ -3223,6 +3316,8 @@ static kmp_internal_control_t __kmp_get_global_icvs(void) {
// next parallel region (per thread)
// (use a max ub on value if __kmp_parallel_initialize not called yet)
__kmp_cg_max_nth, // int thread_limit;
+ __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
+ // on task. This is used in the case of target thread_limit
__kmp_dflt_max_active_levels, // int max_active_levels; //internal control
// for max_active_levels
r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
@@ -3299,6 +3394,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
root_team->t.t_serialized = 1;
// TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
root_team->t.t_sched.sched = r_sched.sched;
+ root_team->t.t_nested_nth = &__kmp_nested_nth;
KA_TRACE(
20,
("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
@@ -3336,6 +3432,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
// TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
hot_team->t.t_sched.sched = r_sched.sched;
hot_team->t.t_size_changed = 0;
+ hot_team->t.t_nested_nth = &__kmp_nested_nth;
}
#ifdef KMP_DEBUG
@@ -3934,7 +4031,7 @@ int __kmp_register_root(int initial_thread) {
__kmp_root_counter++;
#if OMPT_SUPPORT
- if (!initial_thread && ompt_enabled.enabled) {
+ if (ompt_enabled.enabled) {
kmp_info_t *root_thread = ompt_get_thread();
@@ -4202,6 +4299,7 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
else // no tasking --> always safe to reap
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
this_thr->th.th_set_proc_bind = proc_bind_default;
+
#if KMP_AFFINITY_SUPPORTED
this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
@@ -4311,17 +4409,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
this_thr->th.th_next_pool = NULL;
- if (!this_thr->th.th_task_state_memo_stack) {
- size_t i;
- this_thr->th.th_task_state_memo_stack =
- (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
- this_thr->th.th_task_state_top = 0;
- this_thr->th.th_task_state_stack_sz = 4;
- for (i = 0; i < this_thr->th.th_task_state_stack_sz;
- ++i) // zero init the stack
- this_thr->th.th_task_state_memo_stack[i] = 0;
- }
-
KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
@@ -4346,8 +4433,10 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
#endif
KMP_MB();
- /* first, try to get one from the thread pool */
- if (__kmp_thread_pool) {
+ /* first, try to get one from the thread pool unless allocating thread is
+ * the main hidden helper thread. The hidden helper team should always
+ * allocate new OS threads. */
+ if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
__kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
if (new_thr == __kmp_thread_pool_insert_pt) {
@@ -4376,8 +4465,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
TCW_4(__kmp_nth, __kmp_nth + 1);
new_thr->th.th_task_state = 0;
- new_thr->th.th_task_state_top = 0;
- new_thr->th.th_task_state_stack_sz = 4;
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
// Make sure pool thread has transitioned to waiting on own thread struct
@@ -4412,7 +4499,7 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
}
/* no, well fork a new one */
- KMP_ASSERT(__kmp_nth == __kmp_all_nth);
+ KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
#if KMP_USE_MONITOR
@@ -4465,6 +4552,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
/* allocate space for it. */
new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
+ new_thr->th.th_nt_strict = false;
+ new_thr->th.th_nt_loc = NULL;
+ new_thr->th.th_nt_sev = severity_fatal;
+ new_thr->th.th_nt_msg = NULL;
+
TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
@@ -4575,6 +4667,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
new_thr->th.th_active_in_pool = FALSE;
TCW_4(new_thr->th.th_active, TRUE);
+ new_thr->th.th_set_nested_nth = NULL;
+ new_thr->th.th_set_nested_nth_sz = 0;
+
/* adjust the global counters */
__kmp_all_nth++;
__kmp_nth++;
@@ -4603,6 +4698,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
}
#endif /* KMP_ADJUST_BLOCKTIME */
+#if KMP_AFFINITY_SUPPORTED
+ // Set the affinity and topology information for new thread
+ __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
+#endif
+
/* actually fork it and create the new worker thread */
KF_TRACE(
10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
@@ -4695,26 +4795,20 @@ static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
}
-#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
-/* Sets full mask for thread and returns old mask, no changes to structures. */
-static void
-__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
- if (KMP_AFFINITY_CAPABLE()) {
- int status;
- if (old_mask != NULL) {
- status = __kmp_get_system_affinity(old_mask, TRUE);
- int error = errno;
- if (status != 0) {
- __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
- __kmp_msg_null);
- }
- }
- __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
+#if KMP_AFFINITY_SUPPORTED
+static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
+ int first, int last, int newp) {
+ th->th.th_first_place = first;
+ th->th.th_last_place = last;
+ th->th.th_new_place = newp;
+ if (newp != th->th.th_current_place) {
+ if (__kmp_display_affinity && team->t.t_display_affinity != 1)
+ team->t.t_display_affinity = 1;
+ // Copy topology information associated with the new place
+ th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
+ th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
}
}
-#endif
-
-#if KMP_AFFINITY_SUPPORTED
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + primary thread's partition based upon the parent
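
Editor's note: the __kmp_set_thread_place() helper introduced above folds the repeated first/last/new-place assignments that __kmp_partition_places() used to duplicate, and also flips the team's display-affinity flag and copies topology ids. A standalone sketch of the same consolidation, with simplified fields and hypothetical names:

#include <cassert>

struct thread_t { int first_place, last_place, new_place, current_place; };
struct team_t { int display_affinity_changed; };

// Sketch: one helper instead of several copies of the same field
// assignments; the display flag only flips when the thread actually moves.
static void set_thread_place(team_t *team, thread_t *th, int first, int last,
                             int newp) {
  th->first_place = first;
  th->last_place = last;
  th->new_place = newp;
  if (newp != th->current_place)
    team->display_affinity_changed = 1;
}

int main() {
  team_t team = {0};
  thread_t th = {0, 0, 0, /*current_place=*/3};
  set_thread_place(&team, &th, 0, 7, 3); // same place: flag stays clear
  assert(team.display_affinity_changed == 0);
  set_thread_place(&team, &th, 0, 7, 5); // moved: flag set
  assert(team.display_affinity_changed == 1);
  return 0;
}
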
@@ -4731,6 +4825,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
int first_place = master_th->th.th_first_place;
int last_place = master_th->th.th_last_place;
int masters_place = master_th->th.th_current_place;
+ int num_masks = __kmp_affinity.num_masks;
team->t.t_first_place = first_place;
team->t.t_last_place = last_place;
@@ -4753,13 +4848,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
for (f = 1; f < n_th; f++) {
kmp_info_t *th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th != NULL);
- th->th.th_first_place = first_place;
- th->th.th_last_place = last_place;
- th->th.th_new_place = masters_place;
- if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
"partition = [%d,%d]\n",
@@ -4775,7 +4864,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
if (first_place <= last_place) {
n_places = last_place - first_place + 1;
} else {
- n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
+ n_places = num_masks - first_place + last_place + 1;
}
if (n_th <= n_places) {
int place = masters_place;
@@ -4785,18 +4874,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
if (place == last_place) {
place = first_place;
- } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+ } else if (place == (num_masks - 1)) {
place = 0;
} else {
place++;
}
- th->th.th_first_place = first_place;
- th->th.th_last_place = last_place;
- th->th.th_new_place = place;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, first_place, last_place, place);
KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
"partition = [%d,%d]\n",
@@ -4815,13 +4898,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
kmp_info_t *th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th != NULL);
- th->th.th_first_place = first_place;
- th->th.th_last_place = last_place;
- th->th.th_new_place = place;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, first_place, last_place, place);
s_count++;
if ((s_count == S) && rem && (gap_ct == gap)) {
@@ -4830,7 +4907,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
// we added an extra thread to this place; move to next place
if (place == last_place) {
place = first_place;
- } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+ } else if (place == (num_masks - 1)) {
place = 0;
} else {
place++;
@@ -4841,7 +4918,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
} else if (s_count == S) { // place full; don't add extra
if (place == last_place) {
place = first_place;
- } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+ } else if (place == (num_masks - 1)) {
place = 0;
} else {
place++;
@@ -4868,12 +4945,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
if (first_place <= last_place) {
n_places = last_place - first_place + 1;
} else {
- n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
+ n_places = num_masks - first_place + last_place + 1;
}
if (n_th <= n_places) {
int place = -1;
- if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
+ if (n_places != num_masks) {
int S = n_places / n_th;
int s_count, rem, gap, gap_ct;
@@ -4888,17 +4965,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
kmp_info_t *th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th != NULL);
- th->th.th_first_place = place;
- th->th.th_new_place = place;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ int fplace = place, nplace = place;
s_count = 1;
while (s_count < S) {
if (place == last_place) {
place = first_place;
- } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+ } else if (place == (num_masks - 1)) {
place = 0;
} else {
place++;
@@ -4908,7 +4980,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
if (rem && (gap_ct == gap)) {
if (place == last_place) {
place = first_place;
- } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+ } else if (place == (num_masks - 1)) {
place = 0;
} else {
place++;
@@ -4916,12 +4988,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
rem--;
gap_ct = 0;
}
- th->th.th_last_place = place;
+ __kmp_set_thread_place(team, th, fplace, place, nplace);
gap_ct++;
if (place == last_place) {
place = first_place;
- } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+ } else if (place == (num_masks - 1)) {
place = 0;
} else {
place++;
@@ -4929,10 +5001,10 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
KA_TRACE(100,
("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
- "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
+ "partition = [%d,%d], num_masks: %u\n",
__kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
f, th->th.th_new_place, th->th.th_first_place,
- th->th.th_last_place, __kmp_affinity_num_masks));
+ th->th.th_last_place, num_masks));
}
} else {
/* Having uniform space of available computation places I can create
@@ -4982,13 +5054,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
KMP_DEBUG_ASSERT(last_place >= first_place);
th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th);
- th->th.th_first_place = first;
- th->th.th_new_place = place;
- th->th.th_last_place = last;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, first, last, place);
KA_TRACE(100,
("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
"partition = [%d,%d], spacing = %.4f\n",
@@ -5014,13 +5080,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
kmp_info_t *th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th != NULL);
- th->th.th_first_place = place;
- th->th.th_last_place = place;
- th->th.th_new_place = place;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, place, place, place);
s_count++;
if ((s_count == S) && rem && (gap_ct == gap)) {
@@ -5029,7 +5089,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
// we added an extra thread to this place; move on to next place
if (place == last_place) {
place = first_place;
- } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+ } else if (place == (num_masks - 1)) {
place = 0;
} else {
place++;
@@ -5040,7 +5100,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
} else if (s_count == S) { // place is full; don't add extra thread
if (place == last_place) {
place = first_place;
- } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+ } else if (place == (num_masks - 1)) {
place = 0;
} else {
place++;
@@ -5210,6 +5270,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
// Activate team threads via th_used_in_team
__kmp_add_threads_to_team(team, new_nproc);
}
+ // When decreasing team size, threads no longer in the team should
+ // unref task team.
+ if (__kmp_tasking_mode != tskm_immediate_exec) {
+ for (f = new_nproc; f < team->t.t_nproc; f++) {
+ kmp_info_t *th = team->t.t_threads[f];
+ KMP_DEBUG_ASSERT(th);
+ th->th.th_task_team = NULL;
+ }
+ }
#if KMP_NESTED_HOT_TEAMS
if (__kmp_hot_teams_mode == 0) {
// AC: saved number of threads should correspond to team's value in this
@@ -5220,11 +5289,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
/* release the extra threads we don't need any more */
for (f = new_nproc; f < team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- // When decreasing team size, threads no longer in the team should
- // unref task team.
- team->t.t_threads[f]->th.th_task_team = NULL;
- }
__kmp_free_thread(team->t.t_threads[f]);
team->t.t_threads[f] = NULL;
}
@@ -5278,12 +5342,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
#endif
}
} else { // team->t.t_nproc < new_nproc
-#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
- kmp_affin_mask_t *old_mask;
- if (KMP_AFFINITY_CAPABLE()) {
- KMP_CPU_ALLOC(old_mask);
- }
-#endif
KA_TRACE(20,
("__kmp_allocate_team: increasing hot team thread count to %d\n",
@@ -5326,13 +5384,14 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
__kmp_reinitialize_team(team, new_icvs, NULL);
}
-#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
+#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
+ KMP_AFFINITY_SUPPORTED
/* Temporarily set full mask for primary thread before creation of
workers. The reason is that workers inherit the affinity from the
primary thread, so if a lot of workers are created on the single
core quickly, they don't get a chance to set their own affinity for
a long time. */
- __kmp_set_thread_affinity_mask_full_tmp(old_mask);
+ kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
#endif
/* allocate new threads for the hot team */
@@ -5362,12 +5421,10 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
}
-#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
- if (KMP_AFFINITY_CAPABLE()) {
- /* Restore initial primary thread's affinity mask */
- __kmp_set_system_affinity(old_mask, TRUE);
- KMP_CPU_FREE(old_mask);
- }
+#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
+ KMP_AFFINITY_SUPPORTED
+ /* Restore initial primary thread's affinity mask */
+ new_temp_affinity.restore();
#endif
#if KMP_NESTED_HOT_TEAMS
} // end of check of t_nproc vs. new_nproc vs. hot_team_nth
@@ -5388,21 +5445,10 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
__kmp_initialize_info(team->t.t_threads[f], team, f,
__kmp_gtid_from_tid(f, team));
- if (level) { // set th_task_state for new threads in nested hot team
- // __kmp_initialize_info() no longer zeroes th_task_state, so we should
- // only need to set the th_task_state for the new threads. th_task_state
- // for primary thread will not be accurate until after this in
- // __kmp_fork_call(), so we look to the primary thread's memo_stack to
- // get the correct value.
- for (f = old_nproc; f < team->t.t_nproc; ++f)
- team->t.t_threads[f]->th.th_task_state =
- team->t.t_threads[0]->th.th_task_state_memo_stack[level];
- } else { // set th_task_state for new threads in non-nested hot team
- // copy primary thread's state
- kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
- for (f = old_nproc; f < team->t.t_nproc; ++f)
- team->t.t_threads[f]->th.th_task_state = old_state;
- }
+ // set th_task_state for new threads in hot team with older thread's state
+ kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
+ for (f = old_nproc; f < team->t.t_nproc; ++f)
+ team->t.t_threads[f]->th.th_task_state = old_state;
#ifdef KMP_DEBUG
for (f = 0; f < team->t.t_nproc; ++f) {
@@ -5420,7 +5466,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
} // Check changes in number of threads
- kmp_info_t *master = team->t.t_threads[0];
if (master->th.th_teams_microtask) {
for (f = 1; f < new_nproc; ++f) {
// propagate teams construct specific info to workers
@@ -5526,6 +5571,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
__ompt_team_assign_id(team, ompt_parallel_data);
#endif
+ team->t.t_nested_nth = NULL;
+
KMP_MB();
return team;
@@ -5597,6 +5644,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
KMP_MB();
+ team->t.t_nested_nth = NULL;
+
KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
team->t.t_id));
@@ -5672,9 +5721,8 @@ void __kmp_free_team(kmp_root_t *root,
}
#endif
// first check if thread is sleeping
- kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
- if (fl.is_sleeping())
- fl.resume(__kmp_gtid_from_thread(th));
+ if (th->th.th_sleep_loc)
+ __kmp_null_resume_wrapper(th);
KMP_CPU_PAUSE();
}
}
@@ -5700,6 +5748,14 @@ void __kmp_free_team(kmp_root_t *root,
}
}
+ // Before clearing parent pointer, check if nested_nth list should be freed
+ if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
+ team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
+ KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
+ KMP_INTERNAL_FREE(team->t.t_nested_nth);
+ }
+ team->t.t_nested_nth = NULL;
+
// Reset pointer to parent team only for non-hot teams.
team->t.t_parent = NULL;
team->t.t_level = 0;
@@ -5709,8 +5765,8 @@ void __kmp_free_team(kmp_root_t *root,
for (f = 1; f < team->t.t_nproc; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
- KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
- 1, 2);
+ (void)KMP_COMPARE_AND_STORE_ACQ32(
+ &(team->t.t_threads[f]->th.th_used_in_team), 1, 2);
}
__kmp_free_thread(team->t.t_threads[f]);
}
@@ -6047,7 +6103,6 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) {
__kmp_join_barrier(gtid);
}
}
- TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
#if OMPD_SUPPORT
if (ompd_state & OMPD_ENABLE_BP)
@@ -6216,11 +6271,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
thread->th.th_pri_common = NULL;
}
- if (thread->th.th_task_state_memo_stack != NULL) {
- __kmp_free(thread->th.th_task_state_memo_stack);
- thread->th.th_task_state_memo_stack = NULL;
- }
-
#if KMP_USE_BGET
if (thread->th.th_local.bget_data != NULL) {
__kmp_finalize_bget(thread);
@@ -6683,6 +6733,13 @@ static inline char *__kmp_reg_status_name() {
#endif
} // __kmp_reg_status_get
+#if defined(KMP_USE_SHM)
+bool __kmp_shm_available = false;
+bool __kmp_tmp_available = false;
+// If /dev/shm is not accessible, we will create a temporary file under /tmp.
+char *temp_reg_status_file_name = nullptr;
+#endif
+
void __kmp_register_library_startup(void) {
char *name = __kmp_reg_status_name(); // Name of the environment variable.
@@ -6708,52 +6765,108 @@ void __kmp_register_library_startup(void) {
char *value = NULL; // Actual value of the environment variable.
#if defined(KMP_USE_SHM)
- char *shm_name = __kmp_str_format("/%s", name);
- int shm_preexist = 0;
- char *data1;
- int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
- if ((fd1 == -1) && (errno == EEXIST)) {
- // file didn't open because it already exists.
- // try opening existing file
- fd1 = shm_open(shm_name, O_RDWR, 0666);
- if (fd1 == -1) { // file didn't open
- // error out here
- __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
- __kmp_msg_null);
- } else {
- // able to open existing file
- shm_preexist = 1;
+ char *shm_name = nullptr;
+ char *data1 = nullptr;
+ __kmp_shm_available = __kmp_detect_shm();
+ if (__kmp_shm_available) {
+ int fd1 = -1;
+ shm_name = __kmp_str_format("/%s", name);
+ int shm_preexist = 0;
+ fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
+ if ((fd1 == -1) && (errno == EEXIST)) {
+ // file didn't open because it already exists.
+ // try opening existing file
+ fd1 = shm_open(shm_name, O_RDWR, 0600);
+ if (fd1 == -1) { // file didn't open
+ KMP_WARNING(FunctionError, "Can't open SHM");
+ __kmp_shm_available = false;
+ } else { // able to open existing file
+ shm_preexist = 1;
+ }
}
- } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
- // already exists.
- // error out here.
- __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
- __kmp_msg_null);
- }
- if (shm_preexist == 0) {
- // we created SHM now set size
- if (ftruncate(fd1, SHM_SIZE) == -1) {
- // error occured setting size;
- __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
- KMP_ERR(errno), __kmp_msg_null);
+ if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
+      if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size;
+ KMP_WARNING(FunctionError, "Can't set size of SHM");
+ __kmp_shm_available = false;
+ }
}
+ if (__kmp_shm_available) { // SHM exists, now map it
+ data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd1, 0);
+ if (data1 == MAP_FAILED) { // failed to map shared memory
+ KMP_WARNING(FunctionError, "Can't map SHM");
+ __kmp_shm_available = false;
+ }
+ }
+ if (__kmp_shm_available) { // SHM mapped
+ if (shm_preexist == 0) { // set data to SHM, set value
+ KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
+ }
+ // Read value from either what we just wrote or existing file.
+ value = __kmp_str_format("%s", data1); // read value from SHM
+ munmap(data1, SHM_SIZE);
+ }
+ if (fd1 != -1)
+ close(fd1);
+ }
+ if (!__kmp_shm_available)
+ __kmp_tmp_available = __kmp_detect_tmp();
+ if (!__kmp_shm_available && __kmp_tmp_available) {
+ // SHM failed to work due to an error other than that the file already
+ // exists. Try to create a temp file under /tmp.
+ // If /tmp isn't accessible, fall back to using environment variable.
+ // TODO: /tmp might not always be the temporary directory. For now we will
+ // not consider TMPDIR.
+ int fd1 = -1;
+ temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
+ int tmp_preexist = 0;
+ fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
+ if ((fd1 == -1) && (errno == EEXIST)) {
+ // file didn't open because it already exists.
+ // try opening existing file
+ fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
+      if (fd1 == -1) { // file didn't open
+ KMP_WARNING(FunctionError, "Can't open TEMP");
+ __kmp_tmp_available = false;
+ } else {
+ tmp_preexist = 1;
+ }
+ }
+ if (__kmp_tmp_available && tmp_preexist == 0) {
+ // we created /tmp file now set size
+      if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size;
+ KMP_WARNING(FunctionError, "Can't set size of /tmp file");
+ __kmp_tmp_available = false;
+ }
+ }
+ if (__kmp_tmp_available) {
+ data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd1, 0);
+ if (data1 == MAP_FAILED) { // failed to map /tmp
+ KMP_WARNING(FunctionError, "Can't map /tmp");
+ __kmp_tmp_available = false;
+ }
+ }
+ if (__kmp_tmp_available) {
+ if (tmp_preexist == 0) { // set data to TMP, set value
+ KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
+ }
+ // Read value from either what we just wrote or existing file.
+      value = __kmp_str_format("%s", data1); // read value from /tmp file
+ munmap(data1, SHM_SIZE);
+ }
+ if (fd1 != -1)
+ close(fd1);
}
- data1 =
- (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
- if (data1 == MAP_FAILED) {
- // failed to map shared memory
- __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
- __kmp_msg_null);
- }
- if (shm_preexist == 0) { // set data to SHM, set value
- KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
+ if (!__kmp_shm_available && !__kmp_tmp_available) {
+ // no /dev/shm and no /tmp -- fall back to environment variable
+ // Set environment variable, but do not overwrite if it exists.
+ __kmp_env_set(name, __kmp_registration_str, 0);
+ // read value to see if it got set
+ value = __kmp_env_get(name);
}
- // Read value from either what we just wrote or existing file.
- value = __kmp_str_format("%s", data1); // read value from SHM
- munmap(data1, SHM_SIZE);
- close(fd1);
#else // Windows and unix with static library
- // Set environment variable, but do not overwrite if it is exist.
+ // Set environment variable, but do not overwrite if it exists.
__kmp_env_set(name, __kmp_registration_str, 0);
// read value to see if it got set
value = __kmp_env_get(name);
@@ -6813,8 +6926,14 @@ void __kmp_register_library_startup(void) {
case 2: { // Neighbor is dead.
#if defined(KMP_USE_SHM)
- // close shared memory.
- shm_unlink(shm_name); // this removes file in /dev/shm
+ if (__kmp_shm_available) { // close shared memory.
+ shm_unlink(shm_name); // this removes file in /dev/shm
+ } else if (__kmp_tmp_available) {
+ unlink(temp_reg_status_file_name); // this removes the temp file
+ } else {
+ // Clear the variable and try to register library again.
+ __kmp_env_unset(name);
+ }
#else
// Clear the variable and try to register library again.
__kmp_env_unset(name);
@@ -6827,7 +6946,8 @@ void __kmp_register_library_startup(void) {
}
KMP_INTERNAL_FREE((void *)value);
#if defined(KMP_USE_SHM)
- KMP_INTERNAL_FREE((void *)shm_name);
+ if (shm_name)
+ KMP_INTERNAL_FREE((void *)shm_name);
#endif
} // while
KMP_INTERNAL_FREE((void *)name);
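
Editor's note: registration now degrades gracefully -- /dev/shm first, then a file under /tmp, then an environment variable -- warning instead of aborting on each failure. A compressed sketch of that fallback chain using ordinary POSIX calls is below; error handling is trimmed and all names are illustrative (link with -lrt on older glibc).

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

// Sketch: publish a small registration string so other copies of a runtime
// in the same process tree can detect each other.
static bool publish(const char *path, const char *value, size_t size,
                    bool use_shm) {
  int fd = use_shm ? shm_open(path, O_CREAT | O_RDWR, 0600)
                   : open(path, O_CREAT | O_RDWR, 0600);
  if (fd == -1)
    return false;
  if (ftruncate(fd, (off_t)size) == -1) {
    close(fd);
    return false;
  }
  char *data = (char *)mmap(nullptr, size, PROT_READ | PROT_WRITE,
                            MAP_SHARED, fd, 0);
  if (data == MAP_FAILED) {
    close(fd);
    return false;
  }
  snprintf(data, size, "%s", value); // write the registration string
  munmap(data, size);
  close(fd);
  return true;
}

int main() {
  const char *value = "pid-12345";
  if (!publish("/libomp_demo_reg", value, 1024, /*use_shm=*/true) &&
      !publish("/tmp/libomp_demo_reg", value, 1024, /*use_shm=*/false))
    setenv("LIBOMP_DEMO_REG", value, /*overwrite=*/0); // last resort
  return 0;
}
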
@@ -6840,18 +6960,32 @@ void __kmp_unregister_library(void) {
char *value = NULL;
#if defined(KMP_USE_SHM)
- char *shm_name = __kmp_str_format("/%s", name);
- int fd1 = shm_open(shm_name, O_RDONLY, 0666);
- if (fd1 == -1) {
- // file did not open. return.
- return;
- }
- char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
- if (data1 != MAP_FAILED) {
- value = __kmp_str_format("%s", data1); // read value from SHM
- munmap(data1, SHM_SIZE);
+ char *shm_name = nullptr;
+ int fd1;
+ if (__kmp_shm_available) {
+ shm_name = __kmp_str_format("/%s", name);
+ fd1 = shm_open(shm_name, O_RDONLY, 0600);
+ if (fd1 != -1) { // File opened successfully
+ char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
+ if (data1 != MAP_FAILED) {
+ value = __kmp_str_format("%s", data1); // read value from SHM
+ munmap(data1, SHM_SIZE);
+ }
+ close(fd1);
+ }
+ } else if (__kmp_tmp_available) { // try /tmp
+ fd1 = open(temp_reg_status_file_name, O_RDONLY);
+ if (fd1 != -1) { // File opened successfully
+ char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
+ if (data1 != MAP_FAILED) {
+ value = __kmp_str_format("%s", data1); // read value from /tmp
+ munmap(data1, SHM_SIZE);
+ }
+ close(fd1);
+ }
+  } else { // fall back to environment variable
+ value = __kmp_env_get(name);
}
- close(fd1);
#else
value = __kmp_env_get(name);
#endif
@@ -6861,14 +6995,23 @@ void __kmp_unregister_library(void) {
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
// Ok, this is our variable. Delete it.
#if defined(KMP_USE_SHM)
- shm_unlink(shm_name); // this removes file in /dev/shm
+ if (__kmp_shm_available) {
+ shm_unlink(shm_name); // this removes file in /dev/shm
+ } else if (__kmp_tmp_available) {
+ unlink(temp_reg_status_file_name); // this removes the temp file
+ } else {
+ __kmp_env_unset(name);
+ }
#else
__kmp_env_unset(name);
#endif
}
#if defined(KMP_USE_SHM)
- KMP_INTERNAL_FREE(shm_name);
+ if (shm_name)
+ KMP_INTERNAL_FREE(shm_name);
+ if (temp_reg_status_file_name)
+ KMP_INTERNAL_FREE(temp_reg_status_file_name);
#endif
KMP_INTERNAL_FREE(__kmp_registration_str);
@@ -6967,6 +7110,11 @@ static void __kmp_do_serial_initialize(void) {
__kmp_validate_locks();
+#if ENABLE_LIBOMPTARGET
+ /* Initialize functions from libomptarget */
+ __kmp_init_omptarget();
+#endif
+
/* Initialize internal memory allocator */
__kmp_init_allocator();
@@ -7192,10 +7340,12 @@ static void __kmp_do_serial_initialize(void) {
__kmp_register_atfork();
#endif
-#if !KMP_DYNAMIC_LIB
+#if !KMP_DYNAMIC_LIB || \
+ ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
{
/* Invoke the exit handler when the program finishes, only for static
- library. For dynamic library, we already have _fini and DllMain. */
+ library and macOS* dynamic. For other dynamic libraries, we already
+ have _fini and DllMain. */
int rc = atexit(__kmp_internal_end_atexit);
if (rc != 0) {
__kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
@@ -7222,6 +7372,10 @@ static void __kmp_do_serial_initialize(void) {
__kmp_init_serial = TRUE;
+ if (__kmp_version) {
+ __kmp_print_version_1();
+ }
+
if (__kmp_settings) {
__kmp_env_print();
}
@@ -7275,7 +7429,7 @@ static void __kmp_do_middle_initialize(void) {
#if KMP_AFFINITY_SUPPORTED
// __kmp_affinity_initialize() will try to set __kmp_ncores to the
// number of cores on the machine.
- __kmp_affinity_initialize();
+ __kmp_affinity_initialize(__kmp_affinity);
#endif /* KMP_AFFINITY_SUPPORTED */
@@ -7461,6 +7615,14 @@ void __kmp_hidden_helper_initialize() {
return;
}
+#if KMP_AFFINITY_SUPPORTED
+ // Initialize hidden helper affinity settings.
+ // The above __kmp_parallel_initialize() will initialize
+ // regular affinity (and topology) if not already done.
+ if (!__kmp_hh_affinity.flags.initialized)
+ __kmp_affinity_initialize(__kmp_hh_affinity);
+#endif
+
// Set the count of hidden helper tasks to be executed to zero
KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
@@ -7583,7 +7745,7 @@ int __kmp_invoke_task_func(int gtid) {
);
#if OMPT_SUPPORT
*exit_frame_p = NULL;
- this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
+ this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
#endif
#if KMP_STATS_ENABLED
@@ -7681,7 +7843,7 @@ int __kmp_invoke_teams_master(int gtid) {
#endif
__kmp_teams_master(gtid);
#if OMPT_SUPPORT
- this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
+ this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
#endif
__kmp_run_after_invoked_task(gtid, 0, this_thr, team);
return 1;
@@ -7691,7 +7853,6 @@ int __kmp_invoke_teams_master(int gtid) {
encountered by this team. since this should be enclosed in the forkjoin
critical section it should avoid race conditions with asymmetrical nested
parallelism */
-
void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
kmp_info_t *thr = __kmp_threads[gtid];
@@ -7699,6 +7860,39 @@ void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
thr->th.th_set_nproc = num_threads;
}
+void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
+ int *num_threads_list) {
+ kmp_info_t *thr = __kmp_threads[gtid];
+
+ KMP_DEBUG_ASSERT(list_length > 1);
+
+ if (num_threads_list[0] > 0)
+ thr->th.th_set_nproc = num_threads_list[0];
+ thr->th.th_set_nested_nth =
+ (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
+ for (kmp_uint32 i = 0; i < list_length; ++i)
+ thr->th.th_set_nested_nth[i] = num_threads_list[i];
+ thr->th.th_set_nested_nth_sz = list_length;
+}
+
+void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
+ const char *msg) {
+ kmp_info_t *thr = __kmp_threads[gtid];
+ thr->th.th_nt_strict = true;
+ thr->th.th_nt_loc = loc;
+ // if sev is unset make fatal
+ if (sev == severity_warning)
+ thr->th.th_nt_sev = sev;
+ else
+ thr->th.th_nt_sev = severity_fatal;
+ // if msg is unset, use an appropriate message
+ if (msg)
+ thr->th.th_nt_msg = msg;
+ else
+ thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
+ "strict num_threads clause.";
+}
+
static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
int num_threads) {
KMP_DEBUG_ASSERT(thr);
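
Editor's note: the two new entry points above record a num_threads list and the optional strict-modifier diagnostics on the encountering thread, for the fork path to consult later. A sketch of just that bookkeeping, with plain structs and hypothetical names:

#include <cassert>
#include <vector>

// Sketch: stash a num_threads(list) clause and a "strict" request on the
// encountering thread.
struct thread_state {
  int set_nproc = 0;            // threads for the immediate parallel
  std::vector<int> nested_nth;  // the full per-level list
  bool nt_strict = false;       // fail instead of shrinking the team
  const char *nt_msg = nullptr; // message to report on violation
};

static void push_num_threads_list(thread_state &th, const int *list, int n) {
  assert(n > 1);
  if (list[0] > 0)
    th.set_nproc = list[0];
  th.nested_nth.assign(list, list + n);
}

static void set_strict_num_threads(thread_state &th, const char *msg) {
  th.nt_strict = true;
  th.nt_msg = msg ? msg : "Cannot form team with the requested num_threads.";
}

int main() {
  thread_state th;
  const int list[] = {4, 2, 1}; // e.g. num_threads(4, 2, 1)
  push_num_threads_list(th, list, 3);
  set_strict_num_threads(th, nullptr);
  assert(th.set_nproc == 4 && th.nested_nth.size() == 3 && th.nt_strict);
  return 0;
}
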
@@ -7932,8 +8126,10 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
__kmp_join_barrier(gtid); /* wait for everyone */
#if OMPT_SUPPORT
+ ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
if (ompt_enabled.enabled &&
- this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
+ (ompt_state == ompt_state_wait_barrier_teams ||
+ ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
int ds_tid = this_thr->th.th_info.ds.ds_tid;
ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
@@ -7944,15 +8140,16 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
+ ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
+ if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
+ sync_kind = ompt_sync_region_barrier_teams;
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
#endif
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
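
Editor's note: the join path above now reports either a teams barrier or an implicit-parallel barrier to an OMPT tool, chosen from the thread's parallel flags. A tiny sketch of that selection; the enum values here are illustrative, not the OMPT ABI.

#include <cassert>

// Sketch: classify the barrier kind from a flags word that records whether
// the thread is running as part of a league (teams) or an ordinary team.
enum sync_kind { barrier_implicit_parallel, barrier_teams };
enum parallel_flag { flag_team = 0x1, flag_league = 0x2 };

static sync_kind classify_barrier(unsigned parallel_flags) {
  return (parallel_flags & flag_league) ? barrier_teams
                                        : barrier_implicit_parallel;
}

int main() {
  assert(classify_barrier(flag_team) == barrier_implicit_parallel);
  assert(classify_barrier(flag_league) == barrier_teams);
  return 0;
}
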
@@ -8155,6 +8352,7 @@ void __kmp_cleanup(void) {
__kmp_nested_nth.nth = NULL;
__kmp_nested_nth.size = 0;
__kmp_nested_nth.used = 0;
+
KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
__kmp_nested_proc_bind.bind_types = NULL;
__kmp_nested_proc_bind.size = 0;
@@ -8652,9 +8850,8 @@ void __kmp_aux_display_affinity(int gtid, const char *format) {
}
/* ------------------------------------------------------------------------ */
-
void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
- int blocktime = arg; /* argument is in milliseconds */
+ int blocktime = arg; /* argument is in microseconds */
#if KMP_USE_MONITOR
int bt_intervals;
#endif
@@ -8730,7 +8927,6 @@ __kmp_determine_reduction_method(
int team_size;
- KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
@@ -8751,10 +8947,12 @@ __kmp_determine_reduction_method(
int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
- KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
+ KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
+ KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
int teamsize_cutoff = 4;
@@ -8778,11 +8976,15 @@ __kmp_determine_reduction_method(
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
- // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
+ // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
+ // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
-#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
+#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
+ KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
-#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
+#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
+ KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
+ KMP_OS_WASI || KMP_OS_AIX
// basic tuning
@@ -8930,7 +9132,8 @@ int __kmp_pause_resource(kmp_pause_status_t level) {
__kmp_soft_pause();
return 0;
}
- } else if (level == kmp_hard_paused) { // requesting hard pause
+ } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) {
+ // requesting hard pause or stop_tool pause
if (__kmp_pause_status != kmp_not_paused) {
// error message about already being paused
return 1;
@@ -9018,8 +9221,8 @@ void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
// to wake it up.
for (int f = 1; f < new_nthreads; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
- KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
- 3);
+ (void)KMP_COMPARE_AND_STORE_ACQ32(
+ &(team->t.t_threads[f]->th.th_used_in_team), 0, 3);
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
__kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
(kmp_flag_32<false, false> *)NULL);
@@ -9181,3 +9384,20 @@ void __kmp_set_nesting_mode_threads() {
if (__kmp_nesting_mode == 1) // turn on nesting for this case only
set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}
+
+// Empty symbols to export (see exports_so.txt) when feature is disabled
+extern "C" {
+#if !KMP_STATS_ENABLED
+void __kmp_reset_stats() {}
+#endif
+#if !USE_DEBUGGER
+int __kmp_omp_debug_struct_info = FALSE;
+int __kmp_debugging = FALSE;
+#endif
+#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
+void __kmp_itt_fini_ittlib() {}
+void __kmp_itt_init_ittlib() {}
+#endif
+}
+
+// end of file
diff --git a/contrib/libs/cxxsupp/openmp/kmp_safe_c_api.h b/contrib/libs/cxxsupp/openmp/kmp_safe_c_api.h
index 3db1ada37b0..79f4a7f5732 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_safe_c_api.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_safe_c_api.h
@@ -30,6 +30,7 @@
#define KMP_SSCANF sscanf_s
#define KMP_STRCPY_S strcpy_s
#define KMP_STRNCPY_S strncpy_s
+#define KMP_STRNCAT_S strncat_s
// Use this only when buffer size is unknown
#define KMP_MEMCPY(dst, src, cnt) memcpy_s(dst, cnt, src, cnt)
@@ -55,12 +56,17 @@ template <typename T> struct kmp_get_rmax_t<T, true> {
// For now, these macros use the existing API.
+#if KMP_OS_NETBSD
+#define KMP_ALLOCA __builtin_alloca
+#else
#define KMP_ALLOCA alloca
+#endif
#define KMP_MEMCPY_S(dst, bsz, src, cnt) memcpy(dst, src, cnt)
#define KMP_SNPRINTF snprintf
#define KMP_SSCANF sscanf
#define KMP_STRCPY_S(dst, bsz, src) strcpy(dst, src)
#define KMP_STRNCPY_S(dst, bsz, src, cnt) strncpy(dst, src, cnt)
+#define KMP_STRNCAT_S(dst, bsz, src, cnt) strncat(dst, src, cnt)
#define KMP_VSNPRINTF vsnprintf
#define KMP_STRNCPY strncpy
#define KMP_STRLEN strlen
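
Editor's note: the header gains a bounded-concatenation wrapper mirroring the existing KMP_STRCPY_S/KMP_STRNCPY_S pattern: strncat_s where the secure CRT exists, plain strncat elsewhere. A hedged usage sketch of the same macro shape follows, as a standalone re-definition rather than the header itself.

#include <cstdio>
#include <cstring>

// Sketch: same shape as the wrappers in kmp_safe_c_api.h -- the secure CRT
// variant takes the destination buffer size, the portable fallback drops it.
#if defined(_WIN32) && defined(_MSC_VER)
#define DEMO_STRNCAT_S(dst, bsz, src, cnt) strncat_s(dst, bsz, src, cnt)
#else
#define DEMO_STRNCAT_S(dst, bsz, src, cnt) strncat(dst, src, cnt)
#endif

int main() {
  char buf[32] = "lib";
  DEMO_STRNCAT_S(buf, sizeof(buf), "omp", 3); // append at most 3 chars
  std::printf("%s\n", buf); // prints "libomp"
  return 0;
}
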
diff --git a/contrib/libs/cxxsupp/openmp/kmp_sched.cpp b/contrib/libs/cxxsupp/openmp/kmp_sched.cpp
index acd75448d29..2e0dfac6eeb 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_sched.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_sched.cpp
@@ -52,6 +52,7 @@ char const *traits_t<long>::spec = "ld";
} else if (i > 0) { \
t = (u - l) / i + 1; \
} else { \
+ KMP_DEBUG_ASSERT(i != 0); \
t = (l - u) / (-i) + 1; \
} \
KMP_COUNT_VALUE(stat, t); \
@@ -61,11 +62,13 @@ char const *traits_t<long>::spec = "ld";
#define KMP_STATS_LOOP_END(stat) /* Nothing */
#endif
+#if USE_ITT_BUILD || defined KMP_DEBUG
static ident_t loc_stub = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"};
static inline void check_loc(ident_t *&loc) {
if (loc == NULL)
loc = &loc_stub; // may need to report location info to ittnotify
}
+#endif
template <typename T>
static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
@@ -83,6 +86,9 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static);
KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling);
+ // Clear monotonic/nonmonotonic bits (ignore it)
+ schedtype = SCHEDULE_WITHOUT_MODIFIERS(schedtype);
+
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;
/* this all has to be changed back to TID and such.. */
@@ -97,7 +103,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
#if OMPT_SUPPORT && OMPT_OPTIONAL
ompt_team_info_t *team_info = NULL;
ompt_task_info_t *task_info = NULL;
- ompt_work_t ompt_work_type = ompt_work_loop;
+ ompt_work_t ompt_work_type = ompt_work_loop_static;
static kmp_int8 warn = 0;
@@ -108,7 +114,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
// Determine workshare type
if (loc != NULL) {
if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
- ompt_work_type = ompt_work_loop;
+ ompt_work_type = ompt_work_loop_static;
} else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
ompt_work_type = ompt_work_sections;
} else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
@@ -279,6 +285,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
// upper-lower can exceed the limit of signed type
trip_count = (UT)(*pupper - *plower) / incr + 1;
} else {
+ KMP_DEBUG_ASSERT(incr != 0);
trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
}
@@ -313,6 +320,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
if (plastiter != NULL)
*plastiter = (tid == trip_count - 1);
} else {
+ KMP_DEBUG_ASSERT(nth != 0);
if (__kmp_static == kmp_sch_static_balanced) {
UT small_chunk = trip_count / nth;
UT extras = trip_count % nth;
@@ -353,6 +361,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
case kmp_sch_static_chunked: {
ST span;
UT nchunks;
+ KMP_DEBUG_ASSERT(chunk != 0);
if (chunk < 1)
chunk = 1;
else if ((UT)chunk > trip_count)
@@ -378,6 +387,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
}
case kmp_sch_static_balanced_chunked: {
T old_upper = *pupper;
+ KMP_DEBUG_ASSERT(nth != 0);
// round up to make sure the chunk is enough to cover all iterations
UT span = (trip_count + nth - 1) / nth;
@@ -393,8 +403,10 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
} else if (*pupper < old_upper)
*pupper = old_upper;
- if (plastiter != NULL)
+ if (plastiter != NULL) {
+ KMP_DEBUG_ASSERT(chunk != 0);
*plastiter = (tid == ((trip_count - 1) / (UT)chunk));
+ }
break;
}
default:
@@ -412,6 +424,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
// Calculate chunk in case it was not specified; it is specified for
// kmp_sch_static_chunked
if (schedtype == kmp_sch_static) {
+ KMP_DEBUG_ASSERT(nth != 0);
cur_chunk = trip_count / nth + ((trip_count % nth) ? 1 : 0);
}
// 0 - "static" schedule
@@ -542,6 +555,7 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
// upper-lower can exceed the limit of signed type
trip_count = (UT)(*pupper - *plower) / incr + 1;
} else {
+ KMP_DEBUG_ASSERT(incr != 0);
trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
}
@@ -563,6 +577,7 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
*plastiter = (tid == 0 && team_id == trip_count - 1);
} else {
// Get the team's chunk first (each team gets at most one chunk)
+ KMP_DEBUG_ASSERT(nteams != 0);
if (__kmp_static == kmp_sch_static_balanced) {
UT chunkD = trip_count / nteams;
UT extras = trip_count % nteams;
@@ -614,6 +629,7 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
// upper-lower can exceed the limit of signed type
trip_count = (UT)(*pupperDist - *plower) / incr + 1;
} else {
+ KMP_DEBUG_ASSERT(incr != 0);
trip_count = (UT)(*plower - *pupperDist) / (-incr) + 1;
}
KMP_DEBUG_ASSERT(trip_count);
@@ -632,6 +648,7 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
if (*plastiter != 0 && !(tid == trip_count - 1))
*plastiter = 0;
} else {
+ KMP_DEBUG_ASSERT(nth != 0);
if (__kmp_static == kmp_sch_static_balanced) {
UT chunkL = trip_count / nth;
UT extras = trip_count % nth;
@@ -679,9 +696,11 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
*pstride = span * nth;
*plower = *plower + (span * tid);
*pupper = *plower + span - incr;
- if (plastiter != NULL)
+ if (plastiter != NULL) {
+ KMP_DEBUG_ASSERT(chunk != 0);
if (*plastiter != 0 && !(tid == ((trip_count - 1) / (UT)chunk) % nth))
*plastiter = 0;
+ }
break;
}
default:
@@ -804,6 +823,7 @@ static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid,
// upper-lower can exceed the limit of signed type
trip_count = (UT)(upper - lower) / incr + 1;
} else {
+ KMP_DEBUG_ASSERT(incr != 0);
trip_count = (UT)(lower - upper) / (-incr) + 1;
}
if (chunk < 1)
@@ -812,8 +832,10 @@ static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid,
*p_st = span * nteams;
*p_lb = lower + (span * team_id);
*p_ub = *p_lb + span - incr;
- if (p_last != NULL)
+ if (p_last != NULL) {
+ KMP_DEBUG_ASSERT(chunk != 0);
*p_last = (team_id == ((trip_count - 1) / (UT)chunk) % nteams);
+ }
// Correct upper bound if needed
if (incr > 0) {
if (*p_ub < *p_lb) // overflow?
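
The kmp_sched.cpp hunks above mainly add KMP_DEBUG_ASSERT guards in front of every division by incr, nth, nteams or chunk, clear the monotonic/nonmonotonic schedule modifier bits via SCHEDULE_WITHOUT_MODIFIERS, and report statically scheduled loops to OMPT as ompt_work_loop_static. A minimal standalone sketch of the trip-count computation that the incr asserts protect (function and type names here are illustrative, not the runtime's):

    #include <cassert>
    #include <cstdint>

    // Trip count of the canonical loop l, l+i, ..., u (inclusive), i != 0.
    // The subtraction is done in unsigned arithmetic because u - l can exceed
    // the signed range, which is why the runtime casts to UT before dividing.
    static uint64_t trip_count(int64_t l, int64_t u, int64_t i) {
      assert(i != 0 && "loop increment must not be zero");
      uint64_t up = static_cast<uint64_t>(u) - static_cast<uint64_t>(l);
      uint64_t down = static_cast<uint64_t>(l) - static_cast<uint64_t>(u);
      if (i == 1)
        return up + 1;
      if (i == -1)
        return down + 1;
      if (i > 0)
        return up / static_cast<uint64_t>(i) + 1;
      return down / static_cast<uint64_t>(-i) + 1;
    }
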
diff --git a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp
index 38ff15461b4..0037a8e0ccc 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp
@@ -149,70 +149,6 @@ static size_t __kmp_round4k(size_t size) {
} // __kmp_round4k
#endif
-/* Here, multipliers are like __kmp_convert_to_seconds, but floating-point
- values are allowed, and the return value is in milliseconds. The default
- multiplier is milliseconds. Returns INT_MAX only if the value specified
- matches "infinit*". Returns -1 if specified string is invalid. */
-int __kmp_convert_to_milliseconds(char const *data) {
- int ret, nvalues, factor;
- char mult, extra;
- double value;
-
- if (data == NULL)
- return (-1);
- if (__kmp_str_match("infinit", -1, data))
- return (INT_MAX);
- value = (double)0.0;
- mult = '\0';
-#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT
- // On Windows, each %c parameter needs additional size parameter for sscanf_s
- nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, 1, &extra, 1);
-#else
- nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, &extra);
-#endif
- if (nvalues < 1)
- return (-1);
- if (nvalues == 1)
- mult = '\0';
- if (nvalues == 3)
- return (-1);
-
- if (value < 0)
- return (-1);
-
- switch (mult) {
- case '\0':
- /* default is milliseconds */
- factor = 1;
- break;
- case 's':
- case 'S':
- factor = 1000;
- break;
- case 'm':
- case 'M':
- factor = 1000 * 60;
- break;
- case 'h':
- case 'H':
- factor = 1000 * 60 * 60;
- break;
- case 'd':
- case 'D':
- factor = 1000 * 24 * 60 * 60;
- break;
- default:
- return (-1);
- }
-
- if (value >= ((INT_MAX - 1) / factor))
- ret = INT_MAX - 1; /* Don't allow infinite value here */
- else
- ret = (int)(value * (double)factor); /* truncate to int */
-
- return ret;
-}
-
static int __kmp_strcasecmp_with_sentinel(char const *a, char const *b,
char sentinel) {
if (a == NULL)
@@ -282,6 +218,25 @@ static int __kmp_stg_check_rivals( // 0 -- Ok, 1 -- errors found.
kmp_setting_t **rivals // List of rival settings (must include current one).
);
+// Helper struct that trims heading/trailing white spaces
+struct kmp_trimmed_str_t {
+ kmp_str_buf_t buf;
+ kmp_trimmed_str_t(const char *str) {
+ __kmp_str_buf_init(&buf);
+ size_t len = KMP_STRLEN(str);
+ if (len == 0)
+ return;
+ const char *begin = str;
+ const char *end = str + KMP_STRLEN(str) - 1;
+ SKIP_WS(begin);
+ while (begin < end && *end == ' ')
+ end--;
+ __kmp_str_buf_cat(&buf, begin, end - begin + 1);
+ }
+ ~kmp_trimmed_str_t() { __kmp_str_buf_free(&buf); }
+ const char *get() { return buf.str; }
+};
+
// -----------------------------------------------------------------------------
// Helper parse functions.
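
kmp_trimmed_str_t above is a small RAII helper: it copies the incoming value into a kmp_str_buf_t with leading and trailing blanks stripped and frees the buffer in its destructor, so call sites such as the OMP_TARGET_OFFLOAD parser further down can compare against the trimmed text directly. A standalone analogue in plain C++, using std::string instead of kmp_str_buf_t purely for illustration:

    #include <cstdio>
    #include <string>

    // Rough analogue of kmp_trimmed_str_t: keep the middle of the input,
    // dropping leading and trailing spaces/tabs. Illustration only.
    struct trimmed_str {
      std::string buf;
      explicit trimmed_str(const char *str) : buf(str ? str : "") {
        const char *blanks = " \t";
        std::string::size_type b = buf.find_first_not_of(blanks);
        std::string::size_type e = buf.find_last_not_of(blanks);
        buf = (b == std::string::npos) ? std::string() : buf.substr(b, e - b + 1);
      }
      const char *get() const { return buf.c_str(); }
    };

    int main() {
      trimmed_str v("  mandatory \t");
      std::printf("[%s]\n", v.get()); // prints [mandatory]
      return 0;
    }
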
@@ -300,8 +255,13 @@ static void __kmp_stg_parse_bool(char const *name, char const *value,
// placed here in order to use __kmp_round4k static function
void __kmp_check_stksize(size_t *val) {
// if system stack size is too big then limit the size for worker threads
+#if KMP_OS_AIX
+ if (*val > KMP_DEFAULT_STKSIZE * 2) // Use 2 times, 16 is too large for AIX.
+ *val = KMP_DEFAULT_STKSIZE * 2;
+#else
if (*val > KMP_DEFAULT_STKSIZE * 16) // just a heuristics...
*val = KMP_DEFAULT_STKSIZE * 16;
+#endif
if (*val < __kmp_sys_min_stksize)
*val = __kmp_sys_min_stksize;
if (*val > KMP_MAX_STKSIZE)
@@ -712,24 +672,73 @@ static void __kmp_stg_print_use_yield(kmp_str_buf_t *buffer, char const *name,
static void __kmp_stg_parse_blocktime(char const *name, char const *value,
void *data) {
- __kmp_dflt_blocktime = __kmp_convert_to_milliseconds(value);
- if (__kmp_dflt_blocktime < 0) {
- __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+ const char *buf = value;
+ const char *next;
+ const int ms_mult = 1000;
+ int multiplier = 1;
+ int num;
+
+ // Read integer blocktime value
+ SKIP_WS(buf);
+ if ((*buf >= '0') && (*buf <= '9')) {
+ next = buf;
+ SKIP_DIGITS(next);
+ num = __kmp_basic_str_to_int(buf);
+ KMP_ASSERT(num >= 0);
+ buf = next;
+ SKIP_WS(buf);
+ } else {
+ num = -1;
+ }
+
+ // Read units: note that __kmp_dflt_blocktime units is now us
+ next = buf;
+ if (*buf == '\0' || __kmp_match_str("ms", buf, &next)) {
+ // units are in ms; convert
+ __kmp_dflt_blocktime = ms_mult * num;
+ __kmp_blocktime_units = 'm';
+ multiplier = ms_mult;
+ } else if (__kmp_match_str("us", buf, &next)) {
+ // units are in us
+ __kmp_dflt_blocktime = num;
+ __kmp_blocktime_units = 'u';
+ } else if (__kmp_match_str("infinite", buf, &next) ||
+ __kmp_match_str("infinity", buf, &next)) {
+ // units are in ms
+ __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME;
+ __kmp_blocktime_units = 'm';
+ multiplier = ms_mult;
+ } else {
+ KMP_WARNING(StgInvalidValue, name, value);
+ // default units are in ms
+ __kmp_dflt_blocktime = ms_mult * num;
+ __kmp_blocktime_units = 'm';
+ multiplier = ms_mult;
+ }
+
+ if (num < 0 && __kmp_dflt_blocktime < 0) { // num out of range
+ __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; // now in us
__kmp_msg(kmp_ms_warning, KMP_MSG(InvalidValue, name, value),
__kmp_msg_null);
- KMP_INFORM(Using_int_Value, name, __kmp_dflt_blocktime);
+ // Inform in appropriate units
+ KMP_INFORM(Using_int_Value, name, __kmp_dflt_blocktime / multiplier);
__kmp_env_blocktime = FALSE; // Revert to default as if var not set.
+ } else if (num > 0 && __kmp_dflt_blocktime < 0) { // overflow
+ __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME;
+ __kmp_msg(kmp_ms_warning, KMP_MSG(LargeValue, name, value), __kmp_msg_null);
+ KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime / multiplier);
+ __kmp_env_blocktime = TRUE; // KMP_BLOCKTIME was specified.
} else {
if (__kmp_dflt_blocktime < KMP_MIN_BLOCKTIME) {
__kmp_dflt_blocktime = KMP_MIN_BLOCKTIME;
__kmp_msg(kmp_ms_warning, KMP_MSG(SmallValue, name, value),
__kmp_msg_null);
- KMP_INFORM(MinValueUsing, name, __kmp_dflt_blocktime);
+ KMP_INFORM(MinValueUsing, name, __kmp_dflt_blocktime / multiplier);
} else if (__kmp_dflt_blocktime > KMP_MAX_BLOCKTIME) {
__kmp_dflt_blocktime = KMP_MAX_BLOCKTIME;
__kmp_msg(kmp_ms_warning, KMP_MSG(LargeValue, name, value),
__kmp_msg_null);
- KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime);
+ KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime / multiplier);
}
__kmp_env_blocktime = TRUE; // KMP_BLOCKTIME was specified.
}
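
The rewritten KMP_BLOCKTIME parser stores __kmp_dflt_blocktime in microseconds and accepts an optional unit suffix: no suffix and "ms" mean milliseconds, "us" means microseconds, and "infinite"/"infinity" select the maximum; an unrecognized suffix warns and falls back to milliseconds. A hedged sketch of just the conversion rule (standalone, not the runtime's parser; max_us stands in for KMP_MAX_BLOCKTIME):

    #include <cstring>

    // Convert a parsed KMP_BLOCKTIME number plus suffix into microseconds.
    static long long blocktime_to_us(long long num, const char *suffix,
                                     long long max_us) {
      if (std::strcmp(suffix, "infinite") == 0 ||
          std::strcmp(suffix, "infinity") == 0)
        return max_us;
      if (std::strcmp(suffix, "us") == 0)
        return num;      // already microseconds
      return num * 1000; // ""/"ms" (and the warning fallback): ms to us
    }
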
@@ -749,7 +758,17 @@ static void __kmp_stg_parse_blocktime(char const *name, char const *value,
static void __kmp_stg_print_blocktime(kmp_str_buf_t *buffer, char const *name,
void *data) {
- __kmp_stg_print_int(buffer, name, __kmp_dflt_blocktime);
+ int num = __kmp_dflt_blocktime;
+ if (__kmp_blocktime_units == 'm') {
+ num = num / 1000;
+ }
+ if (__kmp_env_format) {
+ KMP_STR_BUF_PRINT_NAME_EX(name);
+ } else {
+ __kmp_str_buf_print(buffer, " %s=", name);
+ }
+ __kmp_str_buf_print(buffer, "%d", num);
+ __kmp_str_buf_print(buffer, "%cs\n", __kmp_blocktime_units);
} // __kmp_stg_print_blocktime
// -----------------------------------------------------------------------------
@@ -859,6 +878,10 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name,
case library_throughput: {
value = "PASSIVE";
} break;
+ case library_none:
+ case library_serial: {
+ value = NULL;
+ } break;
}
} else {
switch (__kmp_library) {
@@ -871,6 +894,9 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name,
case library_throughput: {
value = "throughput";
} break;
+ case library_none: {
+ value = NULL;
+ } break;
}
}
if (value != NULL) {
@@ -1238,6 +1264,28 @@ static void __kmp_stg_parse_num_threads(char const *name, char const *value,
K_DIAG(1, ("__kmp_dflt_team_nth == %d\n", __kmp_dflt_team_nth));
} // __kmp_stg_parse_num_threads
+#if OMPX_TASKGRAPH
+static void __kmp_stg_parse_max_tdgs(char const *name, char const *value,
+ void *data) {
+ __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_max_tdgs);
+} // __kmp_stg_parse_max_tdgs
+
+static void __kmp_std_print_max_tdgs(kmp_str_buf_t *buffer, char const *name,
+ void *data) {
+ __kmp_stg_print_int(buffer, name, __kmp_max_tdgs);
+} // __kmp_std_print_max_tdgs
+
+static void __kmp_stg_parse_tdg_dot(char const *name, char const *value,
+ void *data) {
+ __kmp_stg_parse_bool(name, value, &__kmp_tdg_dot);
+} // __kmp_stg_parse_tdg_dot
+
+static void __kmp_stg_print_tdg_dot(kmp_str_buf_t *buffer, char const *name,
+ void *data) {
+ __kmp_stg_print_bool(buffer, name, __kmp_tdg_dot);
+} // __kmp_stg_print_tdg_dot
+#endif
+
static void __kmp_stg_parse_num_hidden_helper_threads(char const *name,
char const *value,
void *data) {
@@ -1247,7 +1295,7 @@ static void __kmp_stg_parse_num_hidden_helper_threads(char const *name,
if (__kmp_hidden_helper_threads_num == 0) {
__kmp_enable_hidden_helper = FALSE;
} else {
- // Since the main thread of hidden helper team dooes not participate
+ // Since the main thread of hidden helper team does not participate
// in tasks execution let's increment the number of threads by one
// so that requested number of threads do actual job.
__kmp_hidden_helper_threads_num++;
@@ -1373,14 +1421,13 @@ static void __kmp_stg_print_default_device(kmp_str_buf_t *buffer,
// OpenMP 5.0: OMP_TARGET_OFFLOAD
static void __kmp_stg_parse_target_offload(char const *name, char const *value,
void *data) {
- const char *next = value;
- const char *scan = next;
-
+ kmp_trimmed_str_t value_str(value);
+ const char *scan = value_str.get();
__kmp_target_offload = tgt_default;
- SKIP_WS(next);
- if (*next == '\0')
+
+ if (*scan == '\0')
return;
- scan = next;
+
if (!__kmp_strcasecmp_with_sentinel("mandatory", scan, 0)) {
__kmp_target_offload = tgt_mandatory;
} else if (!__kmp_strcasecmp_with_sentinel("disabled", scan, 0)) {
@@ -1390,7 +1437,6 @@ static void __kmp_stg_parse_target_offload(char const *name, char const *value,
} else {
KMP_WARNING(SyntaxErrorUsing, name, "DEFAULT");
}
-
} // __kmp_stg_parse_target_offload
static void __kmp_stg_print_target_offload(kmp_str_buf_t *buffer,
@@ -1428,7 +1474,7 @@ static void __kmp_stg_print_max_task_priority(kmp_str_buf_t *buffer,
// taskloop threshold to switch from recursive to linear tasks creation
static void __kmp_stg_parse_taskloop_min_tasks(char const *name,
char const *value, void *data) {
- int tmp;
+ int tmp = 0;
__kmp_stg_parse_int(name, value, 0, INT_MAX, &tmp);
__kmp_taskloop_min_tasks = tmp;
} // __kmp_stg_parse_taskloop_min_tasks
@@ -1559,7 +1605,7 @@ static void __kmp_stg_parse_debug(char const *name, char const *value,
static void __kmp_stg_parse_debug_buf(char const *name, char const *value,
void *data) {
__kmp_stg_parse_bool(name, value, &__kmp_debug_buf);
- // !!! TODO: Move buffer initialization of of this file! It may works
+ // !!! TODO: Move buffer initialization of this file! It may works
// incorrectly if KMP_DEBUG_BUF is parsed before KMP_DEBUG_BUF_LINES or
// KMP_DEBUG_BUF_CHARS.
if (__kmp_debug_buf) {
@@ -1966,6 +2012,23 @@ static void __kmp_stg_print_foreign_threads_threadprivate(kmp_str_buf_t *buffer,
// -----------------------------------------------------------------------------
// KMP_AFFINITY, GOMP_CPU_AFFINITY, KMP_TOPOLOGY_METHOD
+static inline const char *
+__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) {
+ switch (type) {
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ case KMP_HW_MAX_NUM_CORE_TYPES:
+ return "unknown";
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return "intel_atom";
+ case KMP_HW_CORE_TYPE_CORE:
+ return "intel_core";
+#endif
+ }
+ KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
+}
+
#if KMP_AFFINITY_SUPPORTED
// Parse the proc id list. Return TRUE if successful, FALSE otherwise.
static int __kmp_parse_affinity_proc_id_list(const char *var, const char *env,
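
__kmp_hw_get_core_type_keyword above, like the default: cases added to __kmp_stg_print_omp_schedule later in this file, follows one pattern: handle every enumerator explicitly, then assert and mark the tail unreachable so the compiler neither warns about a missing return nor emits dead fall-through code. A generic sketch of that pattern; using __builtin_unreachable as a stand-in for KMP_BUILTIN_UNREACHABLE is an assumption about how that macro expands on GCC/Clang:

    #include <cassert>

    enum class core_kind { unknown, atom, core };

    static const char *core_kind_name(core_kind k) {
      switch (k) { // exhaustive: every enumerator returns
      case core_kind::unknown:
        return "unknown";
      case core_kind::atom:
        return "intel_atom";
      case core_kind::core:
        return "intel_core";
      }
      assert(false && "unhandled core_kind enumerator");
    #if defined(__GNUC__) || defined(__clang__)
      __builtin_unreachable(); // stand-in for KMP_BUILTIN_UNREACHABLE
    #else
      return "unknown";
    #endif
    }

    int main() { return core_kind_name(core_kind::atom)[0] == 'i' ? 0 : 1; }
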
@@ -2148,12 +2211,7 @@ static int __kmp_parse_affinity_proc_id_list(const char *var, const char *env,
static kmp_setting_t *__kmp_affinity_notype = NULL;
static void __kmp_parse_affinity_env(char const *name, char const *value,
- enum affinity_type *out_type,
- char **out_proclist, int *out_verbose,
- int *out_warn, int *out_respect,
- kmp_hw_t *out_gran, int *out_gran_levels,
- int *out_dups, int *out_compact,
- int *out_offset) {
+ kmp_affinity_t *out_affinity) {
char *buffer = NULL; // Copy of env var value.
char *buf = NULL; // Buffer for strtok_r() function.
char *next = NULL; // end of token / start of next.
@@ -2219,19 +2277,20 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
++_guard; \
}
-#define set_type(val) _set_param(type, *out_type, val)
-#define set_verbose(val) _set_param(verbose, *out_verbose, val)
-#define set_warnings(val) _set_param(warnings, *out_warn, val)
-#define set_respect(val) _set_param(respect, *out_respect, val)
-#define set_dups(val) _set_param(dups, *out_dups, val)
-#define set_proclist(val) _set_param(proclist, *out_proclist, val)
-#define set_reset(val) _set_param(reset, __kmp_affin_reset, val)
+#define set_type(val) _set_param(type, out_affinity->type, val)
+#define set_verbose(val) _set_param(verbose, out_affinity->flags.verbose, val)
+#define set_warnings(val) \
+ _set_param(warnings, out_affinity->flags.warnings, val)
+#define set_respect(val) _set_param(respect, out_affinity->flags.respect, val)
+#define set_dups(val) _set_param(dups, out_affinity->flags.dups, val)
+#define set_proclist(val) _set_param(proclist, out_affinity->proclist, val)
+#define set_reset(val) _set_param(reset, out_affinity->flags.reset, val)
#define set_gran(val, levels) \
{ \
if (gran == 0) { \
- *out_gran = val; \
- *out_gran_levels = levels; \
+ out_affinity->gran = val; \
+ out_affinity->gran_levels = levels; \
} else { \
EMIT_WARN(FALSE, (AffParamDefined, name, start)); \
} \
@@ -2324,14 +2383,32 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
buf = next;
- // Try any hardware topology type for granularity
- KMP_FOREACH_HW_TYPE(type) {
- const char *name = __kmp_hw_get_keyword(type);
- if (__kmp_match_str(name, buf, CCAST(const char **, &next))) {
- set_gran(type, -1);
- buf = next;
- set = true;
- break;
+ // Have to try core_type and core_efficiency matches first since "core"
+ // will register as core granularity with "extra chars"
+ if (__kmp_match_str("core_type", buf, CCAST(const char **, &next))) {
+ set_gran(KMP_HW_CORE, -1);
+ out_affinity->flags.core_types_gran = 1;
+ buf = next;
+ set = true;
+ } else if (__kmp_match_str("core_efficiency", buf,
+ CCAST(const char **, &next)) ||
+ __kmp_match_str("core_eff", buf,
+ CCAST(const char **, &next))) {
+ set_gran(KMP_HW_CORE, -1);
+ out_affinity->flags.core_effs_gran = 1;
+ buf = next;
+ set = true;
+ }
+ if (!set) {
+ // Try any hardware topology type for granularity
+ KMP_FOREACH_HW_TYPE(type) {
+ const char *name = __kmp_hw_get_keyword(type);
+ if (__kmp_match_str(name, buf, CCAST(const char **, &next))) {
+ set_gran(type, -1);
+ buf = next;
+ set = true;
+ break;
+ }
}
}
if (!set) {
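
The reordering above matters because, as the new comment notes, __kmp_match_str performs prefix matching: granularity=core_type would otherwise match the shorter keyword "core" and leave "_type" behind as extra characters. A small standalone matcher shows the effect (match_prefix is an illustrative stand-in for __kmp_match_str, which is also case-insensitive):

    #include <cstdio>
    #include <cstring>

    // Return true if token is a prefix of input; *rest then points past it.
    static bool match_prefix(const char *token, const char *input,
                             const char **rest) {
      std::size_t n = std::strlen(token);
      if (std::strncmp(token, input, n) != 0)
        return false;
      *rest = input + n;
      return true;
    }

    int main() {
      const char *rest = "";
      // Trying "core" first succeeds and strands "_type" as extra chars...
      std::printf("%d [%s]\n", match_prefix("core", "core_type", &rest), rest);
      // ...so the longer keywords are tested before the generic topology names.
      std::printf("%d [%s]\n", match_prefix("core_type", "core_type", &rest), rest);
      return 0;
    }
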
@@ -2448,20 +2525,20 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
if (proclist) {
if (!type) {
KMP_WARNING(AffProcListNoType, name);
- *out_type = affinity_explicit;
+ out_affinity->type = affinity_explicit;
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
- } else if (*out_type != affinity_explicit) {
+ } else if (out_affinity->type != affinity_explicit) {
KMP_WARNING(AffProcListNotExplicit, name);
- KMP_ASSERT(*out_proclist != NULL);
- KMP_INTERNAL_FREE(*out_proclist);
- *out_proclist = NULL;
+ KMP_ASSERT(out_affinity->proclist != NULL);
+ KMP_INTERNAL_FREE(out_affinity->proclist);
+ out_affinity->proclist = NULL;
}
}
- switch (*out_type) {
+ switch (out_affinity->type) {
case affinity_logical:
case affinity_physical: {
if (count > 0) {
- *out_offset = number[0];
+ out_affinity->offset = number[0];
}
if (count > 1) {
KMP_WARNING(AffManyParamsForLogic, name, number[1]);
@@ -2469,42 +2546,44 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
} break;
case affinity_balanced: {
if (count > 0) {
- *out_compact = number[0];
+ out_affinity->compact = number[0];
}
if (count > 1) {
- *out_offset = number[1];
+ out_affinity->offset = number[1];
}
- if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
+ if (__kmp_affinity.gran == KMP_HW_UNKNOWN) {
+ int verbose = out_affinity->flags.verbose;
+ int warnings = out_affinity->flags.warnings;
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type != non_mic) {
- if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
- KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "fine");
+ if (verbose || warnings) {
+ KMP_WARNING(AffGranUsing, out_affinity->env_var, "fine");
}
- __kmp_affinity_gran = KMP_HW_THREAD;
+ out_affinity->gran = KMP_HW_THREAD;
} else
#endif
{
- if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
- KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "core");
+ if (verbose || warnings) {
+ KMP_WARNING(AffGranUsing, out_affinity->env_var, "core");
}
- __kmp_affinity_gran = KMP_HW_CORE;
+ out_affinity->gran = KMP_HW_CORE;
}
}
} break;
case affinity_scatter:
case affinity_compact: {
if (count > 0) {
- *out_compact = number[0];
+ out_affinity->compact = number[0];
}
if (count > 1) {
- *out_offset = number[1];
+ out_affinity->offset = number[1];
}
} break;
case affinity_explicit: {
- if (*out_proclist == NULL) {
+ if (out_affinity->proclist == NULL) {
KMP_WARNING(AffNoProcList, name);
- __kmp_affinity_type = affinity_none;
+ out_affinity->type = affinity_none;
}
if (count > 0) {
KMP_WARNING(AffNoParam, name, "explicit");
@@ -2541,74 +2620,91 @@ static void __kmp_stg_parse_affinity(char const *name, char const *value,
return;
}
- __kmp_parse_affinity_env(name, value, &__kmp_affinity_type,
- &__kmp_affinity_proclist, &__kmp_affinity_verbose,
- &__kmp_affinity_warnings,
- &__kmp_affinity_respect_mask, &__kmp_affinity_gran,
- &__kmp_affinity_gran_levels, &__kmp_affinity_dups,
- &__kmp_affinity_compact, &__kmp_affinity_offset);
+ __kmp_parse_affinity_env(name, value, &__kmp_affinity);
} // __kmp_stg_parse_affinity
+static void __kmp_stg_parse_hh_affinity(char const *name, char const *value,
+ void *data) {
+ __kmp_parse_affinity_env(name, value, &__kmp_hh_affinity);
+ // Warn about unused parts of hidden helper affinity settings if specified.
+ if (__kmp_hh_affinity.flags.reset) {
+ KMP_WARNING(AffInvalidParam, name, "reset");
+ }
+ if (__kmp_hh_affinity.flags.respect != affinity_respect_mask_default) {
+ KMP_WARNING(AffInvalidParam, name, "respect");
+ }
+}
-static void __kmp_stg_print_affinity(kmp_str_buf_t *buffer, char const *name,
- void *data) {
+static void __kmp_print_affinity_env(kmp_str_buf_t *buffer, char const *name,
+ const kmp_affinity_t &affinity) {
+ bool is_hh_affinity = (&affinity == &__kmp_hh_affinity);
if (__kmp_env_format) {
KMP_STR_BUF_PRINT_NAME_EX(name);
} else {
__kmp_str_buf_print(buffer, " %s='", name);
}
- if (__kmp_affinity_verbose) {
+ if (affinity.flags.verbose) {
__kmp_str_buf_print(buffer, "%s,", "verbose");
} else {
__kmp_str_buf_print(buffer, "%s,", "noverbose");
}
- if (__kmp_affinity_warnings) {
+ if (affinity.flags.warnings) {
__kmp_str_buf_print(buffer, "%s,", "warnings");
} else {
__kmp_str_buf_print(buffer, "%s,", "nowarnings");
}
if (KMP_AFFINITY_CAPABLE()) {
- if (__kmp_affinity_respect_mask) {
- __kmp_str_buf_print(buffer, "%s,", "respect");
- } else {
- __kmp_str_buf_print(buffer, "%s,", "norespect");
+ // Hidden helper affinity does not affect global reset
+ // or respect flags. That is still solely controlled by KMP_AFFINITY.
+ if (!is_hh_affinity) {
+ if (affinity.flags.respect) {
+ __kmp_str_buf_print(buffer, "%s,", "respect");
+ } else {
+ __kmp_str_buf_print(buffer, "%s,", "norespect");
+ }
+ if (affinity.flags.reset) {
+ __kmp_str_buf_print(buffer, "%s,", "reset");
+ } else {
+ __kmp_str_buf_print(buffer, "%s,", "noreset");
+ }
}
- if (__kmp_affin_reset) {
- __kmp_str_buf_print(buffer, "%s,", "reset");
+ __kmp_str_buf_print(buffer, "granularity=");
+ if (affinity.flags.core_types_gran)
+ __kmp_str_buf_print(buffer, "core_type,");
+ else if (affinity.flags.core_effs_gran) {
+ __kmp_str_buf_print(buffer, "core_eff,");
} else {
- __kmp_str_buf_print(buffer, "%s,", "noreset");
+ __kmp_str_buf_print(
+ buffer, "%s,", __kmp_hw_get_keyword(affinity.gran, /*plural=*/false));
}
- __kmp_str_buf_print(buffer, "granularity=%s,",
- __kmp_hw_get_keyword(__kmp_affinity_gran, false));
}
if (!KMP_AFFINITY_CAPABLE()) {
__kmp_str_buf_print(buffer, "%s", "disabled");
- } else
- switch (__kmp_affinity_type) {
+ } else {
+ int compact = affinity.compact;
+ int offset = affinity.offset;
+ switch (affinity.type) {
case affinity_none:
__kmp_str_buf_print(buffer, "%s", "none");
break;
case affinity_physical:
- __kmp_str_buf_print(buffer, "%s,%d", "physical", __kmp_affinity_offset);
+ __kmp_str_buf_print(buffer, "%s,%d", "physical", offset);
break;
case affinity_logical:
- __kmp_str_buf_print(buffer, "%s,%d", "logical", __kmp_affinity_offset);
+ __kmp_str_buf_print(buffer, "%s,%d", "logical", offset);
break;
case affinity_compact:
- __kmp_str_buf_print(buffer, "%s,%d,%d", "compact", __kmp_affinity_compact,
- __kmp_affinity_offset);
+ __kmp_str_buf_print(buffer, "%s,%d,%d", "compact", compact, offset);
break;
case affinity_scatter:
- __kmp_str_buf_print(buffer, "%s,%d,%d", "scatter", __kmp_affinity_compact,
- __kmp_affinity_offset);
+ __kmp_str_buf_print(buffer, "%s,%d,%d", "scatter", compact, offset);
break;
case affinity_explicit:
- __kmp_str_buf_print(buffer, "%s=[%s],%s", "proclist",
- __kmp_affinity_proclist, "explicit");
+ __kmp_str_buf_print(buffer, "%s=[%s],%s", "proclist", affinity.proclist,
+ "explicit");
break;
case affinity_balanced:
- __kmp_str_buf_print(buffer, "%s,%d,%d", "balanced",
- __kmp_affinity_compact, __kmp_affinity_offset);
+ __kmp_str_buf_print(buffer, "%s,%d,%d", "balanced", compact, offset);
break;
case affinity_disabled:
__kmp_str_buf_print(buffer, "%s", "disabled");
@@ -2620,9 +2716,19 @@ static void __kmp_stg_print_affinity(kmp_str_buf_t *buffer, char const *name,
__kmp_str_buf_print(buffer, "%s", "<unknown>");
break;
}
+ }
__kmp_str_buf_print(buffer, "'\n");
} //__kmp_stg_print_affinity
+static void __kmp_stg_print_affinity(kmp_str_buf_t *buffer, char const *name,
+ void *data) {
+ __kmp_print_affinity_env(buffer, name, __kmp_affinity);
+}
+static void __kmp_stg_print_hh_affinity(kmp_str_buf_t *buffer, char const *name,
+ void *data) {
+ __kmp_print_affinity_env(buffer, name, __kmp_hh_affinity);
+}
+
#ifdef KMP_GOMP_COMPAT
static void __kmp_stg_parse_gomp_cpu_affinity(char const *name,
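
Much of this hunk is a mechanical refactor: __kmp_parse_affinity_env used to fill a long list of out-parameters and now fills a single kmp_affinity_t, which is what lets KMP_AFFINITY and the new KMP_HIDDEN_HELPER_AFFINITY share one parser and one printer. A schematic sketch of the before/after shape (struct and field names below are illustrative, not the runtime's definitions):

    // Before: each knob is its own out-parameter, so supporting a second
    // environment variable would mean duplicating the whole signature.
    //   void parse_affinity(const char *value, int *type, char **proclist,
    //                       int *verbose, int *warnings, int *respect, ...);

    // After: one settings object per environment variable.
    struct affinity_settings {
      int type = 0;
      char *proclist = nullptr;
      bool verbose = false;
      bool warnings = false;
      bool respect = true;
    };

    static void parse_affinity(const char *value, affinity_settings *out) {
      out->verbose = (value && value[0] == 'v'); // real parsing elided
    }

    int main() {
      affinity_settings regular, hidden_helper;
      parse_affinity("verbose,granularity=core", &regular); // KMP_AFFINITY
      parse_affinity("none", &hidden_helper);      // KMP_HIDDEN_HELPER_AFFINITY
      return 0;
    }
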
@@ -2649,9 +2755,9 @@ static void __kmp_stg_parse_gomp_cpu_affinity(char const *name,
SKIP_WS(next);
if (*next == '\0') {
// GOMP_CPU_AFFINITY => granularity=fine,explicit,proclist=...
- __kmp_affinity_proclist = temp_proclist;
- __kmp_affinity_type = affinity_explicit;
- __kmp_affinity_gran = KMP_HW_THREAD;
+ __kmp_affinity.proclist = temp_proclist;
+ __kmp_affinity.type = affinity_explicit;
+ __kmp_affinity.gran = KMP_HW_THREAD;
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
} else {
KMP_WARNING(AffSyntaxError, name);
@@ -2661,7 +2767,7 @@ static void __kmp_stg_parse_gomp_cpu_affinity(char const *name,
}
} else {
// Warning already emitted
- __kmp_affinity_type = affinity_none;
+ __kmp_affinity.type = affinity_none;
__kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
}
} // __kmp_stg_parse_gomp_cpu_affinity
@@ -2688,11 +2794,7 @@ signed := + signed
signed := - signed
-----------------------------------------------------------------------------*/
-// Warning to issue for syntax error during parsing of OMP_PLACES
-static inline void __kmp_omp_places_syntax_warn(const char *var) {
- KMP_WARNING(SyntaxErrorUsing, var, "\"cores\"");
-}
-
+// Return TRUE if successful parse, FALSE otherwise
static int __kmp_parse_subplace_list(const char *var, const char **scan) {
const char *next;
@@ -2704,7 +2806,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
//
SKIP_WS(*scan);
if ((**scan < '0') || (**scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = *scan;
@@ -2723,7 +2824,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
continue;
}
if (**scan != ':') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
(*scan)++; // skip ':'
@@ -2731,7 +2831,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
// Read count parameter
SKIP_WS(*scan);
if ((**scan < '0') || (**scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = *scan;
@@ -2750,7 +2849,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
continue;
}
if (**scan != ':') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
(*scan)++; // skip ':'
@@ -2772,7 +2870,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
}
SKIP_WS(*scan);
if ((**scan < '0') || (**scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = *scan;
@@ -2791,13 +2888,12 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
(*scan)++; // skip ','
continue;
}
-
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
return TRUE;
}
+// Return TRUE if successful parse, FALSE otherwise
static int __kmp_parse_place(const char *var, const char **scan) {
const char *next;
@@ -2809,7 +2905,6 @@ static int __kmp_parse_place(const char *var, const char **scan) {
return FALSE;
}
if (**scan != '}') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
(*scan)++; // skip '}'
@@ -2823,12 +2918,12 @@ static int __kmp_parse_place(const char *var, const char **scan) {
KMP_ASSERT(proc >= 0);
*scan = next;
} else {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
return TRUE;
}
+// Return TRUE if successful parse, FALSE otherwise
static int __kmp_parse_place_list(const char *var, const char *env,
char **place_list) {
const char *scan = env;
@@ -2851,7 +2946,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
continue;
}
if (*scan != ':') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
scan++; // skip ':'
@@ -2859,7 +2953,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
// Read count parameter
SKIP_WS(scan);
if ((*scan < '0') || (*scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = scan;
@@ -2878,7 +2971,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
continue;
}
if (*scan != ':') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
scan++; // skip ':'
@@ -2900,7 +2992,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
}
SKIP_WS(scan);
if ((*scan < '0') || (*scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = scan;
@@ -2920,7 +3011,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
continue;
}
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
@@ -2934,6 +3024,22 @@ static int __kmp_parse_place_list(const char *var, const char *env,
return TRUE;
}
+static inline void __kmp_places_set(enum affinity_type type, kmp_hw_t kind) {
+ __kmp_affinity.type = type;
+ __kmp_affinity.gran = kind;
+ __kmp_affinity.flags.dups = FALSE;
+ __kmp_affinity.flags.omp_places = TRUE;
+}
+
+static void __kmp_places_syntax_error_fallback(char const *name,
+ kmp_hw_t kind) {
+ const char *str = __kmp_hw_get_catalog_string(kind, /*plural=*/true);
+ KMP_WARNING(SyntaxErrorUsing, name, str);
+ __kmp_places_set(affinity_compact, kind);
+ if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default)
+ __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+}
+
static void __kmp_stg_parse_places(char const *name, char const *value,
void *data) {
struct kmp_place_t {
@@ -2944,7 +3050,6 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
bool set = false;
const char *scan = value;
const char *next = scan;
- const char *kind = "\"threads\"";
kmp_place_t std_places[] = {{"threads", KMP_HW_THREAD},
{"cores", KMP_HW_CORE},
{"numa_domains", KMP_HW_NUMA},
@@ -2963,10 +3068,54 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
const kmp_place_t &place = std_places[i];
if (__kmp_match_str(place.name, scan, &next)) {
scan = next;
- __kmp_affinity_type = affinity_compact;
- __kmp_affinity_gran = place.type;
- __kmp_affinity_dups = FALSE;
+ __kmp_places_set(affinity_compact, place.type);
set = true;
+ // Parse core attribute if it exists
+ if (KMP_HW_MAX_NUM_CORE_TYPES > 1) {
+ SKIP_WS(scan);
+ if (*scan == ':') {
+ if (place.type != KMP_HW_CORE) {
+ __kmp_places_syntax_error_fallback(name, place.type);
+ return;
+ }
+ scan++; // skip ':'
+ SKIP_WS(scan);
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ if (__kmp_match_str("intel_core", scan, &next)) {
+ __kmp_affinity.core_attr_gran.core_type = KMP_HW_CORE_TYPE_CORE;
+ __kmp_affinity.core_attr_gran.valid = 1;
+ scan = next;
+ } else if (__kmp_match_str("intel_atom", scan, &next)) {
+ __kmp_affinity.core_attr_gran.core_type = KMP_HW_CORE_TYPE_ATOM;
+ __kmp_affinity.core_attr_gran.valid = 1;
+ scan = next;
+ } else
+#endif
+ if (__kmp_match_str("eff", scan, &next)) {
+ int eff;
+ if (!isdigit(*next)) {
+ __kmp_places_syntax_error_fallback(name, place.type);
+ return;
+ }
+ scan = next;
+ SKIP_DIGITS(next);
+ eff = __kmp_str_to_int(scan, *next);
+ if (eff < 0) {
+ __kmp_places_syntax_error_fallback(name, place.type);
+ return;
+ }
+ if (eff >= KMP_HW_MAX_NUM_CORE_EFFS)
+ eff = KMP_HW_MAX_NUM_CORE_EFFS - 1;
+ __kmp_affinity.core_attr_gran.core_eff = eff;
+ __kmp_affinity.core_attr_gran.valid = 1;
+ scan = next;
+ }
+ if (!__kmp_affinity.core_attr_gran.valid) {
+ __kmp_places_syntax_error_fallback(name, place.type);
+ return;
+ }
+ }
+ }
break;
}
}
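
With this hunk OMP_PLACES accepts a core attribute after the place name on hybrid CPUs, for example cores:intel_atom, cores:intel_core or cores:eff0; anything the parser cannot interpret goes through the new __kmp_places_syntax_error_fallback. A hedged sketch of parsing just the "eff<N>" attribute (standalone helpers, not the runtime's SKIP_DIGITS/__kmp_str_to_int):

    #include <cctype>
    #include <cstdio>
    #include <cstring>

    // Parse an "eff<N>" attribute; return the efficiency index, or -1 on error.
    static int parse_core_eff(const char *s) {
      if (std::strncmp(s, "eff", 3) != 0 ||
          !std::isdigit(static_cast<unsigned char>(s[3])))
        return -1;
      int eff = 0;
      for (const char *p = s + 3; std::isdigit(static_cast<unsigned char>(*p)); ++p)
        eff = eff * 10 + (*p - '0');
      return eff;
    }

    int main() {
      std::printf("%d\n", parse_core_eff("eff1")); // 1
      std::printf("%d\n", parse_core_eff("effx")); // -1 -> syntax-error fallback
      return 0;
    }
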
@@ -2978,36 +3127,56 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
continue;
if (__kmp_match_str(name, scan, &next)) {
scan = next;
- __kmp_affinity_type = affinity_compact;
- __kmp_affinity_gran = type;
- __kmp_affinity_dups = FALSE;
+ __kmp_places_set(affinity_compact, type);
set = true;
break;
}
}
}
+ // Implementation choices for OMP_PLACES based on core attributes
+ if (!set) {
+ if (__kmp_match_str("core_types", scan, &next)) {
+ scan = next;
+ if (*scan != '\0') {
+ KMP_WARNING(ParseExtraCharsWarn, name, scan);
+ }
+ __kmp_places_set(affinity_compact, KMP_HW_CORE);
+ __kmp_affinity.flags.core_types_gran = 1;
+ set = true;
+ } else if (__kmp_match_str("core_effs", scan, &next) ||
+ __kmp_match_str("core_efficiencies", scan, &next)) {
+ scan = next;
+ if (*scan != '\0') {
+ KMP_WARNING(ParseExtraCharsWarn, name, scan);
+ }
+ __kmp_places_set(affinity_compact, KMP_HW_CORE);
+ __kmp_affinity.flags.core_effs_gran = 1;
+ set = true;
+ }
+ }
+ // Explicit place list
if (!set) {
- if (__kmp_affinity_proclist != NULL) {
- KMP_INTERNAL_FREE((void *)__kmp_affinity_proclist);
- __kmp_affinity_proclist = NULL;
- }
- if (__kmp_parse_place_list(name, value, &__kmp_affinity_proclist)) {
- __kmp_affinity_type = affinity_explicit;
- __kmp_affinity_gran = KMP_HW_THREAD;
- __kmp_affinity_dups = FALSE;
+ if (__kmp_affinity.proclist != NULL) {
+ KMP_INTERNAL_FREE((void *)__kmp_affinity.proclist);
+ __kmp_affinity.proclist = NULL;
+ }
+ if (__kmp_parse_place_list(name, value, &__kmp_affinity.proclist)) {
+ __kmp_places_set(affinity_explicit, KMP_HW_THREAD);
} else {
// Syntax error fallback
- __kmp_affinity_type = affinity_compact;
- __kmp_affinity_gran = KMP_HW_CORE;
- __kmp_affinity_dups = FALSE;
+ __kmp_places_syntax_error_fallback(name, KMP_HW_CORE);
}
if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
}
return;
}
- if (__kmp_affinity_gran != KMP_HW_UNKNOWN) {
- kind = __kmp_hw_get_keyword(__kmp_affinity_gran);
+
+ kmp_hw_t gran = __kmp_affinity.gran;
+ if (__kmp_affinity.gran != KMP_HW_UNKNOWN) {
+ gran = __kmp_affinity.gran;
+ } else {
+ gran = KMP_HW_CORE;
}
if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
@@ -3021,7 +3190,7 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
// Parse option count parameter in parentheses
if (*scan != '(') {
- KMP_WARNING(SyntaxErrorUsing, name, kind);
+ __kmp_places_syntax_error_fallback(name, gran);
return;
}
scan++; // skip '('
@@ -3035,7 +3204,7 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
SKIP_WS(scan);
if (*scan != ')') {
- KMP_WARNING(SyntaxErrorUsing, name, kind);
+ __kmp_places_syntax_error_fallback(name, gran);
return;
}
scan++; // skip ')'
@@ -3049,6 +3218,10 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name,
void *data) {
+ enum affinity_type type = __kmp_affinity.type;
+ const char *proclist = __kmp_affinity.proclist;
+ kmp_hw_t gran = __kmp_affinity.gran;
+
if (__kmp_env_format) {
KMP_STR_BUF_PRINT_NAME;
} else {
@@ -3058,28 +3231,53 @@ static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name,
(__kmp_nested_proc_bind.bind_types == NULL) ||
(__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
__kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
- } else if (__kmp_affinity_type == affinity_explicit) {
- if (__kmp_affinity_proclist != NULL) {
- __kmp_str_buf_print(buffer, "='%s'\n", __kmp_affinity_proclist);
+ } else if (type == affinity_explicit) {
+ if (proclist != NULL) {
+ __kmp_str_buf_print(buffer, "='%s'\n", proclist);
} else {
__kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
}
- } else if (__kmp_affinity_type == affinity_compact) {
+ } else if (type == affinity_compact) {
int num;
- if (__kmp_affinity_num_masks > 0) {
- num = __kmp_affinity_num_masks;
+ if (__kmp_affinity.num_masks > 0) {
+ num = __kmp_affinity.num_masks;
} else if (__kmp_affinity_num_places > 0) {
num = __kmp_affinity_num_places;
} else {
num = 0;
}
- if (__kmp_affinity_gran != KMP_HW_UNKNOWN) {
- const char *name = __kmp_hw_get_keyword(__kmp_affinity_gran, true);
- if (num > 0) {
- __kmp_str_buf_print(buffer, "='%s(%d)'\n", name, num);
- } else {
- __kmp_str_buf_print(buffer, "='%s'\n", name);
+ if (gran != KMP_HW_UNKNOWN) {
+ // If core_types or core_effs, just print and return
+ if (__kmp_affinity.flags.core_types_gran) {
+ __kmp_str_buf_print(buffer, "='%s'\n", "core_types");
+ return;
+ }
+ if (__kmp_affinity.flags.core_effs_gran) {
+ __kmp_str_buf_print(buffer, "='%s'\n", "core_effs");
+ return;
+ }
+
+ // threads, cores, sockets, cores:<attribute>, etc.
+ const char *name = __kmp_hw_get_keyword(gran, true);
+ __kmp_str_buf_print(buffer, "='%s", name);
+
+ // Add core attributes if it exists
+ if (__kmp_affinity.core_attr_gran.valid) {
+ kmp_hw_core_type_t ct =
+ (kmp_hw_core_type_t)__kmp_affinity.core_attr_gran.core_type;
+ int eff = __kmp_affinity.core_attr_gran.core_eff;
+ if (ct != KMP_HW_CORE_TYPE_UNKNOWN) {
+ const char *ct_name = __kmp_hw_get_core_type_keyword(ct);
+ __kmp_str_buf_print(buffer, ":%s", name, ct_name);
+ } else if (eff >= 0 && eff < KMP_HW_MAX_NUM_CORE_EFFS) {
+ __kmp_str_buf_print(buffer, ":eff%d", name, eff);
+ }
}
+
+ // Add the '(#)' part if it exists
+ if (num > 0)
+ __kmp_str_buf_print(buffer, "(%d)", num);
+ __kmp_str_buf_print(buffer, "'\n");
} else {
__kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
}
@@ -3306,7 +3504,7 @@ static void __kmp_stg_parse_proc_bind(char const *name, char const *value,
buf = next;
SKIP_WS(buf);
#if KMP_AFFINITY_SUPPORTED
- __kmp_affinity_type = affinity_disabled;
+ __kmp_affinity.type = affinity_disabled;
#endif /* KMP_AFFINITY_SUPPORTED */
__kmp_nested_proc_bind.used = 1;
__kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
@@ -3315,7 +3513,7 @@ static void __kmp_stg_parse_proc_bind(char const *name, char const *value,
buf = next;
SKIP_WS(buf);
#if KMP_AFFINITY_SUPPORTED
- __kmp_affinity_type = affinity_none;
+ __kmp_affinity.type = affinity_none;
#endif /* KMP_AFFINITY_SUPPORTED */
__kmp_nested_proc_bind.used = 1;
__kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
@@ -4175,8 +4373,8 @@ static void __kmp_stg_parse_omp_schedule(char const *name, char const *value,
void *data) {
size_t length;
const char *ptr = value;
- SKIP_WS(ptr);
- if (value) {
+ if (ptr) {
+ SKIP_WS(ptr);
length = KMP_STRLEN(value);
if (length) {
if (value[length - 1] == '"' || value[length - 1] == '\'')
@@ -4244,6 +4442,10 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer,
case kmp_sch_auto:
__kmp_str_buf_print(buffer, "%s,%d'\n", "auto", __kmp_chunk);
break;
+ default:
+ KMP_ASSERT2(false, "Unhandled sched_type enumeration");
+ KMP_BUILTIN_UNREACHABLE;
+ break;
}
} else {
switch (sched) {
@@ -4269,6 +4471,10 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer,
case kmp_sch_auto:
__kmp_str_buf_print(buffer, "%s'\n", "auto");
break;
+ default:
+ KMP_ASSERT2(false, "Unhandled sched_type enumeration");
+ KMP_BUILTIN_UNREACHABLE;
+ break;
}
}
} // __kmp_stg_print_omp_schedule
@@ -4683,9 +4889,6 @@ static void __kmp_stg_parse_spin_backoff_params(const char *name,
if (num <= 0) { // The number of retries should be > 0
msg = KMP_I18N_STR(ValueTooSmall);
num = 1;
- } else if (num > KMP_INT_MAX) {
- msg = KMP_I18N_STR(ValueTooLarge);
- num = KMP_INT_MAX;
}
if (msg != NULL) {
// Message is not empty. Print warning.
@@ -4782,9 +4985,6 @@ static void __kmp_stg_parse_adaptive_lock_props(const char *name,
if (num < 0) { // The number of retries should be >= 0
msg = KMP_I18N_STR(ValueTooSmall);
num = 1;
- } else if (num > KMP_INT_MAX) {
- msg = KMP_I18N_STR(ValueTooLarge);
- num = KMP_INT_MAX;
}
if (msg != NULL) {
// Message is not empty. Print warning.
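
The deletions in the two hunks above (spin backoff and adaptive lock parameters) remove checks that could never fire: num is a plain int here, so num > KMP_INT_MAX is always false and the ValueTooLarge branch was dead code. A one-line illustration of why the comparison is vacuous:

    #include <climits>
    #include <cstdio>

    int main() {
      int num = INT_MAX;                  // the largest value an int can hold
      std::printf("%d\n", num > INT_MAX); // prints 0 for every possible int num
      return 0;
    }
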
@@ -5078,21 +5278,6 @@ err:
return;
}
-static inline const char *
-__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) {
- switch (type) {
- case KMP_HW_CORE_TYPE_UNKNOWN:
- return "unknown";
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
- case KMP_HW_CORE_TYPE_ATOM:
- return "intel_atom";
- case KMP_HW_CORE_TYPE_CORE:
- return "intel_core";
-#endif
- }
- return "unknown";
-}
-
static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name,
void *data) {
kmp_str_buf_t buf;
@@ -5470,6 +5655,8 @@ static kmp_setting_t __kmp_stg_table[] = {
#if KMP_AFFINITY_SUPPORTED
{"KMP_AFFINITY", __kmp_stg_parse_affinity, __kmp_stg_print_affinity, NULL,
0, 0},
+ {"KMP_HIDDEN_HELPER_AFFINITY", __kmp_stg_parse_hh_affinity,
+ __kmp_stg_print_hh_affinity, NULL, 0, 0},
#ifdef KMP_GOMP_COMPAT
{"GOMP_CPU_AFFINITY", __kmp_stg_parse_gomp_cpu_affinity, NULL,
/* no print */ NULL, 0, 0},
@@ -5568,6 +5755,11 @@ static kmp_setting_t __kmp_stg_table[] = {
{"LIBOMP_NUM_HIDDEN_HELPER_THREADS",
__kmp_stg_parse_num_hidden_helper_threads,
__kmp_stg_print_num_hidden_helper_threads, NULL, 0, 0},
+#if OMPX_TASKGRAPH
+ {"KMP_MAX_TDGS", __kmp_stg_parse_max_tdgs, __kmp_std_print_max_tdgs, NULL,
+ 0, 0},
+ {"KMP_TDG_DOT", __kmp_stg_parse_tdg_dot, __kmp_stg_print_tdg_dot, NULL, 0, 0},
+#endif
#if OMPT_SUPPORT
{"OMP_TOOL", __kmp_stg_parse_omp_tool, __kmp_stg_print_omp_tool, NULL, 0,
@@ -5887,6 +6079,22 @@ static int __kmp_env_toPrint(char const *name, int flag) {
return rc;
}
+#if defined(KMP_DEBUG) && KMP_AFFINITY_SUPPORTED
+static void __kmp_print_affinity_settings(const kmp_affinity_t *affinity) {
+ K_DIAG(1, ("%s:\n", affinity->env_var));
+ K_DIAG(1, (" type : %d\n", affinity->type));
+ K_DIAG(1, (" compact : %d\n", affinity->compact));
+ K_DIAG(1, (" offset : %d\n", affinity->offset));
+ K_DIAG(1, (" verbose : %u\n", affinity->flags.verbose));
+ K_DIAG(1, (" warnings : %u\n", affinity->flags.warnings));
+ K_DIAG(1, (" respect : %u\n", affinity->flags.respect));
+ K_DIAG(1, (" reset : %u\n", affinity->flags.reset));
+ K_DIAG(1, (" dups : %u\n", affinity->flags.dups));
+ K_DIAG(1, (" gran : %d\n", (int)affinity->gran));
+ KMP_DEBUG_ASSERT(affinity->type != affinity_default);
+}
+#endif
+
static void __kmp_aux_env_initialize(kmp_env_blk_t *block) {
char const *value;
@@ -5900,7 +6108,13 @@ static void __kmp_aux_env_initialize(kmp_env_blk_t *block) {
/* KMP_BLOCKTIME */
value = __kmp_env_blk_var(block, "KMP_BLOCKTIME");
if (value) {
- kmpc_set_blocktime(__kmp_dflt_blocktime);
+ int gtid, tid;
+ kmp_info_t *thread;
+
+ gtid = __kmp_entry_gtid();
+ tid = __kmp_tid_from_gtid(gtid);
+ thread = __kmp_thread_from_gtid(gtid);
+ __kmp_aux_set_blocktime(__kmp_dflt_blocktime, thread, tid);
}
/* OMP_NESTED */
@@ -5973,9 +6187,9 @@ void __kmp_env_initialize(char const *string) {
// specifier, even as substrings.
//
// I can't find a case-insensitive version of strstr on Windows* OS.
- // Use the case-sensitive version for now.
+ // Use the case-sensitive version for now. AIX does the same.
-#if KMP_OS_WINDOWS
+#if KMP_OS_WINDOWS || KMP_OS_AIX
#define FIND strstr
#else
#define FIND strcasestr
@@ -5994,20 +6208,20 @@ void __kmp_env_initialize(char const *string) {
// A new affinity type is specified.
// Reset the affinity flags to their default values,
// in case this is called from kmp_set_defaults().
- __kmp_affinity_type = affinity_default;
- __kmp_affinity_gran = KMP_HW_UNKNOWN;
+ __kmp_affinity.type = affinity_default;
+ __kmp_affinity.gran = KMP_HW_UNKNOWN;
__kmp_affinity_top_method = affinity_top_method_default;
- __kmp_affinity_respect_mask = affinity_respect_mask_default;
+ __kmp_affinity.flags.respect = affinity_respect_mask_default;
}
#undef FIND
// Also reset the affinity flags if OMP_PROC_BIND is specified.
aff_str = __kmp_env_blk_var(&block, "OMP_PROC_BIND");
if (aff_str != NULL) {
- __kmp_affinity_type = affinity_default;
- __kmp_affinity_gran = KMP_HW_UNKNOWN;
+ __kmp_affinity.type = affinity_default;
+ __kmp_affinity.gran = KMP_HW_UNKNOWN;
__kmp_affinity_top_method = affinity_top_method_default;
- __kmp_affinity_respect_mask = affinity_respect_mask_default;
+ __kmp_affinity.flags.respect = affinity_respect_mask_default;
}
}
@@ -6083,12 +6297,12 @@ void __kmp_env_initialize(char const *string) {
__kmp_affinity_top_method == affinity_top_method_default)
if (__kmp_hw_subset->specified(KMP_HW_NUMA) ||
__kmp_hw_subset->specified(KMP_HW_TILE) ||
- __kmp_affinity_gran == KMP_HW_TILE ||
- __kmp_affinity_gran == KMP_HW_NUMA)
+ __kmp_affinity.gran == KMP_HW_TILE ||
+ __kmp_affinity.gran == KMP_HW_NUMA)
__kmp_affinity_top_method = affinity_top_method_hwloc;
// Force using hwloc when tiles or numa nodes requested for OMP_PLACES
- if (__kmp_affinity_gran == KMP_HW_NUMA ||
- __kmp_affinity_gran == KMP_HW_TILE)
+ if (__kmp_affinity.gran == KMP_HW_NUMA ||
+ __kmp_affinity.gran == KMP_HW_TILE)
__kmp_affinity_top_method = affinity_top_method_hwloc;
#endif
// Determine if the machine/OS is actually capable of supporting
@@ -6105,25 +6319,25 @@ void __kmp_env_initialize(char const *string) {
__kmp_affinity_top_method = affinity_top_method_all;
}
#endif
- if (__kmp_affinity_type == affinity_disabled) {
+ if (__kmp_affinity.type == affinity_disabled) {
KMP_AFFINITY_DISABLE();
} else if (!KMP_AFFINITY_CAPABLE()) {
__kmp_affinity_dispatch->determine_capable(var);
if (!KMP_AFFINITY_CAPABLE()) {
- if (__kmp_affinity_verbose ||
- (__kmp_affinity_warnings &&
- (__kmp_affinity_type != affinity_default) &&
- (__kmp_affinity_type != affinity_none) &&
- (__kmp_affinity_type != affinity_disabled))) {
+ if (__kmp_affinity.flags.verbose ||
+ (__kmp_affinity.flags.warnings &&
+ (__kmp_affinity.type != affinity_default) &&
+ (__kmp_affinity.type != affinity_none) &&
+ (__kmp_affinity.type != affinity_disabled))) {
KMP_WARNING(AffNotSupported, var);
}
- __kmp_affinity_type = affinity_disabled;
- __kmp_affinity_respect_mask = 0;
- __kmp_affinity_gran = KMP_HW_THREAD;
+ __kmp_affinity.type = affinity_disabled;
+ __kmp_affinity.flags.respect = FALSE;
+ __kmp_affinity.gran = KMP_HW_THREAD;
}
}
- if (__kmp_affinity_type == affinity_disabled) {
+ if (__kmp_affinity.type == affinity_disabled) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
} else if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_true) {
// OMP_PROC_BIND=true maps to OMP_PROC_BIND=spread.
@@ -6165,48 +6379,54 @@ void __kmp_env_initialize(char const *string) {
// processor groups, or if the user requested it, and OMP 4.0
// affinity is not in effect.
if (__kmp_num_proc_groups > 1 &&
- __kmp_affinity_type == affinity_default &&
+ __kmp_affinity.type == affinity_default &&
__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
// Do not respect the initial processor affinity mask if it is assigned
// exactly one Windows Processor Group since this is interpreted as the
// default OS assignment. Not respecting the mask allows the runtime to
// use all the logical processors in all groups.
- if (__kmp_affinity_respect_mask == affinity_respect_mask_default &&
+ if (__kmp_affinity.flags.respect == affinity_respect_mask_default &&
exactly_one_group) {
- __kmp_affinity_respect_mask = FALSE;
+ __kmp_affinity.flags.respect = FALSE;
}
// Use compact affinity with anticipation of pinning to at least the
// group granularity since threads can only be bound to one group.
- if (__kmp_affinity_type == affinity_default) {
- __kmp_affinity_type = affinity_compact;
+ if (__kmp_affinity.type == affinity_default) {
+ __kmp_affinity.type = affinity_compact;
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
}
+ if (__kmp_hh_affinity.type == affinity_default)
+ __kmp_hh_affinity.type = affinity_compact;
if (__kmp_affinity_top_method == affinity_top_method_default)
__kmp_affinity_top_method = affinity_top_method_all;
- if (__kmp_affinity_gran == KMP_HW_UNKNOWN)
- __kmp_affinity_gran = KMP_HW_PROC_GROUP;
+ if (__kmp_affinity.gran == KMP_HW_UNKNOWN)
+ __kmp_affinity.gran = KMP_HW_PROC_GROUP;
+ if (__kmp_hh_affinity.gran == KMP_HW_UNKNOWN)
+ __kmp_hh_affinity.gran = KMP_HW_PROC_GROUP;
} else
#endif /* KMP_GROUP_AFFINITY */
{
- if (__kmp_affinity_respect_mask == affinity_respect_mask_default) {
+ if (__kmp_affinity.flags.respect == affinity_respect_mask_default) {
#if KMP_GROUP_AFFINITY
if (__kmp_num_proc_groups > 1 && exactly_one_group) {
- __kmp_affinity_respect_mask = FALSE;
+ __kmp_affinity.flags.respect = FALSE;
} else
#endif /* KMP_GROUP_AFFINITY */
{
- __kmp_affinity_respect_mask = TRUE;
+ __kmp_affinity.flags.respect = TRUE;
}
}
if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
(__kmp_nested_proc_bind.bind_types[0] != proc_bind_default)) {
- if (__kmp_affinity_type == affinity_default) {
- __kmp_affinity_type = affinity_compact;
- __kmp_affinity_dups = FALSE;
+ if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)
+ __kmp_affinity.type = affinity_none;
+ if (__kmp_affinity.type == affinity_default) {
+ __kmp_affinity.type = affinity_compact;
+ __kmp_affinity.flags.dups = FALSE;
}
- } else if (__kmp_affinity_type == affinity_default) {
+ } else if (__kmp_affinity.type == affinity_default) {
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type != non_mic) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
@@ -6217,51 +6437,63 @@ void __kmp_env_initialize(char const *string) {
}
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type != non_mic) {
- __kmp_affinity_type = affinity_scatter;
+ __kmp_affinity.type = affinity_scatter;
+ } else
+#endif
+ {
+ __kmp_affinity.type = affinity_none;
+ }
+ }
+ if (__kmp_hh_affinity.type == affinity_default)
+ __kmp_hh_affinity.type = affinity_none;
+ if ((__kmp_affinity.gran == KMP_HW_UNKNOWN) &&
+ (__kmp_affinity.gran_levels < 0)) {
+#if KMP_MIC_SUPPORTED
+ if (__kmp_mic_type != non_mic) {
+ __kmp_affinity.gran = KMP_HW_THREAD;
} else
#endif
{
- __kmp_affinity_type = affinity_none;
+ __kmp_affinity.gran = KMP_HW_CORE;
}
}
- if ((__kmp_affinity_gran == KMP_HW_UNKNOWN) &&
- (__kmp_affinity_gran_levels < 0)) {
+ if ((__kmp_hh_affinity.gran == KMP_HW_UNKNOWN) &&
+ (__kmp_hh_affinity.gran_levels < 0)) {
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type != non_mic) {
- __kmp_affinity_gran = KMP_HW_THREAD;
+ __kmp_hh_affinity.gran = KMP_HW_THREAD;
} else
#endif
{
- __kmp_affinity_gran = KMP_HW_CORE;
+ __kmp_hh_affinity.gran = KMP_HW_CORE;
}
}
if (__kmp_affinity_top_method == affinity_top_method_default) {
__kmp_affinity_top_method = affinity_top_method_all;
}
}
+ } else {
+ // If affinity is disabled, then still need to assign topology method
+ // to attempt machine detection and affinity types
+ if (__kmp_affinity_top_method == affinity_top_method_default)
+ __kmp_affinity_top_method = affinity_top_method_all;
+ if (__kmp_affinity.type == affinity_default)
+ __kmp_affinity.type = affinity_disabled;
+ if (__kmp_hh_affinity.type == affinity_default)
+ __kmp_hh_affinity.type = affinity_disabled;
}
- K_DIAG(1, ("__kmp_affinity_type == %d\n", __kmp_affinity_type));
- K_DIAG(1, ("__kmp_affinity_compact == %d\n", __kmp_affinity_compact));
- K_DIAG(1, ("__kmp_affinity_offset == %d\n", __kmp_affinity_offset));
- K_DIAG(1, ("__kmp_affinity_verbose == %d\n", __kmp_affinity_verbose));
- K_DIAG(1, ("__kmp_affinity_warnings == %d\n", __kmp_affinity_warnings));
- K_DIAG(1, ("__kmp_affinity_respect_mask == %d\n",
- __kmp_affinity_respect_mask));
- K_DIAG(1, ("__kmp_affinity_gran == %d\n", __kmp_affinity_gran));
-
- KMP_DEBUG_ASSERT(__kmp_affinity_type != affinity_default);
+#ifdef KMP_DEBUG
+ for (const kmp_affinity_t *affinity : __kmp_affinities)
+ __kmp_print_affinity_settings(affinity);
KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.bind_types[0] != proc_bind_default);
K_DIAG(1, ("__kmp_nested_proc_bind.bind_types[0] == %d\n",
__kmp_nested_proc_bind.bind_types[0]));
+#endif
}
#endif /* KMP_AFFINITY_SUPPORTED */
- if (__kmp_version) {
- __kmp_print_version_1();
- }
-
// Post-initialization step: some env. vars need their value's further
// processing
if (string != NULL) { // kmp_set_defaults() was called
diff --git a/contrib/libs/cxxsupp/openmp/kmp_settings.h b/contrib/libs/cxxsupp/openmp/kmp_settings.h
index f63f105940e..92bbcff5241 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_settings.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_settings.h
@@ -24,7 +24,6 @@ void __kmp_env_dump();
int __kmp_initial_threads_capacity(int req_nproc);
void __kmp_init_dflt_team_nth();
-int __kmp_convert_to_milliseconds(char const *);
int __kmp_default_tp_capacity(int, int, int);
#if KMP_MIC
diff --git a/contrib/libs/cxxsupp/openmp/kmp_stats.h b/contrib/libs/cxxsupp/openmp/kmp_stats.h
index 0e3ea3b9cf8..1c6144b99b8 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_stats.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_stats.h
@@ -102,6 +102,7 @@ enum stats_state_e {
macro(OMP_BARRIER, 0, arg) \
macro(OMP_CRITICAL, 0, arg) \
macro(OMP_SINGLE, 0, arg) \
+ macro(OMP_SECTIONS, 0, arg) \
macro(OMP_MASTER, 0, arg) \
macro(OMP_MASKED, 0, arg) \
macro(OMP_TEAMS, 0, arg) \
@@ -150,6 +151,8 @@ enum stats_state_e {
macro (OMP_critical, 0, arg) \
macro (OMP_critical_wait, 0, arg) \
macro (OMP_single, 0, arg) \
+ macro (OMP_sections, 0, arg) \
+ macro (OMP_sections_overhead, 0, arg) \
macro (OMP_master, 0, arg) \
macro (OMP_masked, 0, arg) \
macro (OMP_task_immediate, 0, arg) \
@@ -593,7 +596,7 @@ public:
*MORE ON NEST_LEVEL*
The nest level is used in the bar graph that represents the timeline.
- Its main purpose is for showing how events are nested inside eachother.
+ Its main purpose is for showing how events are nested inside each other.
For example, say events, A, B, and C are recorded. If the timeline
looks like this:
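
kmp_stats.h builds its counter and timer enums, name tables and report rows from these X-macro lists, so adding the OMP_SECTIONS / OMP_sections / OMP_sections_overhead entries above is all that is needed to register the new statistics. A compressed sketch of the X-macro technique (the list and the two consumers below are illustrative, not the actual KMP_FOREACH_* macros):

    #include <cstdio>

    // One list, expanded twice: once for the enum, once for the name strings.
    #define FOREACH_TIMER(macro, arg)                                          \
      macro(OMP_single, 0, arg)                                                \
      macro(OMP_sections, 0, arg)                                              \
      macro(OMP_sections_overhead, 0, arg)

    #define ENUM_ROW(name, flags, arg) TIMER_##name,
    #define NAME_ROW(name, flags, arg) #name,

    enum timer_id { FOREACH_TIMER(ENUM_ROW, 0) TIMER_LAST };
    static const char *timer_names[] = {FOREACH_TIMER(NAME_ROW, 0)};

    int main() {
      for (int i = 0; i < TIMER_LAST; ++i)
        std::printf("%d %s\n", i, timer_names[i]);
      return 0;
    }
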
diff --git a/contrib/libs/cxxsupp/openmp/kmp_str.cpp b/contrib/libs/cxxsupp/openmp/kmp_str.cpp
index e64f989fbc6..6ee2df72448 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_str.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_str.cpp
@@ -137,8 +137,8 @@ void __kmp_str_buf_cat(kmp_str_buf_t *buffer, char const *str, size_t len) {
KMP_DEBUG_ASSERT(len >= 0);
__kmp_str_buf_reserve(buffer, buffer->used + len + 1);
- KMP_MEMCPY(buffer->str + buffer->used, str, len);
- buffer->str[buffer->used + len] = 0;
+ buffer->str[buffer->used] = '\0';
+ KMP_STRNCAT_S(buffer->str + buffer->used, len + 1, str, len);
__kmp_type_convert(buffer->used + len, &(buffer->used));
KMP_STR_BUF_INVARIANT(buffer);
} // __kmp_str_buf_cat
@@ -151,8 +151,8 @@ void __kmp_str_buf_catbuf(kmp_str_buf_t *dest, const kmp_str_buf_t *src) {
if (!src->str || !src->used)
return;
__kmp_str_buf_reserve(dest, dest->used + src->used + 1);
- KMP_MEMCPY(dest->str + dest->used, src->str, src->used);
- dest->str[dest->used + src->used] = 0;
+ dest->str[dest->used] = '\0';
+ KMP_STRNCAT_S(dest->str + dest->used, src->used + 1, src->str, src->used);
dest->used += src->used;
KMP_STR_BUF_INVARIANT(dest);
} // __kmp_str_buf_catbuf
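
Both __kmp_str_buf_cat and __kmp_str_buf_catbuf now terminate the destination first and then append through KMP_STRNCAT_S with an explicit destination size, presumably to route all string copies through the bounded *_s wrappers; the behaviour matches the old KMP_MEMCPY plus manual terminator. The equivalent pattern in standard C++ (plain strncat is used so the sketch builds without Annex K):

    #include <cstdio>
    #include <cstring>

    // Append len bytes of src at dest + used; the caller must have reserved
    // at least used + len + 1 bytes. Terminating first gives strncat a valid
    // (empty) destination string to append to, and strncat re-terminates.
    static void buf_cat(char *dest, std::size_t used, const char *src,
                        std::size_t len) {
      dest[used] = '\0';
      std::strncat(dest + used, src, len);
    }

    int main() {
      char buf[32] = "KMP_";
      buf_cat(buf, 4, "BLOCKTIME=200ms", 15);
      std::printf("%s\n", buf); // KMP_BLOCKTIME=200ms
      return 0;
    }
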
@@ -619,6 +619,21 @@ char *__kmp_str_token(
return token;
} // __kmp_str_token
+int __kmp_basic_str_to_int(char const *str) {
+ int result;
+ char const *t;
+
+ result = 0;
+
+ for (t = str; *t != '\0'; ++t) {
+ if (*t < '0' || *t > '9')
+ break;
+ result = (result * 10) + (*t - '0');
+ }
+
+ return result;
+}
+
int __kmp_str_to_int(char const *str, char sentinel) {
int result, factor;
char const *t;
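
__kmp_basic_str_to_int is a deliberately minimal decimal reader: it accumulates digits from the start of the string and stops at the first non-digit, with no sign, sentinel or overflow handling (the KMP_BLOCKTIME parser only calls it after checking that the value starts with a digit). A standalone copy of that logic with the expected results:

    #include <cstdio>

    // Same logic as __kmp_basic_str_to_int in the hunk above.
    static int basic_str_to_int(const char *str) {
      int result = 0;
      for (const char *t = str; *t >= '0' && *t <= '9'; ++t)
        result = result * 10 + (*t - '0');
      return result;
    }

    int main() {
      std::printf("%d\n", basic_str_to_int("200ms")); // 200 (stops at 'm')
      std::printf("%d\n", basic_str_to_int("42"));    // 42
      std::printf("%d\n", basic_str_to_int("ms"));    // 0 (no leading digits)
      return 0;
    }
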
diff --git a/contrib/libs/cxxsupp/openmp/kmp_str.h b/contrib/libs/cxxsupp/openmp/kmp_str.h
index 855b5df55d6..11f633cd802 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_str.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_str.h
@@ -112,6 +112,7 @@ int __kmp_str_match_true(char const *data);
void __kmp_str_replace(char *str, char search_for, char replace_with);
void __kmp_str_split(char *str, char delim, char **head, char **tail);
char *__kmp_str_token(char *str, char const *delim, char **buf);
+int __kmp_basic_str_to_int(char const *str);
int __kmp_str_to_int(char const *str, char sentinel);
void __kmp_str_to_size(char const *str, size_t *out, size_t dfactor,
diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp
index 6c1d93a8918..39cf3496c5a 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp
@@ -30,7 +30,7 @@
// TODO: Any ITT support needed?
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
-static std::atomic<kmp_int32> kmp_node_id_seed = ATOMIC_VAR_INIT(0);
+static std::atomic<kmp_int32> kmp_node_id_seed = 0;
#endif
static void __kmp_init_node(kmp_depnode_t *node) {
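
The ATOMIC_VAR_INIT change is a straightforward C++ modernization: the macro was deprecated in C++20, and std::atomic<T> can simply be initialized from a value. For example:

    #include <atomic>

    // Old spelling, deprecated since C++20:
    //   static std::atomic<int> seed = ATOMIC_VAR_INIT(0);
    // Direct initialization has worked since C++11; the copy-initialization
    // form used above additionally relies on C++17 guaranteed copy elision.
    static std::atomic<int> seed{0};

    int main() { return seed.fetch_add(1); }
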
@@ -218,6 +218,44 @@ static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread,
static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source,
kmp_depnode_t *sink,
kmp_task_t *sink_task) {
+#if OMPX_TASKGRAPH
+ kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
+ kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
+ if (source->dn.task && sink_task) {
+ // Not supporting dependency between two tasks that one is within the TDG
+ // and the other is not
+ KMP_ASSERT(task_source->is_taskgraph == task_sink->is_taskgraph);
+ }
+ if (task_sink->is_taskgraph &&
+ __kmp_tdg_is_recording(task_sink->tdg->tdg_status)) {
+ kmp_node_info_t *source_info =
+ &task_sink->tdg->record_map[task_source->td_task_id];
+ bool exists = false;
+ for (int i = 0; i < source_info->nsuccessors; i++) {
+ if (source_info->successors[i] == task_sink->td_task_id) {
+ exists = true;
+ break;
+ }
+ }
+ if (!exists) {
+ if (source_info->nsuccessors >= source_info->successors_size) {
+ source_info->successors_size = 2 * source_info->successors_size;
+ kmp_int32 *old_succ_ids = source_info->successors;
+ kmp_int32 *new_succ_ids = (kmp_int32 *)__kmp_allocate(
+ source_info->successors_size * sizeof(kmp_int32));
+ source_info->successors = new_succ_ids;
+ __kmp_free(old_succ_ids);
+ }
+
+ source_info->successors[source_info->nsuccessors] = task_sink->td_task_id;
+ source_info->nsuccessors++;
+
+ kmp_node_info_t *sink_info =
+ &(task_sink->tdg->record_map[task_sink->td_task_id]);
+ sink_info->npredecessors++;
+ }
+ }
+#endif
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
// do not use sink->dn.task as that is only filled after the dependences
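
The OMPX_TASKGRAPH recording block above grows a node's successors array by
doubling successors_size, allocating a new block and releasing the old one. A
generic grow-by-doubling sketch of that policy is shown below with illustrative
types; unlike the hunk above it also copies the existing entries into the new
array, as the record_map extension later in this diff does.

#include <cstdint>
#include <cstring>

struct node_successors {
  int32_t *ids = nullptr;  // successor task ids
  int32_t count = 0;       // entries in use
  int32_t capacity = 0;    // allocated entries
};

// Append one successor id, doubling the backing array when it is full.
static void add_successor(node_successors &n, int32_t id) {
  if (n.count >= n.capacity) {
    int32_t new_cap = n.capacity ? 2 * n.capacity : 4;
    int32_t *grown = new int32_t[new_cap];
    if (n.ids) {
      std::memcpy(grown, n.ids, n.count * sizeof(int32_t));
      delete[] n.ids;
    }
    n.ids = grown;
    n.capacity = new_cap;
  }
  n.ids[n.count++] = id;
}
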
@@ -246,6 +284,16 @@ static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source,
#endif /* OMPT_SUPPORT && OMPT_OPTIONAL */
}
+kmp_base_depnode_t *__kmpc_task_get_depnode(kmp_task_t *task) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ return td->td_depnode ? &(td->td_depnode->dn) : NULL;
+}
+
+kmp_depnode_list_t *__kmpc_task_get_successors(kmp_task_t *task) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ return td->td_depnode->dn.successors;
+}
+
static inline kmp_int32
__kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
kmp_task_t *task, kmp_depnode_t *node,
@@ -256,16 +304,31 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
// link node as successor of list elements
for (kmp_depnode_list_t *p = plist; p; p = p->next) {
kmp_depnode_t *dep = p->node;
+#if OMPX_TASKGRAPH
+ kmp_tdg_status tdg_status = KMP_TDG_NONE;
+ if (task) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ if (td->is_taskgraph)
+ tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status;
+ if (__kmp_tdg_is_recording(tdg_status))
+ __kmp_track_dependence(gtid, dep, node, task);
+ }
+#endif
if (dep->dn.task) {
KMP_ACQUIRE_DEPNODE(gtid, dep);
if (dep->dn.task) {
- __kmp_track_dependence(gtid, dep, node, task);
- dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
- KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
- "%p\n",
- gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
- KMP_TASK_TO_TASKDATA(task)));
- npredecessors++;
+ if (!dep->dn.successors || dep->dn.successors->node != node) {
+#if OMPX_TASKGRAPH
+ if (!(__kmp_tdg_is_recording(tdg_status)) && task)
+#endif
+ __kmp_track_dependence(gtid, dep, node, task);
+ dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
+ KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
+ "%p\n",
+ gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
+ KMP_TASK_TO_TASKDATA(task)));
+ npredecessors++;
+ }
}
KMP_RELEASE_DEPNODE(gtid, dep);
}
@@ -273,6 +336,7 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
return npredecessors;
}
+// Add the edge 'sink' -> 'source' in the task dependency graph
static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
kmp_info_t *thread,
kmp_task_t *task,
@@ -281,17 +345,45 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
if (!sink)
return 0;
kmp_int32 npredecessors = 0;
+#if OMPX_TASKGRAPH
+ kmp_tdg_status tdg_status = KMP_TDG_NONE;
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ if (task) {
+ if (td->is_taskgraph)
+ tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status;
+ if (__kmp_tdg_is_recording(tdg_status) && sink->dn.task)
+ __kmp_track_dependence(gtid, sink, source, task);
+ }
+#endif
if (sink->dn.task) {
    // synchronously add source to sink's list of successors
KMP_ACQUIRE_DEPNODE(gtid, sink);
if (sink->dn.task) {
- __kmp_track_dependence(gtid, sink, source, task);
- sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
- KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
+ if (!sink->dn.successors || sink->dn.successors->node != source) {
+#if OMPX_TASKGRAPH
+ if (!(__kmp_tdg_is_recording(tdg_status)) && task)
+#endif
+ __kmp_track_dependence(gtid, sink, source, task);
+ sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
+ KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
"%p\n",
gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
KMP_TASK_TO_TASKDATA(task)));
+#if OMPX_TASKGRAPH
+ if (__kmp_tdg_is_recording(tdg_status)) {
+ kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task);
+ if (tdd->is_taskgraph) {
+ if (tdd->td_flags.onced)
+ // decrement npredecessors if sink->dn.task belongs to a taskgraph
+ // and
+ // 1) the task is reset to its initial state (by kmp_free_task) or
+ // 2) the task is complete but not yet reset
+ npredecessors--;
+ }
+ }
+#endif
npredecessors++;
+ }
}
KMP_RELEASE_DEPNODE(gtid, sink);
}
@@ -595,6 +687,48 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *current_task = thread->th.th_current_task;
+#if OMPX_TASKGRAPH
+ // record TDG with deps
+ if (new_taskdata->is_taskgraph &&
+ __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
+ kmp_tdg_info_t *tdg = new_taskdata->tdg;
+ // extend record_map if needed
+ if (new_taskdata->td_task_id >= tdg->map_size) {
+ __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
+ if (new_taskdata->td_task_id >= tdg->map_size) {
+ kmp_uint old_size = tdg->map_size;
+ kmp_uint new_size = old_size * 2;
+ kmp_node_info_t *old_record = tdg->record_map;
+ kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
+ new_size * sizeof(kmp_node_info_t));
+ KMP_MEMCPY(new_record, tdg->record_map,
+ old_size * sizeof(kmp_node_info_t));
+ tdg->record_map = new_record;
+
+ __kmp_free(old_record);
+
+ for (kmp_int i = old_size; i < new_size; i++) {
+ kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
+ __kmp_successors_size * sizeof(kmp_int32));
+ new_record[i].task = nullptr;
+ new_record[i].successors = successorsList;
+ new_record[i].nsuccessors = 0;
+ new_record[i].npredecessors = 0;
+ new_record[i].successors_size = __kmp_successors_size;
+ KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
+ }
+        // update the size last, so that no other thread can see the new
+        // map_size while record_map still points to old_record
+ tdg->map_size = new_size;
+ }
+ __kmp_release_bootstrap_lock(&tdg->graph_lock);
+ }
+ tdg->record_map[new_taskdata->td_task_id].task = new_task;
+ tdg->record_map[new_taskdata->td_task_id].parent_task =
+ new_taskdata->td_parent;
+ KMP_ATOMIC_INC(&tdg->num_tasks);
+ }
+#endif
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
if (!current_task->ompt_task_info.frame.enter_frame.ptr)
@@ -605,7 +739,7 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
&(current_task->ompt_task_info.task_data),
&(current_task->ompt_task_info.frame),
&(new_taskdata->ompt_task_info.task_data),
- ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1,
+ TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1,
OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
}
@@ -626,7 +760,9 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
for (i = 0; i < ndeps; i++) {
ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr;
- if (dep_list[i].flags.in && dep_list[i].flags.out)
+ if (dep_list[i].base_addr == KMP_SIZE_T_MAX)
+ ompt_deps[i].dependence_type = ompt_dependence_type_out_all_memory;
+ else if (dep_list[i].flags.in && dep_list[i].flags.out)
ompt_deps[i].dependence_type = ompt_dependence_type_inout;
else if (dep_list[i].flags.out)
ompt_deps[i].dependence_type = ompt_dependence_type_out;
@@ -636,10 +772,15 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset;
else if (dep_list[i].flags.set)
ompt_deps[i].dependence_type = ompt_dependence_type_inoutset;
+ else if (dep_list[i].flags.all)
+ ompt_deps[i].dependence_type = ompt_dependence_type_out_all_memory;
}
for (i = 0; i < ndeps_noalias; i++) {
ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr;
- if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out)
+ if (noalias_dep_list[i].base_addr == KMP_SIZE_T_MAX)
+ ompt_deps[ndeps + i].dependence_type =
+ ompt_dependence_type_out_all_memory;
+ else if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out)
ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout;
else if (noalias_dep_list[i].flags.out)
ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out;
@@ -650,6 +791,9 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
ompt_dependence_type_mutexinoutset;
else if (noalias_dep_list[i].flags.set)
ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset;
+ else if (noalias_dep_list[i].flags.all)
+ ompt_deps[ndeps + i].dependence_type =
+ ompt_dependence_type_out_all_memory;
}
ompt_callbacks.ompt_callback(ompt_callback_dependences)(
&(new_taskdata->ompt_task_info.task_data), ompt_deps, ompt_ndeps);
@@ -744,10 +888,24 @@ Blocks the current task until all specified dependences have been fulfilled.
void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
kmp_depend_info_t *noalias_dep_list) {
- KA_TRACE(10, ("__kmpc_omp_wait_deps(enter): T#%d loc=%p\n", gtid, loc_ref));
+ __kmpc_omp_taskwait_deps_51(loc_ref, gtid, ndeps, dep_list, ndeps_noalias,
+ noalias_dep_list, false);
+}
+/* __kmpc_omp_taskwait_deps_51 : Entry point for the OpenMP 5.1 taskwait with
+                                 the nowait clause; the nowait handling is
+                                 still a placeholder. The earlier body of
+                                 __kmpc_omp_wait_deps() now lives in this
+                                 function. */
+void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+ kmp_int32 ndeps_noalias,
+ kmp_depend_info_t *noalias_dep_list,
+ kmp_int32 has_no_wait) {
+ KA_TRACE(10, ("__kmpc_omp_taskwait_deps(enter): T#%d loc=%p nowait#%d\n",
+ gtid, loc_ref, has_no_wait));
if (ndeps == 0 && ndeps_noalias == 0) {
- KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no dependences to "
+ KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no dependences to "
"wait upon : loc=%p\n",
gtid, loc_ref));
return;
@@ -839,7 +997,7 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
ignore = ignore || current_task->td_dephash == NULL;
if (ignore) {
- KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
+ KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no blocking "
"dependences : loc=%p\n",
gtid, loc_ref));
#if OMPT_SUPPORT
@@ -854,7 +1012,7 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
noalias_dep_list)) {
- KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
+ KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no blocking "
"dependences : loc=%p\n",
gtid, loc_ref));
#if OMPT_SUPPORT
@@ -872,9 +1030,16 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
__kmp_task_stealing_constraint);
}
+ // Wait until the last __kmp_release_deps is finished before we free the
+ // current stack frame holding the "node" variable; once its nrefs count
+ // reaches 1, we're sure nobody else can try to reference it again.
+ while (node.dn.nrefs > 1)
+ KMP_YIELD(TRUE);
+
#if OMPT_SUPPORT
__ompt_taskwait_dep_finish(current_task, taskwait_task_data);
#endif /* OMPT_SUPPORT */
- KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n",
+ KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d finished waiting : loc=%p\
+ \n",
gtid, loc_ref));
}
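
The wait loop added near the end of __kmpc_omp_taskwait_deps_51 spins until the
on-stack depnode's nrefs count drops back to 1, so the frame holding the node
cannot go away while another thread still references it. A minimal sketch of
that shape, with std::atomic and std::this_thread::yield standing in for the
runtime's counter and KMP_YIELD:

#include <atomic>
#include <thread>

struct depnode_stub {
  std::atomic<int> nrefs{1}; // 1 == only the owning stack frame references it
};

// Block until every other reference has been dropped; after this returns it
// is safe to let the stack frame that owns 'node' go out of scope.
static void wait_for_sole_ownership(depnode_stub &node) {
  while (node.nrefs.load(std::memory_order_acquire) > 1)
    std::this_thread::yield();
}
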
diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h
index ac6174afd3f..d2ab5151580 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h
@@ -92,6 +92,23 @@ static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) {
extern void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start);
static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
+
+#if OMPX_TASKGRAPH
+ if (task->is_taskgraph && !(__kmp_tdg_is_recording(task->tdg->tdg_status))) {
+ kmp_node_info_t *TaskInfo = &(task->tdg->record_map[task->td_task_id]);
+
+ for (int i = 0; i < TaskInfo->nsuccessors; i++) {
+ kmp_int32 successorNumber = TaskInfo->successors[i];
+ kmp_node_info_t *successor = &(task->tdg->record_map[successorNumber]);
+ kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->npredecessors_counter) - 1;
+ if (successor->task != nullptr && npredecessors == 0) {
+ __kmp_omp_task(gtid, successor->task, false);
+ }
+ }
+ return;
+ }
+#endif
+
kmp_info_t *thread = __kmp_threads[gtid];
kmp_depnode_t *node = task->td_depnode;
@@ -120,8 +137,12 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
gtid, task));
KMP_ACQUIRE_DEPNODE(gtid, node);
- node->dn.task =
- NULL; // mark this task as finished, so no new dependencies are generated
+#if OMPX_TASKGRAPH
+ if (!task->is_taskgraph ||
+ (task->is_taskgraph && !__kmp_tdg_is_recording(task->tdg->tdg_status)))
+#endif
+ node->dn.task =
+ NULL; // mark this task as finished, so no new dependencies are generated
KMP_RELEASE_DEPNODE(gtid, node);
kmp_depnode_list_t *next;
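
In the OMPX_TASKGRAPH replay path added to __kmp_release_deps above, finishing
a task atomically decrements each recorded successor's predecessor counter and
re-enqueues the successor once that counter reaches zero. A self-contained
sketch of that release rule; the node type is hypothetical and a plain callback
stands in for __kmp_omp_task.

#include <atomic>
#include <cstdint>
#include <functional>
#include <vector>

struct replay_node {
  std::vector<int32_t> successors;       // indices into the node array
  std::atomic<int32_t> npredecessors{0}; // unfinished predecessors remaining
};

// When node 'finished' completes, schedule every successor whose last
// unfinished predecessor it was.
static void release_successors(std::vector<replay_node> &nodes,
                               int32_t finished,
                               const std::function<void(int32_t)> &schedule) {
  for (int32_t s : nodes[finished].successors) {
    int32_t remaining =
        nodes[s].npredecessors.fetch_sub(1, std::memory_order_acq_rel) - 1;
    if (remaining == 0)
      schedule(s);
  }
}
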
diff --git a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp
index 1622c6aea10..932799e133b 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp
@@ -21,6 +21,14 @@
#include "ompt-specific.h"
#endif
+#if ENABLE_LIBOMPTARGET
+static void (*tgt_target_nowait_query)(void **);
+
+void __kmp_init_target_task() {
+ *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
+}
+#endif
+
/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
kmp_info_t *this_thr);
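
The ENABLE_LIBOMPTARGET block above resolves __tgt_target_nowait_query lazily
through KMP_DLSYM and only calls through the pointer when it is non-null. A
generic sketch of that optional-symbol pattern with plain POSIX dlsym; the
symbol name is reused here purely for illustration, and RTLD_DEFAULT may need
_GNU_SOURCE and -ldl on some platforms.

#include <dlfcn.h>

// Function pointer resolved at runtime; stays null when the library that
// provides the symbol (e.g. an offload plugin) was never loaded.
static void (*nowait_query)(void **) = nullptr;

static void init_optional_symbol() {
  // Search the images already loaded into the process, as KMP_DLSYM does.
  *(void **)(&nowait_query) = dlsym(RTLD_DEFAULT, "__tgt_target_nowait_query");
}

static void maybe_query(void **handle) {
  if (nowait_query) // only call when the symbol was actually found
    nowait_query(handle);
}
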
@@ -29,6 +37,10 @@ static void __kmp_alloc_task_deque(kmp_info_t *thread,
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
+#if OMPX_TASKGRAPH
+static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
+int __kmp_taskloop_task(int gtid, void *ptask);
+#endif
#ifdef BUILD_TIED_TASK_STACK
@@ -273,7 +285,11 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
}
// Check mutexinoutset dependencies, acquire locks
kmp_depnode_t *node = tasknew->td_depnode;
+#if OMPX_TASKGRAPH
+ if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
+#else
if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
+#endif
for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
@@ -795,8 +811,7 @@ static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
&(parent_info->task_data), &(parent_info->frame),
&(taskdata->ompt_task_info.task_data),
- ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
- return_address);
+ TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
}
__ompt_task_start(task, current_task, gtid);
}
@@ -823,6 +838,14 @@ static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
+#ifdef __s390x__
+// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
+// In order for it to work correctly, the caller also needs to be compiled with
+// backchain. If a caller is compiled without backchain,
+// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
+// crash.
+__attribute__((target("backchain")))
+#endif
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task) {
#if OMPT_SUPPORT
@@ -880,12 +903,34 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
task->data2.priority = 0;
taskdata->td_flags.freed = 1;
+#if OMPX_TASKGRAPH
+ // do not free tasks in taskgraph
+ if (!taskdata->is_taskgraph) {
+#endif
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
__kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
__kmp_thread_free(thread, taskdata);
#endif
+#if OMPX_TASKGRAPH
+ } else {
+ taskdata->td_flags.complete = 0;
+ taskdata->td_flags.started = 0;
+ taskdata->td_flags.freed = 0;
+ taskdata->td_flags.executing = 0;
+ taskdata->td_flags.task_serial =
+ (taskdata->td_parent->td_flags.final ||
+ taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
+
+ // taskdata->td_allow_completion_event.pending_events_count = 1;
+ KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
+ KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
+ // start at one because counts current task and children
+ KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
+ }
+#endif
+
KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
@@ -972,6 +1017,10 @@ static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
ret = ret ||
KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
+#if OMPX_TASKGRAPH
+ if (taskdata->td_taskgroup && taskdata->is_taskgraph)
+ ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
+#endif
return ret;
}
@@ -991,6 +1040,10 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
kmp_info_t *thread = __kmp_threads[gtid];
kmp_task_team_t *task_team =
thread->th.th_task_team; // might be NULL for serial teams...
+#if OMPX_TASKGRAPH
+  // cached to avoid a segfault when taskdata->td_flags is accessed after the
+  // task has been freed (e.g. by a vanilla taskloop)
+ bool is_taskgraph;
+#endif
#if KMP_DEBUG
kmp_int32 children = 0;
#endif
@@ -1000,6 +1053,10 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
+#if OMPX_TASKGRAPH
+ is_taskgraph = taskdata->is_taskgraph;
+#endif
+
// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
if (taskdata->td_flags.tiedness == TASK_TIED) {
@@ -1063,7 +1120,7 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
- bool detach = false;
+ bool completed = true;
if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
if (taskdata->td_allow_completion_event.type ==
KMP_EVENT_ALLOW_COMPLETION) {
@@ -1087,14 +1144,33 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
// __kmp_fulfill_event might free taskdata at any time from now
taskdata->td_flags.proxy = TASK_PROXY; // proxify!
- detach = true;
+ completed = false;
}
__kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
}
}
- if (!detach) {
+ // Tasks with valid target async handles must be re-enqueued.
+ if (taskdata->td_target_data.async_handle != NULL) {
+ // Note: no need to translate gtid to its shadow. If the current thread is a
+ // hidden helper one, then the gtid is already correct. Otherwise, hidden
+  // helper threads are disabled, and gtid refers to an OpenMP thread.
+#if OMPT_SUPPORT
+ if (ompt) {
+ __ompt_task_finish(task, resumed_task, ompt_task_switch);
+ }
+#endif
+ __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
+ if (KMP_HIDDEN_HELPER_THREAD(gtid))
+ __kmp_hidden_helper_worker_thread_signal();
+ completed = false;
+ }
+
+ if (completed) {
taskdata->td_flags.complete = 1; // mark the task as completed
+#if OMPX_TASKGRAPH
+ taskdata->td_flags.onced = 1; // mark the task as ran once already
+#endif
#if OMPT_SUPPORT
// This is not a detached task, we are done here
@@ -1111,7 +1187,11 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
#endif
KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
KMP_DEBUG_ASSERT(children >= 0);
+#if OMPX_TASKGRAPH
+ if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
+#else
if (taskdata->td_taskgroup)
+#endif
KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
} else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
task_team->tt.tt_hidden_helper_task_encountered)) {
@@ -1125,6 +1205,13 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
// function
KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
taskdata->td_flags.executing = 0; // suspend the finishing task
+
+ // Decrement the counter of hidden helper tasks to be executed.
+ if (taskdata->td_flags.hidden_helper) {
+ // Hidden helper tasks can only be executed by hidden helper threads.
+ KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
+ KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
+ }
}
KA_TRACE(
@@ -1136,13 +1223,26 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
// johnmc: if an asynchronous inquiry peers into the runtime system
// it doesn't see the freed task as the current task.
thread->th.th_current_task = resumed_task;
- if (!detach)
+ if (completed)
__kmp_free_task_and_ancestors(gtid, taskdata, thread);
// TODO: GEH - make sure root team implicit task is initialized properly.
// KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
resumed_task->td_flags.executing = 1; // resume previous task
+#if OMPX_TASKGRAPH
+ if (is_taskgraph && __kmp_track_children_task(taskdata) &&
+ taskdata->td_taskgroup) {
+    // TDG: we only release the taskgroup barrier here because
+ // free_task_and_ancestors will call
+ // __kmp_free_task, which resets all task parameters such as
+ // taskdata->started, etc. If we release the barrier earlier, these
+ // parameters could be read before being reset. This is not an issue for
+    // the non-TDG implementation because we never reuse a task(data) structure
+ KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
+ }
+#endif
+
KA_TRACE(
10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
gtid, taskdata, resumed_task));
@@ -1259,6 +1359,9 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
task->td_flags.executing = 1;
task->td_flags.complete = 0;
task->td_flags.freed = 0;
+#if OMPX_TASKGRAPH
+ task->td_flags.onced = 0;
+#endif
task->td_depnode = NULL;
task->td_last_tied = task;
@@ -1295,6 +1398,9 @@ void __kmp_finish_implicit_task(kmp_info_t *thread) {
if (task->td_dephash) {
int children;
task->td_flags.complete = 1;
+#if OMPX_TASKGRAPH
+ task->td_flags.onced = 1;
+#endif
children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
kmp_tasking_flags_t flags_old = task->td_flags;
if (children == 0 && flags_old.complete == 1) {
@@ -1409,8 +1515,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
KA_TRACE(30,
("T#%d creating task team in __kmp_task_alloc for proxy task\n",
gtid));
- // 1 indicates setup the current team regardless of nthreads
- __kmp_task_team_setup(thread, team, 1);
+ __kmp_task_team_setup(thread, team);
thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
}
kmp_task_team_t *task_team = thread->th.th_task_team;
@@ -1460,7 +1565,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
task = KMP_TASKDATA_TO_TASK(taskdata);
// Make sure task & taskdata are aligned appropriately
-#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
+#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
@@ -1524,7 +1629,9 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
taskdata->td_flags.executing = 0;
taskdata->td_flags.complete = 0;
taskdata->td_flags.freed = 0;
-
+#if OMPX_TASKGRAPH
+ taskdata->td_flags.onced = 0;
+#endif
KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
// start at one because counts current task and children
KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
@@ -1532,6 +1639,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
parent_task->td_taskgroup; // task inherits taskgroup from the parent task
taskdata->td_dephash = NULL;
taskdata->td_depnode = NULL;
+ taskdata->td_target_data.async_handle = NULL;
if (flags->tiedness == TASK_UNTIED)
taskdata->td_last_tied = NULL; // will be set when the task is scheduled
else
@@ -1559,6 +1667,15 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
}
}
+#if OMPX_TASKGRAPH
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
+ if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
+ (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
+ taskdata->is_taskgraph = 1;
+ taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
+ taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
+ }
+#endif
KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
gtid, taskdata, taskdata->td_parent));
@@ -1598,6 +1715,7 @@ kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
// target task is untied defined in the specification
input_flags.tiedness = TASK_UNTIED;
+ input_flags.target = 1;
if (__kmp_enable_hidden_helper)
input_flags.hidden_helper = TRUE;
@@ -1631,8 +1749,12 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
-static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
- kmp_taskdata_t *current_task) {
+#ifdef __s390x__
+__attribute__((target("backchain")))
+#endif
+static void
+__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
+ kmp_taskdata_t *current_task) {
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
kmp_info_t *thread;
int discard = 0 /* false */;
@@ -1674,13 +1796,6 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
}
#endif
- // Decreament the counter of hidden helper tasks to be executed
- if (taskdata->td_flags.hidden_helper) {
- // Hidden helper tasks can only be executed by hidden helper threads
- KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
- KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
- }
-
// Proxy tasks are not handled by the runtime
if (taskdata->td_flags.proxy != TASK_PROXY) {
__kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
@@ -1783,6 +1898,15 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif
+#if ENABLE_LIBOMPTARGET
+ if (taskdata->td_target_data.async_handle != NULL) {
+ // If we have a valid target async handle, that means that we have already
+ // executed the task routine once. We must query for the handle completion
+ // instead of re-executing the routine.
+ KMP_ASSERT(tgt_target_nowait_query);
+ tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
+ } else
+#endif
if (task->routine != NULL) {
#ifdef KMP_GOMP_COMPAT
if (taskdata->td_flags.native) {
@@ -1823,6 +1947,11 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
#endif
__kmp_task_finish<false>(gtid, task, current_task);
}
+#if OMPT_SUPPORT
+ else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
+ __ompt_task_finish(task, current_task, ompt_task_switch);
+ }
+#endif
KA_TRACE(
30,
@@ -1855,7 +1984,8 @@ kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
if (ompt_enabled.ompt_callback_task_create) {
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
&(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
- &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
+ &(new_taskdata->ompt_task_info.task_data),
+ TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
OMPT_GET_RETURN_ADDRESS(0));
}
}
@@ -1900,6 +2030,53 @@ kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
bool serialize_immediate) {
kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+#if OMPX_TASKGRAPH
+ if (new_taskdata->is_taskgraph &&
+ __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
+ kmp_tdg_info_t *tdg = new_taskdata->tdg;
+ // extend the record_map if needed
+ if (new_taskdata->td_task_id >= new_taskdata->tdg->map_size) {
+ __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
+ // map_size could have been updated by another thread if recursive
+ // taskloop
+ if (new_taskdata->td_task_id >= tdg->map_size) {
+ kmp_uint old_size = tdg->map_size;
+ kmp_uint new_size = old_size * 2;
+ kmp_node_info_t *old_record = tdg->record_map;
+ kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
+ new_size * sizeof(kmp_node_info_t));
+
+ KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
+ tdg->record_map = new_record;
+
+ __kmp_free(old_record);
+
+ for (kmp_int i = old_size; i < new_size; i++) {
+ kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
+ __kmp_successors_size * sizeof(kmp_int32));
+ new_record[i].task = nullptr;
+ new_record[i].successors = successorsList;
+ new_record[i].nsuccessors = 0;
+ new_record[i].npredecessors = 0;
+ new_record[i].successors_size = __kmp_successors_size;
+ KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
+ }
+        // update the size last, so that no other thread can see the new
+        // map_size while record_map still points to old_record
+ tdg->map_size = new_size;
+ }
+ __kmp_release_bootstrap_lock(&tdg->graph_lock);
+ }
+ // record a task
+ if (tdg->record_map[new_taskdata->td_task_id].task == nullptr) {
+ tdg->record_map[new_taskdata->td_task_id].task = new_task;
+ tdg->record_map[new_taskdata->td_task_id].parent_task =
+ new_taskdata->td_parent;
+ KMP_ATOMIC_INC(&tdg->num_tasks);
+ }
+ }
+#endif
+
/* Should we execute the new task or queue it? For now, let's just always try
to queue it. If the queue fills up, then we'll execute it. */
if (new_taskdata->td_flags.proxy == TASK_PROXY ||
@@ -1966,7 +2143,7 @@ kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
&(parent->ompt_task_info.task_data),
&(parent->ompt_task_info.frame),
&(new_taskdata->ompt_task_info.task_data),
- ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
+ TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
OMPT_LOAD_RETURN_ADDRESS(gtid));
}
} else {
@@ -2027,8 +2204,7 @@ kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
&(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
&(new_taskdata->ompt_task_info.task_data),
- ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
- codeptr_ra);
+ TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
}
}
#endif
@@ -2148,7 +2324,6 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
-
}
KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
@@ -2358,7 +2533,7 @@ void *__kmp_task_reduction_init(int gtid, int num, T *data) {
KMP_ASSERT(tg != NULL);
KMP_ASSERT(data != NULL);
KMP_ASSERT(num > 0);
- if (nth == 1) {
+ if (nth == 1 && !__kmp_enable_hidden_helper) {
KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
gtid, tg));
return (void *)tg;
@@ -2416,6 +2591,17 @@ the reduction either does not use omp_orig object, or the omp_orig is accessible
without help of the runtime library.
*/
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
+#if OMPX_TASKGRAPH
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
+ if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
+ kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
+ this_tdg->rec_taskred_data =
+ __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
+ this_tdg->rec_num_taskred = num;
+ KMP_MEMCPY(this_tdg->rec_taskred_data, data,
+ sizeof(kmp_task_red_input_t) * num);
+ }
+#endif
return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
}
@@ -2432,6 +2618,17 @@ Note: this entry supposes the optional compiler-generated initializer routine
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
void *__kmpc_taskred_init(int gtid, int num, void *data) {
+#if OMPX_TASKGRAPH
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
+ if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
+ kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
+ this_tdg->rec_taskred_data =
+ __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
+ this_tdg->rec_num_taskred = num;
+ KMP_MEMCPY(this_tdg->rec_taskred_data, data,
+ sizeof(kmp_task_red_input_t) * num);
+ }
+#endif
return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
}
@@ -2474,12 +2671,26 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
if (tg == NULL)
tg = thread->th.th_current_task->td_taskgroup;
KMP_ASSERT(tg != NULL);
- kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
- kmp_int32 num = tg->reduce_num_data;
+ kmp_taskred_data_t *arr;
+ kmp_int32 num;
kmp_int32 tid = thread->th.th_info.ds.ds_tid;
+#if OMPX_TASKGRAPH
+ if ((thread->th.th_current_task->is_taskgraph) &&
+ (!__kmp_tdg_is_recording(
+ __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
+ tg = thread->th.th_current_task->td_taskgroup;
+ KMP_ASSERT(tg != NULL);
+ KMP_ASSERT(tg->reduce_data != NULL);
+ arr = (kmp_taskred_data_t *)(tg->reduce_data);
+ num = tg->reduce_num_data;
+ }
+#endif
+
KMP_ASSERT(data != NULL);
while (tg != NULL) {
+ arr = (kmp_taskred_data_t *)(tg->reduce_data);
+ num = tg->reduce_num_data;
for (int i = 0; i < num; ++i) {
if (!arr[i].flags.lazy_priv) {
if (data == arr[i].reduce_shar ||
@@ -2511,9 +2722,8 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
return p_priv[tid];
}
}
+ KMP_ASSERT(tg->parent);
tg = tg->parent;
- arr = (kmp_taskred_data_t *)(tg->reduce_data);
- num = tg->reduce_num_data;
}
KMP_ASSERT2(0, "Unknown task reduction item");
return NULL; // ERROR, this line never executed
@@ -2523,7 +2733,10 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
kmp_int32 nth = th->th.th_team_nproc;
- KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
+ KMP_DEBUG_ASSERT(
+ nth > 1 ||
+ __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
+ // are using hidden helper threads
kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
kmp_int32 num = tg->reduce_num_data;
for (int i = 0; i < num; ++i) {
@@ -2860,6 +3073,7 @@ static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
ntasks - 1))
break;
+ ntasks = task_team->tt.tt_num_task_pri;
} while (ntasks > 0);
if (ntasks == 0) {
KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
@@ -3014,7 +3228,7 @@ static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
-static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
+static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
kmp_task_team_t *task_team,
std::atomic<kmp_int32> *unfinished_threads,
int *thread_finished,
@@ -3024,15 +3238,18 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
kmp_taskdata_t *current;
kmp_thread_data_t *victim_td, *threads_data;
kmp_int32 target;
- kmp_int32 victim_tid;
+ kmp_info_t *victim_thr;
KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
threads_data = task_team->tt.tt_threads_data;
KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
+ KMP_DEBUG_ASSERT(victim_tid >= 0);
+ KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
- victim_tid = victim_thr->th.th_info.ds.ds_tid;
victim_td = &threads_data[victim_tid];
+ victim_thr = victim_td->td.td_thr;
+ (void)victim_thr; // Use in TRACE messages which aren't always enabled.
KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
"task_team=%p ntasks=%d head=%u tail=%u\n",
@@ -3182,8 +3399,6 @@ static inline int __kmp_execute_tasks_template(
nthreads = task_team->tt.tt_nproc;
unfinished_threads = &(task_team->tt.tt_unfinished_threads);
- KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
- task_team->tt.tt_hidden_helper_task_encountered);
KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
while (1) { // Outer loop keeps trying to find tasks in case of single thread
@@ -3247,9 +3462,9 @@ static inline int __kmp_execute_tasks_template(
if (!asleep) {
// We have a victim to try to steal from
- task = __kmp_steal_task(other_thread, gtid, task_team,
- unfinished_threads, thread_finished,
- is_constrained);
+ task =
+ __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
+ thread_finished, is_constrained);
}
if (task != NULL) { // set last stolen to victim
if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
@@ -3735,6 +3950,20 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
__kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
}
+static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
+ kmp_team_t *team) {
+ int team_nth = team->t.t_nproc;
+  // Only need to init if the task team isn't active or the team size changed
+ if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
+ TCW_4(task_team->tt.tt_found_tasks, FALSE);
+ TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+ TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
+ TCW_4(task_team->tt.tt_nproc, team_nth);
+ KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
+ TCW_4(task_team->tt.tt_active, TRUE);
+ }
+}
+
// __kmp_allocate_task_team:
// Allocates a task team associated with a specific team, taking it from
// the global task team free list if possible. Also initializes data
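
The __kmp_task_team_init helper factored out above re-initializes a task team
only when it is inactive or the team size has changed, so an already-active
team of the same size keeps its state. A reduced sketch of that guard with a
simplified stand-in for the task-team fields it touches:

#include <atomic>

struct task_team_state {
  std::atomic<int> unfinished_threads{0};
  int nproc = 0;
  bool active = false;
};

// Re-initialize only when the team was inactive or its size changed.
static void task_team_init(task_team_state &tt, int team_nth) {
  if (!tt.active || tt.nproc != team_nth) {
    tt.nproc = team_nth;
    tt.unfinished_threads.store(team_nth, std::memory_order_release);
    tt.active = true;
  }
}
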
@@ -3742,7 +3971,6 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
kmp_team_t *team) {
kmp_task_team_t *task_team = NULL;
- int nthreads;
KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
(thread ? __kmp_gtid_from_thread(thread) : -1), team));
@@ -3784,14 +4012,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
// task_team->tt.tt_next = NULL;
}
- TCW_4(task_team->tt.tt_found_tasks, FALSE);
- TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
- TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
- task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
-
- KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
- TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
- TCW_4(task_team->tt.tt_active, TRUE);
+ __kmp_task_team_init(task_team, team);
KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
"unfinished_threads init'd to %d\n",
@@ -3845,6 +4066,40 @@ void __kmp_reap_task_teams(void) {
}
}
+// View the array of two task team pointers as a pair of pointers:
+// 1) a single task_team pointer
+// 2) next pointer for stack
+// Serial teams can create a stack of task teams for nested serial teams.
+void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+ KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+ kmp_task_team_list_t *current =
+ (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+ kmp_task_team_list_t *node =
+ (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
+ node->task_team = current->task_team;
+ node->next = current->next;
+ thread->th.th_task_team = current->task_team = NULL;
+ current->next = node;
+}
+
+// Serial team pops a task team off the stack
+void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+ KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+ kmp_task_team_list_t *current =
+ (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+ if (current->task_team) {
+ __kmp_free_task_team(thread, current->task_team);
+ }
+ kmp_task_team_list_t *next = current->next;
+ if (next) {
+ current->task_team = next->task_team;
+ current->next = next->next;
+ KMP_DEBUG_ASSERT(next != current);
+ __kmp_free(next);
+ thread->th.th_task_team = current->task_team;
+ }
+}
+
// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks. Wait for each thread to unreference its task team.
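
__kmp_push_task_team_node and __kmp_pop_task_team_node above reinterpret the
serial team's two task-team slots as a {task_team, next} pair, giving nested
serial regions a stack of task teams. A minimal sketch of that intrusive
push/pop with a generic node type and plain new/delete in place of
__kmp_allocate/__kmp_free:

struct team_node {
  void *task_team;  // payload: the serial team's current task team
  team_node *next;  // saved task teams from enclosing serial levels
};

// Push: save the current payload on a freshly allocated node and clear it.
static void push_node(team_node &current) {
  team_node *saved = new team_node{current.task_team, current.next};
  current.task_team = nullptr;
  current.next = saved;
}

// Pop: restore the previously saved payload, if any.
static void pop_node(team_node &current) {
  if (team_node *saved = current.next) {
    current.task_team = saved->task_team;
    current.next = saved->next;
    delete saved;
  }
}
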
@@ -3911,15 +4166,32 @@ void __kmp_wait_to_unref_task_teams(void) {
// __kmp_task_team_setup: Create a task_team for the current team, but use
// an already created, unused one if it already exists.
-void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
+void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+ // For the serial and root teams, setup the first task team pointer to point
+ // to task team. The other pointer is a stack of task teams from previous
+ // serial levels.
+ if (team == this_thr->th.th_serial_team ||
+ team == this_thr->th.th_root->r.r_root_team) {
+ KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+ if (team->t.t_task_team[0] == NULL) {
+ team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
+ KA_TRACE(
+ 20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
+ " for serial/root team %p\n",
+ __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
+
+ } else
+ __kmp_task_team_init(team->t.t_task_team[0], team);
+ return;
+ }
+
// If this task_team hasn't been created yet, allocate it. It will be used in
// the region after the next.
// If it exists, it is the current task team and shouldn't be touched yet as
// it may still be in use.
- if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
- (always || team->t.t_nproc > 1)) {
+ if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
team->t.t_task_team[this_thr->th.th_task_state] =
__kmp_allocate_task_team(this_thr, team);
KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
@@ -3934,38 +4206,25 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
// threads spin in the barrier release phase, they will continue to use the
// previous task_team struct(above), until they receive the signal to stop
// checking for tasks (they can't safely reference the kmp_team_t struct,
- // which could be reallocated by the primary thread). No task teams are formed
- // for serialized teams.
- if (team->t.t_nproc > 1) {
- int other_team = 1 - this_thr->th.th_task_state;
- KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
- if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
- team->t.t_task_team[other_team] =
- __kmp_allocate_task_team(this_thr, team);
- KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
- "task_team %p for team %d at parity=%d\n",
- __kmp_gtid_from_thread(this_thr),
- team->t.t_task_team[other_team], team->t.t_id, other_team));
- } else { // Leave the old task team struct in place for the upcoming region;
- // adjust as needed
- kmp_task_team_t *task_team = team->t.t_task_team[other_team];
- if (!task_team->tt.tt_active ||
- team->t.t_nproc != task_team->tt.tt_nproc) {
- TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
- TCW_4(task_team->tt.tt_found_tasks, FALSE);
- TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
- TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
- KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
- team->t.t_nproc);
- TCW_4(task_team->tt.tt_active, TRUE);
- }
- // if team size has changed, the first thread to enable tasking will
- // realloc threads_data if necessary
- KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
- "%p for team %d at parity=%d\n",
- __kmp_gtid_from_thread(this_thr),
- team->t.t_task_team[other_team], team->t.t_id, other_team));
- }
+ // which could be reallocated by the primary thread).
+ int other_team = 1 - this_thr->th.th_task_state;
+ KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
+ if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
+ team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
+ KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
+ "task_team %p for team %d at parity=%d\n",
+ __kmp_gtid_from_thread(this_thr),
+ team->t.t_task_team[other_team], team->t.t_id, other_team));
+ } else { // Leave the old task team struct in place for the upcoming region;
+ // adjust as needed
+ kmp_task_team_t *task_team = team->t.t_task_team[other_team];
+ __kmp_task_team_init(task_team, team);
+ // if team size has changed, the first thread to enable tasking will
+ // realloc threads_data if necessary
+ KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
+ "%p for team %d at parity=%d\n",
+ __kmp_gtid_from_thread(this_thr),
+ team->t.t_task_team[other_team], team->t.t_id, other_team));
}
// For regular thread, task enabling should be called when the task is going
@@ -3991,9 +4250,11 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier. This may be
-// called by any thread, but only for teams with # threads > 1.
+// called by any thread. This is not called for serial or root teams.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+ KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
+ KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
// Toggle the th_task_state field, to switch which task_team this thread
// refers to
@@ -4011,8 +4272,7 @@ void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
}
// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
-// barrier gather phase. Only called by primary thread if #threads in team > 1
-// or if proxy tasks were created.
+// barrier gather phase. Only called by the primary thread.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, primary
@@ -4046,9 +4306,6 @@ void __kmp_task_team_wait(
("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
"setting active to false, setting local and team's pointer to NULL\n",
__kmp_gtid_from_thread(this_thr), task_team));
- KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
- task_team->tt.tt_found_proxy_tasks == TRUE ||
- task_team->tt.tt_hidden_helper_task_encountered == TRUE);
TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
@@ -4203,6 +4460,9 @@ static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
taskdata->td_flags.complete = 1; // mark the task as completed
+#if OMPX_TASKGRAPH
+ taskdata->td_flags.onced = 1;
+#endif
if (taskdata->td_taskgroup)
KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
@@ -4401,8 +4661,14 @@ void __kmp_fulfill_event(kmp_event_t *event) {
//
// thread: allocating thread
// task_src: pointer to source task to be duplicated
+// taskloop_recur: used only when dealing with taskgraph,
+// indicating whether we need to update task->td_task_id
// returns: a pointer to the allocated kmp_task_t structure (task).
-kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
+kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
+#if OMPX_TASKGRAPH
+ , int taskloop_recur
+#endif
+) {
kmp_task_t *task;
kmp_taskdata_t *taskdata;
kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
@@ -4430,7 +4696,15 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
task = KMP_TASKDATA_TO_TASK(taskdata);
// Initialize new task (only specific fields not affected by memcpy)
+#if OMPX_TASKGRAPH
+ if (!taskdata->is_taskgraph || taskloop_recur)
+ taskdata->td_task_id = KMP_GEN_TASK_ID();
+ else if (taskdata->is_taskgraph &&
+ __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
+ taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
+#else
taskdata->td_task_id = KMP_GEN_TASK_ID();
+#endif
if (task->shareds != NULL) { // need setup shareds pointer
shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
task->shareds = &((char *)taskdata)[shareds_offset];
@@ -4657,7 +4931,13 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
lastpriv = 1;
}
}
+
+#if OMPX_TASKGRAPH
+ next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
+#else
next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+#endif
+
kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
kmp_taskloop_bounds_t next_task_bounds =
kmp_taskloop_bounds_t(next_task, task_bounds);
@@ -4854,7 +5134,12 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
lb1 = ub0 + st;
// create pattern task for 2nd half of the loop
+#if OMPX_TASKGRAPH
+ next_task = __kmp_task_dup_alloc(thread, task,
+ /* taskloop_recur */ 1);
+#else
next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
+#endif
// adjust lower bound (upper bound is not changed) for the 2nd half
*(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
if (ptask_dup != NULL) // construct firstprivates, etc.
@@ -4887,6 +5172,12 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
p->codeptr_ra = codeptr_ra;
#endif
+#if OMPX_TASKGRAPH
+ kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
+ new_task_data->tdg = taskdata->tdg;
+ new_task_data->is_taskgraph = 0;
+#endif
+
#if OMPT_SUPPORT
// schedule new task with correct return address for OMPT events
__kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
@@ -4926,6 +5217,9 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
__kmpc_taskgroup(loc, gtid);
}
+#if OMPX_TASKGRAPH
+ KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
+#endif
// =========================================================================
// calculate loop parameters
kmp_taskloop_bounds_t task_bounds(task, lb, ub);
@@ -4982,7 +5276,7 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
switch (sched) {
case 0: // no schedule clause specified, we can choose the default
// let's try to schedule (team_size*10) tasks
- grainsize = thread->th.th_team_nproc * 10;
+ grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
KMP_FALLTHROUGH();
case 2: // num_tasks provided
if (grainsize > tc) {
@@ -5131,3 +5425,300 @@ void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
modifier, task_dup);
KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
}
+
+/*!
+@ingroup TASKING
+@param gtid Global Thread ID of current thread
+@return Returns a pointer to the thread's current task async handle. If no task
+is present or gtid is invalid, returns NULL.
+
+Acquires a pointer to the target async handle from the current task.
+*/
+void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
+ if (gtid == KMP_GTID_DNE)
+ return NULL;
+
+ kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
+ kmp_taskdata_t *taskdata = thread->th.th_current_task;
+
+ if (!taskdata)
+ return NULL;
+
+ return &taskdata->td_target_data.async_handle;
+}
+
+/*!
+@ingroup TASKING
+@param gtid Global Thread ID of current thread
+@return Returns TRUE if the task currently being executed by the given thread
+has a task team allocated to it; otherwise returns FALSE.
+
+Checks if the current thread has a task team.
+*/
+bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
+ if (gtid == KMP_GTID_DNE)
+ return FALSE;
+
+ kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
+ kmp_taskdata_t *taskdata = thread->th.th_current_task;
+
+ if (!taskdata)
+ return FALSE;
+
+ return taskdata->td_task_team != NULL;
+}
+
+#if OMPX_TASKGRAPH
+// __kmp_find_tdg: identify a TDG through its ID
+// gtid: Global Thread ID
+// tdg_id: ID of the TDG
+// returns: if a TDG corresponding to this ID is found and is not in its
+// initial state, a pointer to it, otherwise nullptr
+static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
+ kmp_tdg_info_t *res = nullptr;
+ if (__kmp_max_tdgs == 0)
+ return res;
+
+ if (__kmp_global_tdgs == NULL)
+ __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
+ sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
+
+ if ((__kmp_global_tdgs[tdg_id]) &&
+ (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
+ res = __kmp_global_tdgs[tdg_id];
+ return res;
+}
+
+// __kmp_print_tdg_dot: prints the TDG to a dot file
+// tdg: ID of the TDG
+// gtid: Global Thread ID
+void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
+ kmp_int32 tdg_id = tdg->tdg_id;
+ KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
+
+ char file_name[20];
+ sprintf(file_name, "tdg_%d.dot", tdg_id);
+ kmp_safe_raii_file_t tdg_file(file_name, "w");
+
+ kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
+ fprintf(tdg_file,
+ "digraph TDG {\n"
+ " compound=true\n"
+ " subgraph cluster {\n"
+ " label=TDG_%d\n",
+ tdg_id);
+ for (kmp_int32 i = 0; i < num_tasks; i++) {
+ fprintf(tdg_file, " %d[style=bold]\n", i);
+ }
+ fprintf(tdg_file, " }\n");
+ for (kmp_int32 i = 0; i < num_tasks; i++) {
+ kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
+ kmp_int32 *successors = tdg->record_map[i].successors;
+ if (nsuccessors > 0) {
+ for (kmp_int32 j = 0; j < nsuccessors; j++)
+ fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
+ }
+ }
+ fprintf(tdg_file, "}");
+ KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
+}
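
__kmp_print_tdg_dot writes one bold node per recorded task inside a cluster
and one edge per recorded successor. For a hypothetical TDG with id 0 and
three tasks, where task 0 precedes tasks 1 and 2, the generated tdg_0.dot
would look roughly like this:

digraph TDG {
  compound=true
  subgraph cluster {
    label=TDG_0
    0[style=bold]
    1[style=bold]
    2[style=bold]
  }
  0 -> 1
  0 -> 2
}
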
+
+// __kmp_exec_tdg: launch the execution of a previously
+// recorded TDG
+// gtid: Global Thread ID
+// tdg: ID of the TDG
+void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
+ KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
+ KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
+ tdg->tdg_id, tdg->num_roots));
+ kmp_node_info_t *this_record_map = tdg->record_map;
+ kmp_int32 *this_root_tasks = tdg->root_tasks;
+ kmp_int32 this_num_roots = tdg->num_roots;
+ kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
+
+ kmp_info_t *thread = __kmp_threads[gtid];
+ kmp_taskdata_t *parent_task = thread->th.th_current_task;
+
+ if (tdg->rec_taskred_data) {
+ __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
+ }
+
+ for (kmp_int32 j = 0; j < this_num_tasks; j++) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
+
+ td->td_parent = parent_task;
+ this_record_map[j].parent_task = parent_task;
+
+ kmp_taskgroup_t *parent_taskgroup =
+ this_record_map[j].parent_task->td_taskgroup;
+
+ KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
+ this_record_map[j].npredecessors);
+ KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
+
+ if (parent_taskgroup) {
+ KMP_ATOMIC_INC(&parent_taskgroup->count);
+ // The taskgroup is different so we must update it
+ td->td_taskgroup = parent_taskgroup;
+ } else if (td->td_taskgroup != nullptr) {
+      // If the parent doesn't have a taskgroup, remove it from the task
+ td->td_taskgroup = nullptr;
+ }
+ if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
+ KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
+ }
+
+ for (kmp_int32 j = 0; j < this_num_roots; ++j) {
+ __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
+ }
+ KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
+ tdg->tdg_id, tdg->num_roots));
+}
+
+// __kmp_start_record: set up a TDG structure and set the
+// recording flag to true
+// gtid: Global Thread ID of the encountering thread
+// input_flags: Flags associated with the TDG
+// tdg_id: ID of the TDG to record
+static inline void __kmp_start_record(kmp_int32 gtid,
+ kmp_taskgraph_flags_t *flags,
+ kmp_int32 tdg_id) {
+ kmp_tdg_info_t *tdg =
+ (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
+ __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
+ // Initializing the TDG structure
+ tdg->tdg_id = tdg_id;
+ tdg->map_size = INIT_MAPSIZE;
+ tdg->num_roots = -1;
+ tdg->root_tasks = nullptr;
+ tdg->tdg_status = KMP_TDG_RECORDING;
+ tdg->rec_num_taskred = 0;
+ tdg->rec_taskred_data = nullptr;
+ KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
+
+ // Initializing the list of nodes in this TDG
+ kmp_node_info_t *this_record_map =
+ (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
+ for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
+ kmp_int32 *successorsList =
+ (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
+ this_record_map[i].task = nullptr;
+ this_record_map[i].successors = successorsList;
+ this_record_map[i].nsuccessors = 0;
+ this_record_map[i].npredecessors = 0;
+ this_record_map[i].successors_size = __kmp_successors_size;
+ KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
+ }
+
+ __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
+}
+
+// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
+// the beginning of the record process of a task region
+// loc_ref: Location of TDG, not used yet
+// gtid: Global Thread ID of the encountering thread
+// input_flags: Flags associated with the TDG
+// tdg_id: ID of the TDG to record; for now, an incremental integer
+// returns: 1 if we record, otherwise 0
+kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_int32 input_flags, kmp_int32 tdg_id) {
+
+ kmp_int32 res;
+ kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
+ KA_TRACE(10,
+ ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
+ gtid, loc_ref, input_flags, tdg_id));
+
+ if (__kmp_max_tdgs == 0) {
+ KA_TRACE(
+ 10,
+ ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
+ "__kmp_max_tdgs = 0\n",
+ gtid, loc_ref, input_flags, tdg_id));
+ return 1;
+ }
+
+ __kmpc_taskgroup(loc_ref, gtid);
+ if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
+ // TODO: use re_record flag
+ __kmp_exec_tdg(gtid, tdg);
+ res = 0;
+ } else {
+ __kmp_curr_tdg_idx = tdg_id;
+ KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
+ __kmp_start_record(gtid, flags, tdg_id);
+ __kmp_num_tdg++;
+ res = 1;
+ }
+ KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
+ gtid, tdg_id, res ? "record" : "execute"));
+ return res;
+}
+
+// __kmp_end_record: set up a TDG after recording it
+// gtid: Global thread ID
+// tdg: Pointer to the TDG
+void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
+ // Store roots
+ kmp_node_info_t *this_record_map = tdg->record_map;
+ kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
+ kmp_int32 *this_root_tasks =
+ (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
+ kmp_int32 this_map_size = tdg->map_size;
+ kmp_int32 this_num_roots = 0;
+ kmp_info_t *thread = __kmp_threads[gtid];
+
+ for (kmp_int32 i = 0; i < this_num_tasks; i++) {
+ if (this_record_map[i].npredecessors == 0) {
+ this_root_tasks[this_num_roots++] = i;
+ }
+ }
+
+ // Update with roots info and mapsize
+ tdg->map_size = this_map_size;
+ tdg->num_roots = this_num_roots;
+ tdg->root_tasks = this_root_tasks;
+ KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
+ tdg->tdg_status = KMP_TDG_READY;
+
+ if (thread->th.th_current_task->td_dephash) {
+ __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
+ thread->th.th_current_task->td_dephash = NULL;
+ }
+
+ // Reset predecessor counter
+ for (kmp_int32 i = 0; i < this_num_tasks; i++) {
+ KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
+ this_record_map[i].npredecessors);
+ }
+ KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
+
+ if (__kmp_tdg_dot)
+ __kmp_print_tdg_dot(tdg, gtid);
+}
+
+// __kmpc_end_record_task: wrapper around __kmp_end_record to mark
+// the end of the recording phase
+//
+// loc_ref: Source location information
+// gtid: Global thread ID
+// input_flags: Flags attached to the graph
+// tdg_id: ID of the TDG just finished recording
+void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_int32 input_flags, kmp_int32 tdg_id) {
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
+
+ KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
+ " tdg=%d with flags=%d\n",
+ gtid, loc_ref, tdg_id, input_flags));
+ if (__kmp_max_tdgs) {
+ // TODO: use input_flags->nowait
+ __kmpc_end_taskgroup(loc_ref, gtid);
+ if (__kmp_tdg_is_recording(tdg->tdg_status))
+ __kmp_end_record(gtid, tdg);
+ }
+ KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
+ " tdg=%d, its status is now READY\n",
+ gtid, loc_ref, tdg_id));
+}
+#endif
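
Taken together, these entry points imply a record-then-replay calling pattern. The sketch below is inferred from the return-value contract of __kmpc_start_record_task (1 = record, 0 = the runtime already replayed the stored TDG via __kmp_exec_tdg); taskgraph_region, loc and gtid are placeholder names, and the libomp-internal types are reduced to stand-ins, so treat it as an illustration rather than code from this commit.

#include <cstdint>

// Simplified stand-ins for libomp-internal types (kmp_int32 is a 32-bit int,
// ident_t stays opaque here).
typedef std::int32_t kmp_int32;
struct ident;
typedef struct ident ident_t;

extern "C" {
kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
                                   kmp_int32 input_flags, kmp_int32 tdg_id);
void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
                            kmp_int32 input_flags, kmp_int32 tdg_id);
}

// Hypothetical outlined taskgraph region: record on the first encounter,
// replay (inside __kmpc_start_record_task, via __kmp_exec_tdg) afterwards.
void taskgraph_region(ident_t *loc, kmp_int32 gtid, kmp_int32 tdg_id) {
  if (__kmpc_start_record_task(loc, gtid, /*input_flags=*/0, tdg_id)) {
    // ... emit the tasks of the region; their dependences get recorded ...
  }
  __kmpc_end_record_task(loc, gtid, /*input_flags=*/0, tdg_id);
}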
diff --git a/contrib/libs/cxxsupp/openmp/kmp_threadprivate.cpp b/contrib/libs/cxxsupp/openmp/kmp_threadprivate.cpp
index b79ac7d6d2b..c4a1ec6e102 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_threadprivate.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_threadprivate.cpp
@@ -248,16 +248,16 @@ void __kmp_common_destroy_gtid(int gtid) {
if (d_tn->is_vec) {
if (d_tn->dt.dtorv != 0) {
(void)(*d_tn->dt.dtorv)(tn->par_addr, d_tn->vec_len);
- }
- if (d_tn->obj_init != 0) {
- (void)(*d_tn->dt.dtorv)(d_tn->obj_init, d_tn->vec_len);
+ if (d_tn->obj_init != 0) {
+ (void)(*d_tn->dt.dtorv)(d_tn->obj_init, d_tn->vec_len);
+ }
}
} else {
if (d_tn->dt.dtor != 0) {
(void)(*d_tn->dt.dtor)(tn->par_addr);
- }
- if (d_tn->obj_init != 0) {
- (void)(*d_tn->dt.dtor)(d_tn->obj_init);
+ if (d_tn->obj_init != 0) {
+ (void)(*d_tn->dt.dtor)(d_tn->obj_init);
+ }
}
}
}
diff --git a/contrib/libs/cxxsupp/openmp/kmp_utility.cpp b/contrib/libs/cxxsupp/openmp/kmp_utility.cpp
index 9465f720e07..a5f9b3a4004 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_utility.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_utility.cpp
@@ -32,68 +32,6 @@ static const char *unknown = "unknown";
static int trace_level = 5;
#endif
-/* LOG_ID_BITS = ( 1 + floor( log_2( max( log_per_phy - 1, 1 ))))
- * APIC_ID = (PHY_ID << LOG_ID_BITS) | LOG_ID
- * PHY_ID = APIC_ID >> LOG_ID_BITS
- */
-int __kmp_get_physical_id(int log_per_phy, int apic_id) {
- int index_lsb, index_msb, temp;
-
- if (log_per_phy > 1) {
- index_lsb = 0;
- index_msb = 31;
-
- temp = log_per_phy;
- while ((temp & 1) == 0) {
- temp >>= 1;
- index_lsb++;
- }
-
- temp = log_per_phy;
- while ((temp & 0x80000000) == 0) {
- temp <<= 1;
- index_msb--;
- }
-
- /* If >1 bits were set in log_per_phy, choose next higher power of 2 */
- if (index_lsb != index_msb)
- index_msb++;
-
- return ((int)(apic_id >> index_msb));
- }
-
- return apic_id;
-}
-
-/*
- * LOG_ID_BITS = ( 1 + floor( log_2( max( log_per_phy - 1, 1 ))))
- * APIC_ID = (PHY_ID << LOG_ID_BITS) | LOG_ID
- * LOG_ID = APIC_ID & (( 1 << LOG_ID_BITS ) - 1 )
- */
-int __kmp_get_logical_id(int log_per_phy, int apic_id) {
- unsigned current_bit;
- int bits_seen;
-
- if (log_per_phy <= 1)
- return (0);
-
- bits_seen = 0;
-
- for (current_bit = 1; log_per_phy != 0; current_bit <<= 1) {
- if (log_per_phy & current_bit) {
- log_per_phy &= ~current_bit;
- bits_seen++;
- }
- }
-
- /* If exactly 1 bit was set in log_per_phy, choose next lower power of 2 */
- if (bits_seen == 1) {
- current_bit >>= 1;
- }
-
- return ((int)((current_bit - 1) & apic_id));
-}
-
static kmp_uint64 __kmp_parse_frequency( // R: Frequency in Hz.
char const *frequency // I: Float number and unit: MHz, GHz, or TGz.
) {
@@ -126,7 +64,6 @@ static kmp_uint64 __kmp_parse_frequency( // R: Frequency in Hz.
void __kmp_query_cpuid(kmp_cpuinfo_t *p) {
struct kmp_cpuid buf;
int max_arg;
- int log_per_phy;
#ifdef KMP_DEBUG
int cflush_size;
#endif
@@ -233,11 +170,8 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) {
if ((buf.edx >> 28) & 1) {
/* Bits 23-16: Logical Processors per Physical Processor (1 for P4) */
- log_per_phy = data[2];
p->apic_id = data[3]; /* Bits 31-24: Processor Initial APIC ID (X) */
- KA_TRACE(trace_level, (" HT(%d TPUs)", log_per_phy));
- p->physical_id = __kmp_get_physical_id(log_per_phy, p->apic_id);
- p->logical_id = __kmp_get_logical_id(log_per_phy, p->apic_id);
+ KA_TRACE(trace_level, (" HT(%d TPUs)", data[2]));
}
#ifdef KMP_DEBUG
if ((buf.edx >> 29) & 1) {
@@ -297,6 +231,8 @@ void __kmp_expand_host_name(char *buffer, size_t size) {
if (!GetComputerNameA(buffer, &s))
KMP_STRCPY_S(buffer, size, unknown);
}
+#elif KMP_OS_WASI
+ KMP_STRCPY_S(buffer, size, unknown);
#else
buffer[size - 2] = 0;
if (gethostname(buffer, size) || buffer[size - 2] != 0)
@@ -373,7 +309,7 @@ void __kmp_expand_file_name(char *result, size_t rlen, char *pattern) {
case 'I':
case 'i': {
pid_t id = getpid();
-#if KMP_ARCH_X86_64 && defined(__MINGW32__)
+#if (KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && defined(__MINGW32__)
snp_result = KMP_SNPRINTF(pos, end - pos + 1, "%0*lld", width, id);
#else
snp_result = KMP_SNPRINTF(pos, end - pos + 1, "%0*d", width, id);
@@ -406,3 +342,16 @@ void __kmp_expand_file_name(char *result, size_t rlen, char *pattern) {
*pos = '\0';
}
+
+#if !OMPT_SUPPORT
+extern "C" {
+typedef struct ompt_start_tool_result_t ompt_start_tool_result_t;
+// Define symbols expected by VERSION script
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ return nullptr;
+}
+
+void ompt_libomp_connect(ompt_start_tool_result_t *result) { result = nullptr; }
+}
+#endif
diff --git a/contrib/libs/cxxsupp/openmp/kmp_utils.h b/contrib/libs/cxxsupp/openmp/kmp_utils.h
new file mode 100644
index 00000000000..a557f929e6e
--- /dev/null
+++ b/contrib/libs/cxxsupp/openmp/kmp_utils.h
@@ -0,0 +1,55 @@
+/*
+ * kmp_utils.h -- Utilities that are used internally
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef __KMP_UTILS_H__
+#define __KMP_UTILS_H__
+
+#include <cstddef>
+
+#include "kmp.h"
+
+/// A simple pure header implementation of VLA that aims to replace uses of
+/// actual VLAs, which can cause compile warnings. This class by default
+/// creates a stack buffer that can accommodate \p N elements. If the number of
+/// elements is greater than \p N, then a heap buffer will be allocated and
+/// used to accommodate the elements. Similar to an actual VLA, we don't check
+/// bounds (for now), so we will not store the number of elements. We can
+/// always revise it later.
+template <typename T, unsigned N = 8> class SimpleVLA final {
+ T StackBuffer[N];
+ T *HeapBuffer = nullptr;
+ T *Ptr = StackBuffer;
+
+public:
+ SimpleVLA() = delete;
+ SimpleVLA(const SimpleVLA &) = delete;
+ SimpleVLA(SimpleVLA &&) = delete;
+ SimpleVLA &operator=(const SimpleVLA &) = delete;
+ SimpleVLA &operator=(SimpleVLA &&) = delete;
+
+ explicit SimpleVLA(unsigned NumOfElements) noexcept {
+ if (NumOfElements > N) {
+ HeapBuffer =
+ reinterpret_cast<T *>(__kmp_allocate(NumOfElements * sizeof(T)));
+ Ptr = HeapBuffer;
+ }
+ }
+
+ ~SimpleVLA() {
+ if (HeapBuffer)
+ __kmp_free(HeapBuffer);
+ }
+
+ operator T *() noexcept { return Ptr; }
+ operator const T *() const noexcept { return Ptr; }
+};
+
+#endif
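
The call sites added in this commit (see ompt_get_place_proc_ids further down, where SimpleVLA<int> tmp_ids(ids_size) replaces int tmp_ids[ids_size]) show the intended use. As a standalone illustration of the same stack-buffer-with-heap-fallback idea, here is a simplified analogue that swaps __kmp_allocate/__kmp_free for plain new[]/delete[]; it is a sketch of the pattern, not the class above.

#include <cstdio>

// Standalone sketch: fixed stack buffer with a heap fallback for large sizes.
template <typename T, unsigned N = 8> class StackOrHeapBuffer {
  T StackBuffer[N];
  T *HeapBuffer = nullptr;
  T *Ptr = StackBuffer;

public:
  explicit StackOrHeapBuffer(unsigned NumOfElements) {
    if (NumOfElements > N)
      Ptr = HeapBuffer = new T[NumOfElements]; // heap fallback for large sizes
  }
  ~StackOrHeapBuffer() { delete[] HeapBuffer; }
  operator T *() { return Ptr; }
};

int main() {
  StackOrHeapBuffer<int> small(4);  // fits in the 8-element stack buffer
  StackOrHeapBuffer<int> large(64); // allocated on the heap
  small[0] = 1;
  large[63] = 2;
  std::printf("%d %d\n", small[0], large[63]);
  return 0;
}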
diff --git a/contrib/libs/cxxsupp/openmp/kmp_version.cpp b/contrib/libs/cxxsupp/openmp/kmp_version.cpp
index bb600c120dd..39d0f6084ba 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_version.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_version.cpp
@@ -179,7 +179,7 @@ void __kmp_print_version_1(void) {
&buffer, "%sthread affinity support: %s\n", KMP_VERSION_PREF_STR,
#if KMP_AFFINITY_SUPPORTED
(KMP_AFFINITY_CAPABLE()
- ? (__kmp_affinity_type == affinity_none ? "not used" : "yes")
+ ? (__kmp_affinity.type == affinity_none ? "not used" : "yes")
: "no")
#else
"no"
diff --git a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h
index 3fcae5687d1..9baf280228e 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h
@@ -104,7 +104,8 @@ template <> struct flag_traits<flag_oncore> {
template <flag_type FlagType> class kmp_flag {
protected:
flag_properties t; /**< "Type" of the flag in loc */
- kmp_info_t *waiting_threads[1]; /**< Threads sleeping on this thread. */
+ /**< Threads sleeping on this thread. */
+ kmp_info_t *waiting_threads[1] = {nullptr};
kmp_uint32 num_waiting_threads; /**< Num threads sleeping on this thread. */
std::atomic<bool> *sleepLoc;
@@ -140,7 +141,7 @@ template <typename PtrType, flag_type FlagType, bool Sleepable>
class kmp_flag_native : public kmp_flag<FlagType> {
protected:
volatile PtrType *loc;
- PtrType checker; /**< When flag==checker, it has been released. */
+ PtrType checker = (PtrType)0; /**< When flag==checker, it has been released */
typedef flag_traits<FlagType> traits_type;
public:
@@ -234,7 +235,7 @@ template <typename PtrType, flag_type FlagType, bool Sleepable>
class kmp_flag_atomic : public kmp_flag<FlagType> {
protected:
std::atomic<PtrType> *loc; /**< Pointer to flag location to wait on */
- PtrType checker; /**< Flag == checker means it has been released. */
+ PtrType checker = (PtrType)0; /**< Flag==checker means it has been released */
public:
typedef flag_traits<FlagType> traits_type;
typedef PtrType flag_t;
@@ -323,19 +324,21 @@ static void __ompt_implicit_task_end(kmp_info_t *this_thr,
ompt_state_t ompt_state,
ompt_data_t *tId) {
int ds_tid = this_thr->th.th_info.ds.ds_tid;
- if (ompt_state == ompt_state_wait_barrier_implicit) {
+ if (ompt_state == ompt_state_wait_barrier_implicit_parallel ||
+ ompt_state == ompt_state_wait_barrier_teams) {
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
void *codeptr = NULL;
+ ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
+ if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
+ sync_kind = ompt_sync_region_barrier_teams;
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, tId, codeptr);
}
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, tId, codeptr);
}
#endif
if (!KMP_MASTER_TID(ds_tid)) {
@@ -455,7 +458,9 @@ final_spin=FALSE)
ompt_data_t *tId;
if (ompt_enabled.enabled) {
ompt_entry_state = this_thr->th.ompt_thread_info.state;
- if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
+ if (!final_spin ||
+ (ompt_entry_state != ompt_state_wait_barrier_implicit_parallel &&
+ ompt_entry_state != ompt_state_wait_barrier_teams) ||
KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
ompt_lw_taskteam_t *team = NULL;
if (this_thr->th.th_team)
@@ -931,7 +936,8 @@ class kmp_flag_oncore : public kmp_flag_native<kmp_uint64, flag_oncore, false> {
kmp_uint32 offset; /**< Portion of flag of interest for an operation. */
bool flag_switch; /**< Indicates a switch in flag location. */
enum barrier_type bt; /**< Barrier type. */
- kmp_info_t *this_thr; /**< Thread to redirect to different flag location. */
+ /**< Thread to redirect to different flag location. */
+ kmp_info_t *this_thr = nullptr;
#if USE_ITT_BUILD
void *itt_sync_obj; /**< ITT object to pass to new flag location. */
#endif
@@ -1038,15 +1044,9 @@ static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) {
case flag_oncore:
__kmp_resume_oncore(gtid, RCAST(kmp_flag_oncore *, flag));
break;
-#ifdef KMP_DEBUG
case flag_unset:
KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d is unset\n", type));
break;
- default:
- KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d does not match any "
- "known flag type\n",
- type));
-#endif
}
}
diff --git a/contrib/libs/cxxsupp/openmp/kmp_wrapper_getpid.h b/contrib/libs/cxxsupp/openmp/kmp_wrapper_getpid.h
index 32ede3ed715..d31c6e80f75 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_wrapper_getpid.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_wrapper_getpid.h
@@ -17,20 +17,25 @@
// On Unix-like systems (Linux* OS and OS X*) getpid() is declared in standard
// headers.
+#if !defined(KMP_OS_AIX)
#include <sys/syscall.h>
+#endif
#include <sys/types.h>
#include <unistd.h>
#if KMP_OS_DARWIN
// OS X
#define __kmp_gettid() pthread_mach_thread_np(pthread_self())
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread_np.h>
#define __kmp_gettid() pthread_getthreadid_np()
#elif KMP_OS_NETBSD
#include <lwp.h>
#define __kmp_gettid() _lwp_self()
#elif KMP_OS_OPENBSD
-#define __kmp_gettid() syscall(SYS_getthrid)
+#define __kmp_gettid() getthrid()
+#elif KMP_OS_AIX
+#include <pthread.h>
+#define __kmp_gettid() pthread_self()
#elif defined(SYS_gettid)
// Hopefully other Unix systems define SYS_gettid syscall for getting os thread
// id
diff --git a/contrib/libs/cxxsupp/openmp/kmp_wrapper_malloc.h b/contrib/libs/cxxsupp/openmp/kmp_wrapper_malloc.h
index c027e0b297d..1f75e88a23b 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_wrapper_malloc.h
+++ b/contrib/libs/cxxsupp/openmp/kmp_wrapper_malloc.h
@@ -154,7 +154,7 @@
#if KMP_DEBUG
-#if KMP_OS_WINDOWS && _DEBUG
+#if KMP_OS_WINDOWS && _DEBUG && !defined(__MINGW32__)
// KMP_DEBUG != _DEBUG. MS debug RTL is available only if _DEBUG is defined.
// Windows* OS has native memory debugging capabilities. Enable them.
diff --git a/contrib/libs/cxxsupp/openmp/omp-tools.h b/contrib/libs/cxxsupp/openmp/omp-tools.h
index 6bae305c711..471f46a9073 100644
--- a/contrib/libs/cxxsupp/openmp/omp-tools.h
+++ b/contrib/libs/cxxsupp/openmp/omp-tools.h
@@ -78,6 +78,8 @@
/* implicit barrier at the end of worksharing */ \
macro (ompt_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \
macro (ompt_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \
+ macro (ompt_state_wait_barrier_implementation, 0x015) /* implementation barrier */ \
+ macro (ompt_state_wait_barrier_teams, 0x016) /* teams barrier */ \
\
/* task wait states (32..63) */ \
macro (ompt_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \
@@ -108,7 +110,7 @@
macro (kmp_mutex_impl_queuing, 2) /* based on some fair policy */ \
macro (kmp_mutex_impl_speculative, 3) /* based on HW-supported speculation */
-#define FOREACH_OMPT_EVENT(macro) \
+#define FOREACH_OMPT_HOST_EVENT(macro) \
\
/*--- Mandatory Events ---*/ \
macro (ompt_callback_thread_begin, ompt_callback_thread_begin_t, 1) /* thread begin */ \
@@ -121,18 +123,8 @@
macro (ompt_callback_task_schedule, ompt_callback_task_schedule_t, 6) /* task schedule */ \
macro (ompt_callback_implicit_task, ompt_callback_implicit_task_t, 7) /* implicit task */ \
\
- macro (ompt_callback_target, ompt_callback_target_t, 8) /* target */ \
- macro (ompt_callback_target_data_op, ompt_callback_target_data_op_t, 9) /* target data op */ \
- macro (ompt_callback_target_submit, ompt_callback_target_submit_t, 10) /* target submit */ \
- \
macro (ompt_callback_control_tool, ompt_callback_control_tool_t, 11) /* control tool */ \
\
- macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize */ \
- macro (ompt_callback_device_finalize, ompt_callback_device_finalize_t, 13) /* device finalize */ \
- \
- macro (ompt_callback_device_load, ompt_callback_device_load_t, 14) /* device load */ \
- macro (ompt_callback_device_unload, ompt_callback_device_unload_t, 15) /* device unload */ \
- \
/* Optional Events */ \
macro (ompt_callback_sync_region_wait, ompt_callback_sync_region_t, 16) /* sync region wait begin or end */ \
\
@@ -145,8 +137,6 @@
\
macro (ompt_callback_masked, ompt_callback_masked_t, 21) /* task at masked begin or end */ \
\
- macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ \
- \
macro (ompt_callback_sync_region, ompt_callback_sync_region_t, 23) /* sync region begin or end */ \
\
macro (ompt_callback_lock_init, ompt_callback_mutex_acquire_t, 24) /* lock init */ \
@@ -164,11 +154,50 @@
macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \
\
macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */ \
+ macro (ompt_callback_error, ompt_callback_error_t, 37) /* error */
+
+#define FOREACH_OMPT_DEVICE_EVENT(macro) \
+ /*--- Mandatory Events ---*/ \
+ macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize */ \
+ macro (ompt_callback_device_finalize, ompt_callback_device_finalize_t, 13) /* device finalize */ \
+ \
+ macro (ompt_callback_device_load, ompt_callback_device_load_t, 14) /* device load */ \
+ macro (ompt_callback_device_unload, ompt_callback_device_unload_t, 15) /* device unload */
+
+#define FOREACH_OMPT_NOEMI_EVENT(macro) \
+ /*--- Mandatory Events ---*/ \
+ macro (ompt_callback_target, ompt_callback_target_t, 8) /* target */ \
+ macro (ompt_callback_target_data_op, ompt_callback_target_data_op_t, 9) /* target data op */ \
+ macro (ompt_callback_target_submit, ompt_callback_target_submit_t, 10) /* target submit */ \
+ /* Optional Events */ \
+ macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */
+
+#define FOREACH_OMPT_EMI_EVENT(macro) \
+ /*--- Mandatory Events ---*/ \
macro (ompt_callback_target_emi, ompt_callback_target_emi_t, 33) /* target */ \
macro (ompt_callback_target_data_op_emi,ompt_callback_target_data_op_emi_t,34) /* target data op */ \
macro (ompt_callback_target_submit_emi, ompt_callback_target_submit_emi_t, 35) /* target submit */ \
- macro (ompt_callback_target_map_emi, ompt_callback_target_map_emi_t, 36) /* target map */ \
- macro (ompt_callback_error, ompt_callback_error_t, 37) /* error */
+ /* Optional Events */ \
+ macro (ompt_callback_target_map_emi, ompt_callback_target_map_emi_t, 36) /* target map */
+
+#define FOREACH_OMPT_50_TARGET_EVENT(macro) \
+ FOREACH_OMPT_DEVICE_EVENT(macro) \
+ FOREACH_OMPT_NOEMI_EVENT(macro)
+
+#define FOREACH_OMPT_51_TARGET_EVENT(macro) \
+ FOREACH_OMPT_DEVICE_EVENT(macro) \
+ FOREACH_OMPT_EMI_EVENT(macro)
+
+#define FOREACH_OMPT_EVENT(macro) \
+ FOREACH_OMPT_HOST_EVENT(macro) \
+ FOREACH_OMPT_DEVICE_EVENT(macro) \
+ FOREACH_OMPT_NOEMI_EVENT(macro) \
+ FOREACH_OMPT_EMI_EVENT(macro)
+
+#define FOREACH_OMPT_51_EVENT(macro) \
+ FOREACH_OMPT_HOST_EVENT(macro) \
+ FOREACH_OMPT_DEVICE_EVENT(macro) \
+ FOREACH_OMPT_EMI_EVENT(macro)
/*****************************************************************************
* implementation specific types
@@ -184,6 +213,10 @@ typedef enum kmp_mutex_impl_t {
* definitions generated from spec
*****************************************************************************/
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
typedef enum ompt_callbacks_t {
ompt_callback_thread_begin = 1,
ompt_callback_thread_end = 2,
@@ -386,13 +419,15 @@ typedef enum ompt_target_map_flag_t {
} ompt_target_map_flag_t;
typedef enum ompt_dependence_type_t {
- ompt_dependence_type_in = 1,
- ompt_dependence_type_out = 2,
- ompt_dependence_type_inout = 3,
- ompt_dependence_type_mutexinoutset = 4,
- ompt_dependence_type_source = 5,
- ompt_dependence_type_sink = 6,
- ompt_dependence_type_inoutset = 7
+ ompt_dependence_type_in = 1,
+ ompt_dependence_type_out = 2,
+ ompt_dependence_type_inout = 3,
+ ompt_dependence_type_mutexinoutset = 4,
+ ompt_dependence_type_source = 5,
+ ompt_dependence_type_sink = 6,
+ ompt_dependence_type_inoutset = 7,
+ ompt_dependence_type_out_all_memory = 34,
+ ompt_dependence_type_inout_all_memory = 35
} ompt_dependence_type_t;
typedef enum ompt_severity_t {
@@ -1375,6 +1410,14 @@ typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
ompt_buffer_cursor_t current
);
+#ifdef _WIN32
+__declspec(dllexport)
+#else
+__attribute__((visibility("default")))
+#endif
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version);
+
#define ompt_id_none 0
#define ompt_data_none {0}
#define ompt_time_none 0
@@ -1385,4 +1428,8 @@ typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
#define ompd_segment_none 0
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
#endif /* __OMPT__ */
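
With ompt_start_tool now declared (and marked for export) by this header, a minimal first-party tool reduces to the sketch below. The callback and type names come from omp-tools.h itself; on_thread_begin is a hypothetical callback and error handling is omitted, so treat this as an illustration rather than a reference tool.

#include <omp-tools.h>
#include <cstdio>

// Hypothetical callback: print a line whenever a runtime thread starts.
static void on_thread_begin(ompt_thread_t thread_type,
                            ompt_data_t *thread_data) {
  (void)thread_data;
  std::printf("OMPT: thread begin, type=%d\n", (int)thread_type);
}

static int tool_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
  (void)initial_device_num;
  (void)tool_data;
  ompt_set_callback_t set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  set_callback(ompt_callback_thread_begin, (ompt_callback_t)on_thread_begin);
  return 1; // non-zero keeps the tool active
}

static void tool_finalize(ompt_data_t *tool_data) { (void)tool_data; }

extern "C" ompt_start_tool_result_t *
ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
  (void)omp_version;
  (void)runtime_version;
  static ompt_start_tool_result_t result = {tool_initialize, tool_finalize,
                                            ompt_data_none};
  return &result;
}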
diff --git a/contrib/libs/cxxsupp/openmp/omp.h b/contrib/libs/cxxsupp/openmp/omp.h
index 959e87359dc..c5bfd6ec543 100644
--- a/contrib/libs/cxxsupp/openmp/omp.h
+++ b/contrib/libs/cxxsupp/openmp/omp.h
@@ -15,13 +15,8 @@
#ifndef __OMP_H
# define __OMP_H
-#if 0 // !defined(NORUNTIME) && !defined(USE_STL_SYSTEM)
- // We need to put all possible dependencies to prevent blinking:
- // on all stdlib.h that can be mentioned here within a platform.
-# include <contrib/libs/cxxsupp/libcxx/include/stdlib.h>
-#else
+# include <stddef.h>
# include <stdlib.h>
-#endif
# include <stdint.h>
# define KMP_VERSION_MAJOR 5
@@ -161,6 +156,8 @@
/* OpenMP 5.1 interop */
typedef intptr_t omp_intptr_t;
+ extern void __KAI_KMPC_CONVENTION ompx_dump_mapping_tables(void);
+
/* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined properties */
typedef enum omp_interop_property {
omp_ipr_fr_id = -1,
@@ -242,6 +239,11 @@
extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *,
const size_t *, const size_t *, const size_t *, const size_t *, int, int,
int, omp_depend_t *);
+
+ /* OpenMP 6.0 device memory routines */
+ extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int);
+ extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *);
+
/*!
* The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device.
*/
@@ -474,7 +476,8 @@
typedef enum omp_pause_resource_t {
omp_pause_resume = 0,
omp_pause_soft = 1,
- omp_pause_hard = 2
+ omp_pause_hard = 2,
+ omp_pause_stop_tool = 3
} omp_pause_resource_t;
extern int __KAI_KMPC_CONVENTION omp_pause_resource(omp_pause_resource_t, int);
extern int __KAI_KMPC_CONVENTION omp_pause_resource_all(omp_pause_resource_t);
@@ -503,7 +506,7 @@
extern int __KAI_KMPC_CONVENTION omp_in_explicit_task(void);
/* LLVM Extensions */
- extern void *llvm_omp_target_dynamic_shared_alloc();
+ extern void *llvm_omp_target_dynamic_shared_alloc(void);
# undef __KAI_KMPC_CONVENTION
# undef __KMP_IMP
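
As a quick, hedged smoke test of the refreshed header surface, the snippet below touches one visible addition, the omp_pause_stop_tool enumerator. The program is illustrative only (its exact effect depends on the runtime and the OpenMP specification) and must be compiled with OpenMP enabled, e.g. -fopenmp.

#include <omp.h>
#include <cstdio>

int main() {
  #pragma omp parallel
  {
    // Warm up the runtime so there is something to pause.
  }

  // omp_pause_stop_tool is the enumerator added in the hunk above (value 3);
  // omp_pause_resource_all returns 0 on success.
  int rc = omp_pause_resource_all(omp_pause_stop_tool);
  std::printf("omp_pause_resource_all(omp_pause_stop_tool) -> %d\n", rc);
  return 0;
}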
diff --git a/contrib/libs/cxxsupp/openmp/ompt-event-specific.h b/contrib/libs/cxxsupp/openmp/ompt-event-specific.h
index f6c7022c8f6..7736ba85316 100644
--- a/contrib/libs/cxxsupp/openmp/ompt-event-specific.h
+++ b/contrib/libs/cxxsupp/openmp/ompt-event-specific.h
@@ -55,19 +55,18 @@
#define ompt_callback_implicit_task_implemented ompt_event_MAY_ALWAYS
-#define ompt_callback_target_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_emi_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_data_op_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_data_op_emi_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_submit_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_submit_emi_implemented ompt_event_UNIMPLEMENTED
-
+#define ompt_callback_target_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_emi_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_data_op_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_data_op_emi_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_submit_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_submit_emi_implemented ompt_event_MAY_ALWAYS
#define ompt_callback_control_tool_implemented ompt_event_MAY_ALWAYS
-#define ompt_callback_device_initialize_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_device_finalize_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_device_initialize_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_device_finalize_implemented ompt_event_MAY_ALWAYS
-#define ompt_callback_device_load_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_device_load_implemented ompt_event_MAY_ALWAYS
#define ompt_callback_device_unload_implemented ompt_event_UNIMPLEMENTED
/*----------------------------------------------------------------------------
diff --git a/contrib/libs/cxxsupp/openmp/ompt-general.cpp b/contrib/libs/cxxsupp/openmp/ompt-general.cpp
index 0bee7e77c81..cd738f066fc 100644
--- a/contrib/libs/cxxsupp/openmp/ompt-general.cpp
+++ b/contrib/libs/cxxsupp/openmp/ompt-general.cpp
@@ -10,10 +10,11 @@
//
//===----------------------------------------------------------------------===//
+#include "kmp_utils.h"
+
/*****************************************************************************
* system include files
****************************************************************************/
-
#include <assert.h>
#include <stdint.h>
@@ -104,12 +105,17 @@ static ompt_start_tool_result_t *ompt_start_tool_result = NULL;
#if KMP_OS_WINDOWS
static HMODULE ompt_tool_module = NULL;
+static HMODULE ompt_archer_module = NULL;
#define OMPT_DLCLOSE(Lib) FreeLibrary(Lib)
#else
static void *ompt_tool_module = NULL;
+static void *ompt_archer_module = NULL;
#define OMPT_DLCLOSE(Lib) dlclose(Lib)
#endif
+/// Used to track the initializer and the finalizer provided by libomptarget
+static ompt_start_tool_result_t *libomptarget_ompt_result = NULL;
+
/*****************************************************************************
* forward declarations
****************************************************************************/
@@ -371,6 +377,7 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) {
"Tool was started and is using the OMPT interface.\n");
OMPT_VERBOSE_INIT_PRINT(
"----- END LOGGING OF TOOL REGISTRATION -----\n");
+ ompt_archer_module = h;
return ret;
}
OMPT_VERBOSE_INIT_CONTINUED_PRINT(
@@ -378,6 +385,7 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) {
} else {
OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", dlerror());
}
+ OMPT_DLCLOSE(h);
}
}
#endif
@@ -456,7 +464,7 @@ void ompt_pre_init() {
if (verbose_init && verbose_file != stderr && verbose_file != stdout)
fclose(verbose_file);
#if OMPT_DEBUG
- printf("ompt_pre_init(): ompt_enabled = %d\n", ompt_enabled);
+ printf("ompt_pre_init(): ompt_enabled = %d\n", ompt_enabled.enabled);
#endif
}
@@ -495,8 +503,8 @@ void ompt_post_init() {
ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
ompt_thread_initial, __ompt_get_thread_data_internal());
}
- ompt_data_t *task_data;
- ompt_data_t *parallel_data;
+ ompt_data_t *task_data = nullptr;
+ ompt_data_t *parallel_data = nullptr;
__ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
NULL);
if (ompt_enabled.ompt_callback_implicit_task) {
@@ -509,14 +517,17 @@ void ompt_post_init() {
}
void ompt_fini() {
- if (ompt_enabled.enabled
-#if OMPD_SUPPORT
- && ompt_start_tool_result && ompt_start_tool_result->finalize
-#endif
- ) {
- ompt_start_tool_result->finalize(&(ompt_start_tool_result->tool_data));
+ if (ompt_enabled.enabled) {
+ if (ompt_start_tool_result && ompt_start_tool_result->finalize) {
+ ompt_start_tool_result->finalize(&(ompt_start_tool_result->tool_data));
+ }
+ if (libomptarget_ompt_result && libomptarget_ompt_result->finalize) {
+ libomptarget_ompt_result->finalize(NULL);
+ }
}
+ if (ompt_archer_module)
+ OMPT_DLCLOSE(ompt_archer_module);
if (ompt_tool_module)
OMPT_DLCLOSE(ompt_tool_module);
memset(&ompt_enabled, 0, sizeof(ompt_enabled));
@@ -687,7 +698,7 @@ OMPT_API_ROUTINE int ompt_get_num_places(void) {
#else
if (!KMP_AFFINITY_CAPABLE())
return 0;
- return __kmp_affinity_num_masks;
+ return __kmp_affinity.num_masks;
#endif
}
@@ -698,16 +709,16 @@ OMPT_API_ROUTINE int ompt_get_place_proc_ids(int place_num, int ids_size,
return 0;
#else
int i, count;
- int tmp_ids[ids_size];
+ SimpleVLA<int> tmp_ids(ids_size);
for (int j = 0; j < ids_size; j++)
tmp_ids[j] = 0;
if (!KMP_AFFINITY_CAPABLE())
return 0;
- if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
+ if (place_num < 0 || place_num >= (int)__kmp_affinity.num_masks)
return 0;
/* TODO: Is this safe for asynchronous call from signal handler during runtime
* shutdown? */
- kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
+ kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity.masks, place_num);
count = 0;
KMP_CPU_SET_ITERATE(i, mask) {
if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
@@ -869,5 +880,58 @@ static ompt_interface_fn_t ompt_fn_lookup(const char *s) {
FOREACH_OMPT_INQUIRY_FN(ompt_interface_fn)
+#undef ompt_interface_fn
+
return NULL;
}
+
+static ompt_data_t *ompt_get_task_data() { return __ompt_get_task_data(); }
+
+static ompt_data_t *ompt_get_target_task_data() {
+ return __ompt_get_target_task_data();
+}
+
+/// Lookup function to query libomp callbacks registered by the tool
+static ompt_interface_fn_t ompt_libomp_target_fn_lookup(const char *s) {
+#define provide_fn(fn) \
+ if (strcmp(s, #fn) == 0) \
+ return (ompt_interface_fn_t)fn;
+
+ provide_fn(ompt_get_callback);
+ provide_fn(ompt_get_task_data);
+ provide_fn(ompt_get_target_task_data);
+#undef provide_fn
+
+#define ompt_interface_fn(fn, type, code) \
+ if (strcmp(s, #fn) == 0) \
+ return (ompt_interface_fn_t)ompt_callbacks.ompt_callback(fn);
+
+ FOREACH_OMPT_DEVICE_EVENT(ompt_interface_fn)
+ FOREACH_OMPT_EMI_EVENT(ompt_interface_fn)
+ FOREACH_OMPT_NOEMI_EVENT(ompt_interface_fn)
+#undef ompt_interface_fn
+
+ return (ompt_interface_fn_t)0;
+}
+
+/// This function is called by the libomptarget connector to assign
+/// callbacks already registered with libomp.
+_OMP_EXTERN void ompt_libomp_connect(ompt_start_tool_result_t *result) {
+ OMPT_VERBOSE_INIT_PRINT("libomp --> OMPT: Enter ompt_libomp_connect\n");
+
+ // Ensure libomp callbacks have been added if not already
+ __ompt_force_initialization();
+
+ if (ompt_enabled.enabled && result) {
+ OMPT_VERBOSE_INIT_PRINT("libomp --> OMPT: Connecting with libomptarget\n");
+ // Pass in the libomp lookup function so that the already registered
+ // functions can be extracted and assigned to the callbacks in
+ // libomptarget
+ result->initialize(ompt_libomp_target_fn_lookup,
+ /* initial_device_num */ 0, /* tool_data */ nullptr);
+ // Track the object provided by libomptarget so that the finalizer can be
+ // called during OMPT finalization
+ libomptarget_ompt_result = result;
+ }
+ OMPT_VERBOSE_INIT_PRINT("libomp --> OMPT: Exit ompt_libomp_connect\n");
+}
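
For context, a hedged sketch of the other side of this handshake: a libomptarget-style connector fills an ompt_start_tool_result_t and passes it to ompt_libomp_connect; libomp then invokes initialize() with ompt_libomp_target_fn_lookup and, at shutdown, the stored finalize(). All demo_* names below are hypothetical; the real connector lives in libomptarget, not in this commit.

#include <omp-tools.h>

extern "C" void ompt_libomp_connect(ompt_start_tool_result_t *result);

// Hypothetical connector-side initializer: libomp calls it with its own
// lookup function, through which the helpers registered above
// (e.g. ompt_get_task_data) and the device/EMI callbacks can be retrieved.
static int demo_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
  typedef ompt_data_t *(*get_task_data_fn)(void);
  get_task_data_fn get_task_data =
      (get_task_data_fn)lookup("ompt_get_task_data");
  (void)get_task_data;
  (void)initial_device_num;
  (void)tool_data;
  return 1;
}

// Called from ompt_fini above with a NULL argument.
static void demo_finalize(ompt_data_t *tool_data) { (void)tool_data; }

static ompt_start_tool_result_t demo_result = {demo_initialize, demo_finalize,
                                               ompt_data_none};

void demo_connect(void) { ompt_libomp_connect(&demo_result); }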
diff --git a/contrib/libs/cxxsupp/openmp/ompt-internal.h b/contrib/libs/cxxsupp/openmp/ompt-internal.h
index a85fe3835c6..580a7c2ac79 100644
--- a/contrib/libs/cxxsupp/openmp/ompt-internal.h
+++ b/contrib/libs/cxxsupp/openmp/ompt-internal.h
@@ -50,6 +50,10 @@ typedef struct ompt_callbacks_active_s {
: 0x0) | \
((!(info->td_flags.tiedness)) ? ompt_task_untied : 0x0) | \
(info->td_flags.final ? ompt_task_final : 0x0) | \
+ (info->td_flags.target \
+ ? ompt_task_target \
+ : (info->td_flags.tasktype ? ompt_task_explicit \
+ : ompt_task_implicit)) | \
(info->td_flags.merged_if0 ? ompt_task_mergeable : 0x0)
typedef struct {
@@ -76,6 +80,7 @@ typedef struct {
ompt_data_t thread_data;
ompt_data_t task_data; /* stored here from implicit barrier-begin until
implicit-task-end */
+ ompt_data_t target_task_data; /* required by target support */
void *return_address; /* stored here on entry of runtime */
ompt_state_t state;
ompt_wait_id_t wait_id;
diff --git a/contrib/libs/cxxsupp/openmp/ompt-specific.cpp b/contrib/libs/cxxsupp/openmp/ompt-specific.cpp
index c28b9bd1a66..0737c0cdfb1 100644
--- a/contrib/libs/cxxsupp/openmp/ompt-specific.cpp
+++ b/contrib/libs/cxxsupp/openmp/ompt-specific.cpp
@@ -188,6 +188,11 @@ ompt_task_info_t *__ompt_get_scheduling_taskinfo(int depth) {
//******************************************************************************
// interface operations
//******************************************************************************
+//----------------------------------------------------------
+// initialization support
+//----------------------------------------------------------
+
+void __ompt_force_initialization() { __kmp_serial_initialize(); }
//----------------------------------------------------------
// thread support
@@ -339,6 +344,16 @@ void __ompt_lw_taskteam_unlink(kmp_info_t *thr) {
// task support
//----------------------------------------------------------
+ompt_data_t *__ompt_get_task_data() {
+ kmp_info_t *thr = ompt_get_thread();
+ ompt_data_t *task_data = thr ? OMPT_CUR_TASK_DATA(thr) : NULL;
+ return task_data;
+}
+
+ompt_data_t *__ompt_get_target_task_data() {
+ return &__kmp_threads[__kmp_get_gtid()]->th.ompt_thread_info.target_task_data;
+}
+
int __ompt_get_task_info_internal(int ancestor_level, int *type,
ompt_data_t **task_data,
ompt_frame_t **task_frame,
@@ -406,9 +421,7 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
team_info = &team->t.ompt_team_info;
if (type) {
if (taskdata->td_parent) {
- *type = (taskdata->td_flags.tasktype ? ompt_task_explicit
- : ompt_task_implicit) |
- TASK_TYPE_DETAILS_FORMAT(taskdata);
+ *type = TASK_TYPE_DETAILS_FORMAT(taskdata);
} else {
*type = ompt_task_initial;
}
@@ -448,6 +461,7 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
}
int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) {
+ *size = 0;
if (blocknum != 0)
return 0; // support only a single block
@@ -456,27 +470,13 @@ int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) {
return 0;
kmp_taskdata_t *taskdata = thr->th.th_current_task;
- kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
if (taskdata->td_flags.tasktype != TASK_EXPLICIT)
return 0; // support only explicit task
- void *ret_addr;
- int64_t ret_size = taskdata->td_size_alloc - sizeof(kmp_taskdata_t);
-
- // kmp_task_t->data1 is an optional member
- if (taskdata->td_flags.destructors_thunk)
- ret_addr = &task->data1 + 1;
- else
- ret_addr = &task->part_id + 1;
-
- ret_size -= (char *)(ret_addr) - (char *)(task);
- if (ret_size < 0)
- return 0;
-
- *addr = ret_addr;
- *size = (size_t)ret_size;
- return 1;
+ *addr = taskdata;
+ *size = taskdata->td_size_alloc;
+ return 0;
}
//----------------------------------------------------------
@@ -503,22 +503,23 @@ static uint64_t __ompt_get_unique_id_internal() {
ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type bt,
kmp_info_t *thr) {
- if (bt == bs_forkjoin_barrier)
- return ompt_sync_region_barrier_implicit;
+ if (bt == bs_forkjoin_barrier) {
+ if (thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
+ return ompt_sync_region_barrier_teams;
+ else
+ return ompt_sync_region_barrier_implicit_parallel;
+ }
- if (bt != bs_plain_barrier)
+ if (bt != bs_plain_barrier || !thr->th.th_ident)
return ompt_sync_region_barrier_implementation;
- if (!thr->th.th_ident)
- return ompt_sync_region_barrier;
-
kmp_int32 flags = thr->th.th_ident->flags;
if ((flags & KMP_IDENT_BARRIER_EXPL) != 0)
return ompt_sync_region_barrier_explicit;
if ((flags & KMP_IDENT_BARRIER_IMPL) != 0)
- return ompt_sync_region_barrier_implicit;
+ return ompt_sync_region_barrier_implicit_workshare;
return ompt_sync_region_barrier_implementation;
}
diff --git a/contrib/libs/cxxsupp/openmp/ompt-specific.h b/contrib/libs/cxxsupp/openmp/ompt-specific.h
index bd1e0d8991e..7864ed6126c 100644
--- a/contrib/libs/cxxsupp/openmp/ompt-specific.h
+++ b/contrib/libs/cxxsupp/openmp/ompt-specific.h
@@ -20,6 +20,10 @@
* forward declarations
****************************************************************************/
+/// Entrypoint used by libomptarget to register callbacks in libomp, if not
+/// done already
+void __ompt_force_initialization();
+
void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid);
void __ompt_thread_assign_wait_id(void *variable);
@@ -33,6 +37,10 @@ void __ompt_lw_taskteam_unlink(kmp_info_t *thr);
ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size);
+ompt_data_t *__ompt_get_task_data();
+
+ompt_data_t *__ompt_get_target_task_data();
+
ompt_task_info_t *__ompt_get_task_info_object(int depth);
int __ompt_get_parallel_info_internal(int ancestor_level,
@@ -57,12 +65,12 @@ ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type, kmp_info_t *);
* macros
****************************************************************************/
-#define OMPT_CUR_TASK_INFO(thr) (&(thr->th.th_current_task->ompt_task_info))
+#define OMPT_CUR_TASK_INFO(thr) (&((thr)->th.th_current_task->ompt_task_info))
#define OMPT_CUR_TASK_DATA(thr) \
- (&(thr->th.th_current_task->ompt_task_info.task_data))
-#define OMPT_CUR_TEAM_INFO(thr) (&(thr->th.th_team->t.ompt_team_info))
+ (&((thr)->th.th_current_task->ompt_task_info.task_data))
+#define OMPT_CUR_TEAM_INFO(thr) (&((thr)->th.th_team->t.ompt_team_info))
#define OMPT_CUR_TEAM_DATA(thr) \
- (&(thr->th.th_team->t.ompt_team_info.parallel_data))
+ (&((thr)->th.th_team->t.ompt_team_info.parallel_data))
#define OMPT_HAVE_WEAK_ATTRIBUTE KMP_HAVE_WEAK_ATTRIBUTE
#define OMPT_HAVE_PSAPI KMP_HAVE_PSAPI
@@ -122,6 +130,25 @@ inline const char *ompt_get_runtime_version() {
return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN];
}
+inline ompt_work_t ompt_get_work_schedule(enum sched_type schedule) {
+ switch (SCHEDULE_WITHOUT_MODIFIERS(schedule)) {
+ case kmp_sch_static_chunked:
+ case kmp_sch_static_balanced:
+ case kmp_sch_static_greedy:
+ return ompt_work_loop_static;
+ case kmp_sch_dynamic_chunked:
+ case kmp_sch_static_steal:
+ return ompt_work_loop_dynamic;
+ case kmp_sch_guided_iterative_chunked:
+ case kmp_sch_guided_analytical_chunked:
+ case kmp_sch_guided_chunked:
+ case kmp_sch_guided_simd:
+ return ompt_work_loop_guided;
+ default:
+ return ompt_work_loop_other;
+ }
+}
+
class OmptReturnAddressGuard {
private:
bool SetAddress{false};
diff --git a/contrib/libs/cxxsupp/openmp/patches/fix_stdlib_resolving.patch b/contrib/libs/cxxsupp/openmp/patches/fix_stdlib_resolving.patch
deleted file mode 100644
index 72ec8d4e792..00000000000
--- a/contrib/libs/cxxsupp/openmp/patches/fix_stdlib_resolving.patch
+++ /dev/null
@@ -1,18 +0,0 @@
-diff --git a/omp.h b/omp.h
-index f2e6345..cb2fe49 100644
---- a/omp.h
-+++ b/omp.h
-@@ -15,7 +15,13 @@
- #ifndef __OMP_H
- # define __OMP_H
-
-+#if 0 // !defined(NORUNTIME) && !defined(USE_STL_SYSTEM)
-+ // We need to put all possible dependencies to prevent blinking:
-+ // on all stdlib.h that can be mentioned here within a platform.
-+# include <contrib/libs/cxxsupp/libcxx/include/stdlib.h>
-+#else
- # include <stdlib.h>
-+#endif
- # include <stdint.h>
-
- # define KMP_VERSION_MAJOR 5
diff --git a/contrib/libs/cxxsupp/openmp/patches/remove_generation_date.sh b/contrib/libs/cxxsupp/openmp/patches/remove_generation_date.sh
new file mode 100644
index 00000000000..47c3aff76c0
--- /dev/null
+++ b/contrib/libs/cxxsupp/openmp/patches/remove_generation_date.sh
@@ -0,0 +1,7 @@
+replace_date() {
+ sed -e 's|// The file was generated from en_US.txt by message-converter.py on .* //|// The file was generated from en_US.txt by message-converter.py on Fri Jul 11 21:54:37 2025 (fixed date by patch) //|' -i $1
+}
+
+
+replace_date "kmp_i18n_id.inc"
+replace_date "kmp_i18n_default.inc"
diff --git a/contrib/libs/cxxsupp/openmp/ya.make b/contrib/libs/cxxsupp/openmp/ya.make
index 5ab68e3978b..8a2be02f1eb 100644
--- a/contrib/libs/cxxsupp/openmp/ya.make
+++ b/contrib/libs/cxxsupp/openmp/ya.make
@@ -12,9 +12,9 @@ LICENSE(
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-VERSION(15.0.7)
+VERSION(20.1.7)
-ORIGINAL_SOURCE(https://github.com/llvm/llvm-project/archive/llvmorg-15.0.7.tar.gz)
+ORIGINAL_SOURCE(https://github.com/llvm/llvm-project/archive/llvmorg-20.1.7.tar.gz)
ADDINCL(
GLOBAL contrib/libs/cxxsupp/openmp
@@ -63,6 +63,7 @@ SRCS(
kmp_atomic.cpp
kmp_barrier.cpp
kmp_cancel.cpp
+ kmp_collapse.cpp
kmp_csupport.cpp
kmp_debug.cpp
kmp_dispatch.cpp
diff --git a/contrib/libs/cxxsupp/openmp/z_Linux_asm.S b/contrib/libs/cxxsupp/openmp/z_Linux_asm.S
index b4a45c1ac6f..0bf9f07a13f 100644
--- a/contrib/libs/cxxsupp/openmp/z_Linux_asm.S
+++ b/contrib/libs/cxxsupp/openmp/z_Linux_asm.S
@@ -19,6 +19,16 @@
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+# if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+# if __has_include(<cet.h>)
+# include <cet.h>
+# endif
+# endif
+
+# if !defined(_CET_ENDBR)
+# define _CET_ENDBR
+# endif
+
# if KMP_MIC
// the 'delay r16/r32/r64' should be used instead of the 'pause'.
// The delay operation has the effect of removing the current thread from
@@ -66,6 +76,7 @@
ALIGN 4
.globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
+ _CET_ENDBR
.endmacro
# else // KMP_OS_DARWIN
# define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
@@ -92,6 +103,7 @@ KMP_PREFIX_UNDERSCORE($0):
.globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
.cfi_startproc
+ _CET_ENDBR
.endm
.macro KMP_CFI_DEF_OFFSET sz
.cfi_def_cfa_offset \sz
@@ -108,7 +120,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
# endif // KMP_OS_DARWIN
#endif // KMP_ARCH_X86 || KMP_ARCH_x86_64
-#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
+#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
# if KMP_OS_DARWIN
# define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
@@ -129,7 +141,25 @@ KMP_PREFIX_UNDERSCORE(\proc):
.globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
-# else // KMP_OS_DARWIN
+# elif KMP_OS_WINDOWS
+# define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Windows/ARM64 symbols
+// Format labels so that they don't override function names in gdb's backtraces
+# define KMP_LABEL(x) .L_##x // local label hidden from backtraces
+
+.macro ALIGN size
+ .align 1<<(\size)
+.endm
+
+.macro DEBUG_INFO proc
+ ALIGN 2
+.endm
+
+.macro PROC proc
+ ALIGN 2
+ .globl KMP_PREFIX_UNDERSCORE(\proc)
+KMP_PREFIX_UNDERSCORE(\proc):
+.endm
+# else // KMP_OS_DARWIN || KMP_OS_WINDOWS
# define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Linux* OS symbols
// Format labels so that they don't override function names in gdb's backtraces
# define KMP_LABEL(x) .L_##x // local label hidden from backtraces
@@ -142,7 +172,11 @@ KMP_PREFIX_UNDERSCORE($0):
.cfi_endproc
// Not sure why we need .type and .size for the functions
ALIGN 2
+#if KMP_ARCH_ARM
+ .type \proc,%function
+#else
.type \proc,@function
+#endif
.size \proc,.-\proc
.endm
@@ -154,7 +188,64 @@ KMP_PREFIX_UNDERSCORE(\proc):
.endm
# endif // KMP_OS_DARWIN
-#endif // (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
+# if KMP_OS_LINUX
+// BTI and PAC gnu property note
+# define NT_GNU_PROPERTY_TYPE_0 5
+# define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
+# define GNU_PROPERTY_AARCH64_FEATURE_1_BTI 1
+# define GNU_PROPERTY_AARCH64_FEATURE_1_PAC 2
+
+# define GNU_PROPERTY(type, value) \
+ .pushsection .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word NT_GNU_PROPERTY_TYPE_0; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .popsection
+# endif
+
+# if defined(__ARM_FEATURE_BTI_DEFAULT)
+# define BTI_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_BTI
+# else
+# define BTI_FLAG 0
+# endif
+# if __ARM_FEATURE_PAC_DEFAULT & 3
+# define PAC_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_PAC
+# else
+# define PAC_FLAG 0
+# endif
+
+# if (BTI_FLAG | PAC_FLAG) != 0
+# if PAC_FLAG != 0
+# define PACBTI_C hint #25
+# define PACBTI_RET hint #29
+# else
+# define PACBTI_C hint #34
+# define PACBTI_RET
+# endif
+# define GNU_PROPERTY_BTI_PAC \
+ GNU_PROPERTY(GNU_PROPERTY_AARCH64_FEATURE_1_AND, BTI_FLAG | PAC_FLAG)
+# else
+# define PACBTI_C
+# define PACBTI_RET
+# define GNU_PROPERTY_BTI_PAC
+# endif
+#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
+
+.macro COMMON name, size, align_power
+#if KMP_OS_DARWIN
+ .comm \name, \size
+#elif KMP_OS_WINDOWS
+ .comm \name, \size, \align_power
+#else // !KMP_OS_DARWIN && !KMP_OS_WINDOWS
+ .comm \name, \size, (1<<(\align_power))
+#endif
+.endm
// -----------------------------------------------------------------------
// data
@@ -1118,6 +1209,9 @@ KMP_LABEL(kmp_invoke_pass_parms): // put 1st - 6th parms to pkfn in registers.
movq %rdi, %rbx // pkfn -> %rbx
leaq __gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
leaq __tid(%rbp), %rsi // &tid -> %rsi (store 2nd parm to pkfn)
+ // Check if argc is 0
+ cmpq $0, %rax
+ je KMP_LABEL(kmp_no_args) // Jump ahead
movq %r8, %r11 // p_argv -> %r11
@@ -1163,6 +1257,7 @@ KMP_LABEL(kmp_1_exit):
cmovnsq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn)
#endif // KMP_MIC
+KMP_LABEL(kmp_no_args):
call *%rbx // call (*pkfn)();
movq $1, %rax // move 1 into return register;
@@ -1204,7 +1299,7 @@ KMP_LABEL(kmp_1_exit):
#endif /* KMP_ARCH_X86_64 */
// '
-#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
+#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32)
//------------------------------------------------------------------------
// int
@@ -1260,6 +1355,7 @@ __tid = 8
// mark_begin;
.text
PROC __kmp_invoke_microtask
+ PACBTI_C
stp x29, x30, [sp, #-16]!
# if OMPT_SUPPORT
@@ -1323,12 +1419,158 @@ KMP_LABEL(kmp_1):
ldp x19, x20, [sp], #16
# endif
ldp x29, x30, [sp], #16
+ PACBTI_RET
ret
DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask
-#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */
+#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32) */
+
+#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM
+
+//------------------------------------------------------------------------
+// int
+// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
+// int gtid, int tid,
+// int argc, void *p_argv[]
+// #if OMPT_SUPPORT
+// ,
+// void **exit_frame_ptr
+// #endif
+// ) {
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+// (*pkfn)( & gtid, & tid, argv[0], ... );
+//
+// // FIXME: This is done at call-site and can be removed here.
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = 0;
+// #endif
+//
+// return 1;
+// }
+//
+// parameters:
+// r0: pkfn
+// r1: gtid
+// r2: tid
+// r3: argc
+// r4(stack): p_argv
+// r5(stack): &exit_frame
+//
+// locals:
+// __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
+// __tid: tid parm pushed on stack so can pass &tid to pkfn
+//
+// reg temps:
+// r4: used to hold pkfn address
+// r5: used as temporary for number of pkfn parms
+// r6: used to traverse p_argv array
+// r7: frame pointer (in some configurations)
+// r8: used as temporary for stack placement calculation
+// and as pointer to base of callee saved area
+// r9: used as temporary for stack parameters
+// r10: used to preserve exit_frame_ptr, callee-save
+// r11: frame pointer (in some configurations)
+//
+// return: r0 (always 1/TRUE)
+//
+
+__gtid = 4
+__tid = 8
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+ .text
+ PROC __kmp_invoke_microtask
+
+ // Pushing one extra register (r3) to keep the stack aligned
+ // for when we call pkfn below
+ push {r3-r11,lr}
+ // Load p_argv and &exit_frame
+ ldr r4, [sp, #10*4]
+# if OMPT_SUPPORT
+ ldr r5, [sp, #11*4]
+# endif
+
+# if KMP_OS_DARWIN || (defined(__thumb__) && !KMP_OS_WINDOWS)
+# define FP r7
+# define FPOFF 4*4
+#else
+# define FP r11
+# define FPOFF 8*4
+#endif
+ add FP, sp, #FPOFF
+# if OMPT_SUPPORT
+ mov r10, r5
+ str FP, [r10]
+# endif
+ mov r8, sp
+
+ // Calculate how much stack to allocate, in increments of 8 bytes.
+ // We strictly need 4*(argc-2) bytes (2 arguments are passed in
+ // registers) but allocate 4*argc for simplicity (to avoid needing
+ // to handle the argc<2 cases). We align the number of bytes
+ // allocated to 8 bytes, to keep the stack aligned. (Since we
+ // already allocate more than enough, it's ok to round down
+ // instead of up for the alignment.) We allocate another extra
+ // 8 bytes for gtid and tid.
+ mov r5, #1
+ add r5, r5, r3, lsr #1
+ sub sp, sp, r5, lsl #3
+
+ str r1, [r8, #-__gtid]
+ str r2, [r8, #-__tid]
+ mov r5, r3
+ mov r6, r4
+ mov r4, r0
+
+ // Prepare the first 2 parameters to pkfn - pointers to gtid and tid
+ // in our stack frame.
+ sub r0, r8, #__gtid
+ sub r1, r8, #__tid
+
+ mov r8, sp
+
+ // Load p_argv[0] and p_argv[1] into r2 and r3, if argc >= 1/2
+ cmp r5, #0
+ beq KMP_LABEL(kmp_1)
+ ldr r2, [r6]
+
+ subs r5, r5, #1
+ beq KMP_LABEL(kmp_1)
+ ldr r3, [r6, #4]!
+
+ // Loop, loading the rest of p_argv and writing the elements on the
+ // stack.
+KMP_LABEL(kmp_0):
+ subs r5, r5, #1
+ beq KMP_LABEL(kmp_1)
+ ldr r12, [r6, #4]!
+ str r12, [r8], #4
+ b KMP_LABEL(kmp_0)
+KMP_LABEL(kmp_1):
+ blx r4
+ mov r0, #1
+
+ sub r4, FP, #FPOFF
+ mov sp, r4
+# undef FP
+# undef FPOFF
+
+# if OMPT_SUPPORT
+ mov r1, #0
+ str r1, [r10]
+# endif
+ pop {r3-r11,pc}
+
+ DEBUG_INFO __kmp_invoke_microtask
+// -- End __kmp_invoke_microtask
+
+#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */
#if KMP_ARCH_PPC64
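
For readers following the new ARM path, here is a hedged C-level sketch of the contract the assembly above implements. The prototype is taken from the comment blocks in this file; with OMPT_SUPPORT the real function takes a trailing void **exit_frame_ptr, which is omitted here, and demo_microtask/demo_call are hypothetical names (the internal symbol is not normally linked against directly).

// Sketch only: mirrors the comment-block signature above, non-OMPT form.
typedef void (*microtask_t)(int *gtid, int *tid, ...);

extern "C" int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid,
                                      int argc, void *p_argv[]);

// Hypothetical outlined parallel-region body: receives &gtid, &tid, then
// the entries of p_argv as its remaining arguments.
static void demo_microtask(int *gtid, int *tid, void *arg0) {
  (void)gtid;
  (void)tid;
  (void)arg0;
}

int demo_call(void) {
  int shared_value = 42;
  void *argv[] = {&shared_value};
  // The assembly stores gtid/tid in its own stack frame, passes their
  // addresses as the first two parameters, copies argv[0..argc-1] into the
  // remaining parameter slots, calls pkfn, and always returns 1.
  return __kmp_invoke_microtask((microtask_t)demo_microtask, /*gtid=*/0,
                                /*tid=*/0, /*argc=*/1, argv);
}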
@@ -1725,23 +1967,533 @@ __kmp_invoke_microtask:
#endif /* KMP_ARCH_RISCV64 */
-#if KMP_ARCH_ARM || KMP_ARCH_MIPS
+#if KMP_ARCH_LOONGARCH64
+
+//------------------------------------------------------------------------
+//
+// typedef void (*microtask_t)(int *gtid, int *tid, ...);
+//
+// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+// void *p_argv[]
+// #if OMPT_SUPPORT
+// ,
+// void **exit_frame_ptr
+// #endif
+// ) {
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+// (*pkfn)(&gtid, &tid, argv[0], ...);
+//
+// return 1;
+// }
+//
+// Parameters:
+// a0: pkfn
+// a1: gtid
+// a2: tid
+// a3: argc
+// a4: p_argv
+// a5: exit_frame_ptr
+//
+// Locals:
+// __gtid: gtid param pushed on stack so can pass &gtid to pkfn
+// __tid: tid param pushed on stack so can pass &tid to pkfn
+//
+// Temp registers:
+//
+// t0: used to calculate the dynamic stack size / used to hold pkfn address
+// t1: used as temporary for stack placement calculation
+// t2: used as temporary for stack arguments
+// t3: used as temporary for number of remaining pkfn parms
+// t4: used to traverse p_argv array
+//
+// return: a0 (always 1/TRUE)
+//
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+ .text
+ .globl __kmp_invoke_microtask
+ .p2align 2
+ .type __kmp_invoke_microtask,@function
+__kmp_invoke_microtask:
+ .cfi_startproc
+
+ // First, save ra and fp
+ addi.d $sp, $sp, -16
+ st.d $ra, $sp, 8
+ st.d $fp, $sp, 0
+ addi.d $fp, $sp, 16
+ .cfi_def_cfa 22, 0
+ .cfi_offset 1, -8
+ .cfi_offset 22, -16
+
+ // Compute the dynamic stack size:
+ //
+ // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
+ // reference
+ // - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
+ // function by register. Given that we have 8 of such registers (a[0-7])
+ // and two + 'argc' arguments (consider &gtid and &tid), we need to
+ // reserve max(0, argc - 6)*8 extra bytes
+ //
+ // The total number of bytes is then max(0, argc - 6)*8 + 8
+
+ addi.d $t0, $a3, -6
+ slt $t1, $t0, $zero
+ masknez $t0, $t0, $t1
+ addi.d $t0, $t0, 1
+ slli.d $t0, $t0, 3
+ sub.d $sp, $sp, $t0
+
+ // Align the stack to 16 bytes
+ bstrins.d $sp, $zero, 3, 0
+
+ move $t0, $a0
+ move $t3, $a3
+ move $t4, $a4
+
+#if OMPT_SUPPORT
+ // Save frame pointer into exit_frame
+ st.d $fp, $a5, 0
+#endif
+
+ // Prepare arguments for the pkfn function (first 8 using a0-a7 registers)
+
+ st.w $a1, $fp, -20
+ st.w $a2, $fp, -24
+
+ addi.d $a0, $fp, -20
+ addi.d $a1, $fp, -24
+
+ beqz $t3, .L_kmp_3
+ ld.d $a2, $t4, 0
+
+ addi.d $t3, $t3, -1
+ beqz $t3, .L_kmp_3
+ ld.d $a3, $t4, 8
+
+ addi.d $t3, $t3, -1
+ beqz $t3, .L_kmp_3
+ ld.d $a4, $t4, 16
+
+ addi.d $t3, $t3, -1
+ beqz $t3, .L_kmp_3
+ ld.d $a5, $t4, 24
+
+ addi.d $t3, $t3, -1
+ beqz $t3, .L_kmp_3
+ ld.d $a6, $t4, 32
+
+ addi.d $t3, $t3, -1
+ beqz $t3, .L_kmp_3
+ ld.d $a7, $t4, 40
+
+ // Prepare any additional argument passed through the stack
+ addi.d $t4, $t4, 48
+ move $t1, $sp
+ b .L_kmp_2
+.L_kmp_1:
+ ld.d $t2, $t4, 0
+ st.d $t2, $t1, 0
+ addi.d $t4, $t4, 8
+ addi.d $t1, $t1, 8
+.L_kmp_2:
+ addi.d $t3, $t3, -1
+ bnez $t3, .L_kmp_1
+
+.L_kmp_3:
+ // Call pkfn function
+ jirl $ra, $t0, 0
+
+ // Restore stack and return
+
+ addi.d $a0, $zero, 1
+
+ addi.d $sp, $fp, -16
+ ld.d $fp, $sp, 0
+ ld.d $ra, $sp, 8
+ addi.d $sp, $sp, 16
+ jr $ra
+.Lfunc_end0:
+ .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
+ .cfi_endproc
+
+// -- End __kmp_invoke_microtask
+
+#endif /* KMP_ARCH_LOONGARCH64 */
+
+#if KMP_ARCH_VE
+
+//------------------------------------------------------------------------
+//
+// typedef void (*microtask_t)(int *gtid, int *tid, ...);
+//
+// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+// void *p_argv[]
+// #if OMPT_SUPPORT
+// ,
+// void **exit_frame_ptr
+// #endif
+// ) {
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+// (*pkfn)(&gtid, &tid, argv[0], ...);
+//
+// return 1;
+// }
+//
+// Parameters:
+// s0: pkfn
+// s1: gtid
+// s2: tid
+// s3: argc
+// s4: p_argv
+// s5: exit_frame_ptr
+//
+// Locals:
+// __gtid: gtid param pushed on stack so can pass &gtid to pkfn
+// __tid: tid param pushed on stack so can pass &tid to pkfn
+//
+// Temp. registers:
+//
+// s34: used to calculate the dynamic stack size
+// s35: used as temporary for stack placement calculation
+// s36: used as temporary for stack arguments
+// s37: used as temporary for number of remaining pkfn parms
+// s38: used to traverse p_argv array
+//
+// return: s0 (always 1/TRUE)
+//
+
+__gtid = -4
+__tid = -8
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+ .text
+ .globl __kmp_invoke_microtask
+  // Functions require 8-byte alignment.
+ .p2align 3
+ .type __kmp_invoke_microtask,@function
+__kmp_invoke_microtask:
+ .cfi_startproc
+
+  // First, save fp and lr. VE stores them in the caller's stack frame.
+ st %fp, 0(, %sp)
+ st %lr, 8(, %sp)
+ or %fp, 0, %sp
+ .cfi_def_cfa %fp, 0
+ .cfi_offset %lr, 8
+ .cfi_offset %fp, 0
+
+ // Compute the dynamic stack size:
+ //
+ // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them
+ // by reference
+  // - We need 8 bytes for every argument. We have 'argc' + 2
+  //   arguments (counting &gtid and &tid), so we need to reserve
+  //   (argc + 2) * 8 bytes.
+  // - We need 176 bytes for the RSA, the return address, and the frame
+  //   pointer (see the layout below)
+ //
+ // The total number of bytes is then (argc + 2) * 8 + 8 + 176.
+ //
+ // |------------------------------|
+ // | return address of callee | 8(%fp)
+ // |------------------------------|
+ // | frame pointer of callee | 0(%fp)
+ // |------------------------------| <------------------ %fp
+ // | __tid / __gtid | -8(%fp) / -4(%fp)
+ // |------------------------------|
+ // | argc+2 for arguments | 176(%sp)
+ // |------------------------------|
+ // | RSA |
+ // |------------------------------|
+ // | return address |
+ // |------------------------------|
+ // | frame pointer |
+ // |------------------------------| <------------------ %sp
+
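For orientation only (the function name is made up for this sketch), the frame
size described by the comment and layout above works out to:

    // VE reserves a slot for every argument (including &gtid and &tid,
    // because varargs are also materialized on the stack), 8 bytes for the
    // __gtid/__tid locals, and 176 bytes for the RSA, return address and
    // frame pointer slots.
    static inline long kmp_ve_frame_bytes(long argc) {
      return (argc + 2) * 8 + 8 + 176; // same as (argc + 2) * 8 + 184 below
    }
    // %sp is afterwards aligned down to a 16-byte boundary.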
+ adds.w.sx %s34, 2, %s3
+ sll %s34, %s34, 3
+ lea %s34, 184(, %s34)
+ subs.l %sp, %sp, %s34
+
+ // Align the stack to 16 bytes.
+ and %sp, -16, %sp
+
+ // Save pkfn.
+ or %s12, 0, %s0
+
+ // Call host to allocate stack if it is necessary.
+ brge.l %sp, %sl, .L_kmp_pass
+ ld %s61, 24(, %tp)
+ lea %s63, 0x13b
+ shm.l %s63, 0(%s61)
+ shm.l %sl, 8(%s61)
+ shm.l %sp, 16(%s61)
+ monc
+
+.L_kmp_pass:
+ lea %s35, 176(, %sp)
+ adds.w.sx %s37, 0, %s3
+ or %s38, 0, %s4
+
+#if OMPT_SUPPORT
+ // Save frame pointer into exit_frame.
+ st %fp, 0(%s5)
+#endif
+
+ // Prepare arguments for the pkfn function (first 8 using s0-s7
+ // registers, but need to store stack also because of varargs).
+
+ stl %s1, __gtid(%fp)
+ stl %s2, __tid(%fp)
+
+ adds.l %s0, __gtid, %fp
+ st %s0, 0(, %s35)
+ adds.l %s1, __tid, %fp
+ st %s1, 8(, %s35)
+
+ breq.l 0, %s37, .L_kmp_call
+ ld %s2, 0(, %s38)
+ st %s2, 16(, %s35)
+
+ breq.l 1, %s37, .L_kmp_call
+ ld %s3, 8(, %s38)
+ st %s3, 24(, %s35)
+
+ breq.l 2, %s37, .L_kmp_call
+ ld %s4, 16(, %s38)
+ st %s4, 32(, %s35)
+
+ breq.l 3, %s37, .L_kmp_call
+ ld %s5, 24(, %s38)
+ st %s5, 40(, %s35)
+
+ breq.l 4, %s37, .L_kmp_call
+ ld %s6, 32(, %s38)
+ st %s6, 48(, %s35)
+
+ breq.l 5, %s37, .L_kmp_call
+ ld %s7, 40(, %s38)
+ st %s7, 56(, %s35)
+
+ breq.l 6, %s37, .L_kmp_call
+
+ // Prepare any additional argument passed through the stack.
+ adds.l %s37, -6, %s37
+ lea %s38, 48(, %s38)
+ lea %s35, 64(, %s35)
+.L_kmp_loop:
+ ld %s36, 0(, %s38)
+ st %s36, 0(, %s35)
+ adds.l %s37, -1, %s37
+ adds.l %s38, 8, %s38
+ adds.l %s35, 8, %s35
+ brne.l 0, %s37, .L_kmp_loop
+
+.L_kmp_call:
+ // Call pkfn function.
+ bsic %lr, (, %s12)
+
+ // Return value.
+ lea %s0, 1
+
+ // Restore stack and return.
+ or %sp, 0, %fp
+ ld %lr, 8(, %sp)
+ ld %fp, 0(, %sp)
+ b.l.t (, %lr)
+.Lfunc_end0:
+ .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
+ .cfi_endproc
+
+// -- End __kmp_invoke_microtask
+
+#endif /* KMP_ARCH_VE */
+
+#if KMP_ARCH_S390X
+
+//------------------------------------------------------------------------
+//
+// typedef void (*microtask_t)(int *gtid, int *tid, ...);
+//
+// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+// void *p_argv[]
+// #if OMPT_SUPPORT
+// ,
+// void **exit_frame_ptr
+// #endif
+// ) {
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+// (*pkfn)(&gtid, &tid, argv[0], ...);
+//
+// return 1;
+// }
+//
+// Parameters:
+// r2: pkfn
+// r3: gtid
+// r4: tid
+// r5: argc
+// r6: p_argv
+// SP+160: exit_frame_ptr
+//
+// Locals:
+// __gtid: gtid param pushed on stack so can pass &gtid to pkfn
+// __tid: tid param pushed on stack so can pass &tid to pkfn
+//
+// Temp. registers:
+//
+// r0: used to fetch argv slots
+// r7: used as temporary for number of remaining pkfn parms
+// r8: argv
+// r9: pkfn
+// r10: stack size
+// r11: previous fp
+// r12: stack parameter area
+// r13: argv slot
+//
+// return: r2 (always 1/TRUE)
+//
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+ .text
+ .globl __kmp_invoke_microtask
+ .p2align 1
+ .type __kmp_invoke_microtask,@function
+__kmp_invoke_microtask:
+ .cfi_startproc
+
+ stmg %r6,%r14,48(%r15)
+ .cfi_offset %r6, -112
+ .cfi_offset %r7, -104
+ .cfi_offset %r8, -96
+ .cfi_offset %r9, -88
+ .cfi_offset %r10, -80
+ .cfi_offset %r11, -72
+ .cfi_offset %r12, -64
+ .cfi_offset %r13, -56
+ .cfi_offset %r14, -48
+ .cfi_offset %r15, -40
+ lgr %r11,%r15
+ .cfi_def_cfa %r11, 160
+
+ // Compute the dynamic stack size:
+ //
+ // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
+ // reference
+  // - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
+  //   function by register. Given that we have 5 such registers (r2-r6)
+  //   and 'argc' + 2 arguments (counting &gtid and &tid), we need to
+  //   reserve max(0, argc - 3)*8 extra bytes
+ //
+ // The total number of bytes is then max(0, argc - 3)*8 + 8
+
+ lgr %r10,%r5
+ aghi %r10,-2
+ jnm 0f
+ lghi %r10,0
+0:
+ sllg %r10,%r10,3
+ lgr %r12,%r10
+ aghi %r10,176
+ sgr %r15,%r10
+ agr %r12,%r15
+ stg %r11,0(%r15)
+
+ lgr %r9,%r2 // pkfn
+
+#if OMPT_SUPPORT
+ // Save frame pointer into exit_frame
+ lg %r8,160(%r11)
+ stg %r11,0(%r8)
+#endif
+
+ // Prepare arguments for the pkfn function (first 5 using r2-r6 registers)
+
+ stg %r3,160(%r12)
+  la %r2,164(%r12) // gtid
+ stg %r4,168(%r12)
+ la %r3,172(%r12) // tid
+ lgr %r8,%r6 // argv
+
+ // If argc > 0
+ ltgr %r7,%r5
+ jz 1f
+
+ lg %r4,0(%r8) // argv[0]
+ aghi %r7,-1
+ jz 1f
+
+ // If argc > 1
+ lg %r5,8(%r8) // argv[1]
+ aghi %r7,-1
+ jz 1f
+
+ // If argc > 2
+ lg %r6,16(%r8) // argv[2]
+ aghi %r7,-1
+ jz 1f
+
+ lghi %r13,0 // Index [n]
+2:
+ lg %r0,24(%r13,%r8) // argv[2+n]
+ stg %r0,160(%r13,%r15) // parm[2+n]
+ aghi %r13,8 // Next
+ aghi %r7,-1
+ jnz 2b
+
+1:
+ basr %r14,%r9 // Call pkfn
+
+ // Restore stack and return
+
+ lgr %r15,%r11
+ lmg %r6,%r14,48(%r15)
+ lghi %r2,1
+ br %r14
+.Lfunc_end0:
+ .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
+ .cfi_endproc
+
+// -- End __kmp_invoke_microtask
+
+#endif /* KMP_ARCH_S390X */
+
+#if KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32
+#ifndef KMP_PREFIX_UNDERSCORE
+# define KMP_PREFIX_UNDERSCORE(x) x
+#endif
.data
- .comm .gomp_critical_user_,32,8
+ COMMON .gomp_critical_user_, 32, 3
.data
.align 4
- .global __kmp_unnamed_critical_addr
-__kmp_unnamed_critical_addr:
+ .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr)
+KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
.4byte .gomp_critical_user_
- .size __kmp_unnamed_critical_addr,4
-#endif /* KMP_ARCH_ARM */
+#ifdef __ELF__
+ .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),4
+#endif
+#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32 */
-#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
+#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || \
+ KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE || \
+ KMP_ARCH_S390X
#ifndef KMP_PREFIX_UNDERSCORE
# define KMP_PREFIX_UNDERSCORE(x) x
#endif
.data
- .comm .gomp_critical_user_,32,8
+ COMMON .gomp_critical_user_, 32, 3
.data
.align 8
.global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr)
@@ -1751,12 +2503,17 @@ KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
.size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8
#endif
#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
- KMP_ARCH_RISCV64 */
+ KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE ||
+ KMP_ARCH_S390X */
#if KMP_OS_LINUX
-# if KMP_ARCH_ARM
+# if KMP_ARCH_ARM || KMP_ARCH_AARCH64
.section .note.GNU-stack,"",%progbits
-# else
+# elif !KMP_ARCH_WASM
.section .note.GNU-stack,"",@progbits
# endif
#endif
+
+#if KMP_OS_LINUX && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32)
+GNU_PROPERTY_BTI_PAC
+#endif
diff --git a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp
index 91edf0254a7..e3d0a4ee00c 100644
--- a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp
+++ b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp
@@ -29,7 +29,12 @@
#include <semaphore.h>
#endif // KMP_OS_LINUX
#include <sys/resource.h>
+#if KMP_OS_AIX
+#include <sys/ldr.h>
+#error #include <libperfstat.h>
+#else
#include <sys/syscall.h>
+#endif
#include <sys/time.h>
#include <sys/times.h>
#include <unistd.h>
@@ -57,9 +62,23 @@
#include <sys/sysctl.h>
#include <sys/user.h>
#include <pthread_np.h>
+#if KMP_OS_DRAGONFLY
+#include <kvm.h>
+#endif
#elif KMP_OS_NETBSD || KMP_OS_OPENBSD
#include <sys/types.h>
#include <sys/sysctl.h>
+#if KMP_OS_NETBSD
+#include <sched.h>
+#endif
+#if KMP_OS_OPENBSD
+#include <pthread_np.h>
+#endif
+#elif KMP_OS_SOLARIS
+#include <libproc.h>
+#error #include <procfs.h>
+#include <thread.h>
+#include <sys/loadavg.h>
#endif
#include <ctype.h>
@@ -70,6 +89,15 @@ struct kmp_sys_timer {
struct timespec start;
};
+#ifndef TIMEVAL_TO_TIMESPEC
+// Convert timeval to timespec.
+#define TIMEVAL_TO_TIMESPEC(tv, ts) \
+ do { \
+ (ts)->tv_sec = (tv)->tv_sec; \
+ (ts)->tv_nsec = (tv)->tv_usec * 1000; \
+ } while (0)
+#endif
+
// Convert timespec to nanoseconds.
#define TS2NS(timespec) \
(((timespec).tv_sec * (long int)1e9) + (timespec).tv_nsec)
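A minimal usage sketch of the two macros above (the gettimeofday call here is
purely for illustration):

    struct timeval tv;
    struct timespec ts;
    gettimeofday(&tv, NULL);
    TIMEVAL_TO_TIMESPEC(&tv, &ts); // ts.tv_nsec = tv.tv_usec * 1000
    kmp_uint64 ns = TS2NS(ts);     // same instant, expressed in nanoseconds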
@@ -93,6 +121,7 @@ static kmp_cond_align_t __kmp_wait_cv;
static kmp_mutex_align_t __kmp_wait_mx;
kmp_uint64 __kmp_ticks_per_msec = 1000000;
+kmp_uint64 __kmp_ticks_per_usec = 1000;
#ifdef DEBUG_SUSPEND
static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) {
@@ -102,7 +131,9 @@ static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) {
}
#endif
-#if ((KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED)
+#if ((KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX) && \
+ KMP_AFFINITY_SUPPORTED)
/* Affinity support */
@@ -118,6 +149,29 @@ void __kmp_affinity_bind_thread(int which) {
KMP_CPU_FREE_FROM_STACK(mask);
}
+#if KMP_OS_AIX
+void __kmp_affinity_determine_capable(const char *env_var) {
+ // All versions of AIX support bindprocessor().
+
+ size_t mask_size = __kmp_xproc / CHAR_BIT;
+ // Round up to byte boundary.
+ if (__kmp_xproc % CHAR_BIT)
+ ++mask_size;
+
+ // Round up to the mask_size_type boundary.
+ if (mask_size % sizeof(__kmp_affin_mask_size))
+ mask_size += sizeof(__kmp_affin_mask_size) -
+ mask_size % sizeof(__kmp_affin_mask_size);
+ KMP_AFFINITY_ENABLE(mask_size);
+ KA_TRACE(10,
+ ("__kmp_affinity_determine_capable: "
+ "AIX OS affinity interface bindprocessor functional (mask size = "
+ "%" KMP_SIZE_T_SPEC ").\n",
+ __kmp_affin_mask_size));
+}
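A worked example of the rounding above, assuming 100 logical CPUs and an
8-byte __kmp_affin_mask_size object:

    size_t mask_size = 100 / CHAR_BIT;  // 12 bytes
    if (100 % CHAR_BIT)
      ++mask_size;                      // 13 bytes: round up to a whole byte
    if (mask_size % 8)
      mask_size += 8 - mask_size % 8;   // 16 bytes: round up to an 8-byte unit
    // KMP_AFFINITY_ENABLE(16) would then be the reported mask size.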
+
+#else // !KMP_OS_AIX
+
/* Determine if we can access affinity functionality on this version of
* Linux* OS by checking __NR_sched_{get,set}affinity system calls, and set
* __kmp_affin_mask_size to the appropriate value (0 means not capable). */
@@ -127,10 +181,16 @@ void __kmp_affinity_determine_capable(const char *env_var) {
#if KMP_OS_LINUX
#define KMP_CPU_SET_SIZE_LIMIT (1024 * 1024)
#define KMP_CPU_SET_TRY_SIZE CACHE_LINE
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#define KMP_CPU_SET_SIZE_LIMIT (sizeof(cpuset_t))
+#elif KMP_OS_NETBSD
+#define KMP_CPU_SET_SIZE_LIMIT (256)
#endif
+ int verbose = __kmp_affinity.flags.verbose;
+ int warnings = __kmp_affinity.flags.warnings;
+ enum affinity_type type = __kmp_affinity.type;
+
#if KMP_OS_LINUX
long gCode;
unsigned char *buf;
@@ -145,10 +205,9 @@ void __kmp_affinity_determine_capable(const char *env_var) {
if (gCode < 0 && errno != EINVAL) {
// System call not supported
- if (__kmp_affinity_verbose ||
- (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) &&
- (__kmp_affinity_type != affinity_default) &&
- (__kmp_affinity_type != affinity_disabled))) {
+ if (verbose ||
+ (warnings && (type != affinity_none) && (type != affinity_default) &&
+ (type != affinity_disabled))) {
int error = errno;
kmp_msg_t err_code = KMP_ERR(error);
__kmp_msg(kmp_ms_warning, KMP_MSG(GetAffSysCallNotSupported, env_var),
@@ -188,11 +247,9 @@ void __kmp_affinity_determine_capable(const char *env_var) {
"inconsistent OS call behavior: errno == ENOSYS for mask "
"size %d\n",
size));
- if (__kmp_affinity_verbose ||
- (__kmp_affinity_warnings &&
- (__kmp_affinity_type != affinity_none) &&
- (__kmp_affinity_type != affinity_default) &&
- (__kmp_affinity_type != affinity_disabled))) {
+ if (verbose ||
+ (warnings && (type != affinity_none) &&
+ (type != affinity_default) && (type != affinity_disabled))) {
int error = errno;
kmp_msg_t err_code = KMP_ERR(error);
__kmp_msg(kmp_ms_warning, KMP_MSG(GetAffSysCallNotSupported, env_var),
@@ -215,7 +272,7 @@ void __kmp_affinity_determine_capable(const char *env_var) {
KMP_INTERNAL_FREE(buf);
return;
}
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
long gCode;
unsigned char *buf;
buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT);
@@ -239,15 +296,14 @@ void __kmp_affinity_determine_capable(const char *env_var) {
KMP_AFFINITY_DISABLE();
KA_TRACE(10, ("__kmp_affinity_determine_capable: "
"cannot determine mask size - affinity not supported\n"));
- if (__kmp_affinity_verbose ||
- (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) &&
- (__kmp_affinity_type != affinity_default) &&
- (__kmp_affinity_type != affinity_disabled))) {
+ if (verbose || (warnings && (type != affinity_none) &&
+ (type != affinity_default) && (type != affinity_disabled))) {
KMP_WARNING(AffCantGetMaskSize, env_var);
}
}
-
-#endif // KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+#endif // KMP_OS_AIX
+#endif // (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
+ KMP_OS_DRAGONFLY || KMP_OS_AIX) && KMP_AFFINITY_SUPPORTED
#if KMP_USE_FUTEX
@@ -266,7 +322,7 @@ int __kmp_futex_determine_capable() {
#endif // KMP_USE_FUTEX
-#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && (!KMP_ASM_INTRINS)
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_WASM) && (!KMP_ASM_INTRINS)
/* Only 32-bit "add-exchange" instruction on IA-32 architecture causes us to
use compare_and_store for these routines */
@@ -326,7 +382,7 @@ kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 d) {
return old_value;
}
-#if KMP_ARCH_X86
+#if KMP_ARCH_X86 || KMP_ARCH_WASM
kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 d) {
kmp_int8 old_value, new_value;
@@ -402,15 +458,14 @@ void __kmp_terminate_thread(int gtid) {
KMP_YIELD(TRUE);
} //
-/* Set thread stack info according to values returned by pthread_getattr_np().
+/* Set thread stack info.
If values are unreasonable, assume call failed and use incremental stack
refinement method instead. Returns TRUE if the stack parameters could be
determined exactly, FALSE if incremental refinement is necessary. */
static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
int stack_data;
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_HURD
- pthread_attr_t attr;
+ KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX
int status;
size_t size = 0;
void *addr = 0;
@@ -420,6 +475,19 @@ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
pthread_attr_getstack may cause thread gtid aliasing */
if (!KMP_UBER_GTID(gtid)) {
+#if KMP_OS_SOLARIS
+ stack_t s;
+ if ((status = thr_stksegment(&s)) < 0) {
+ KMP_CHECK_SYSFAIL("thr_stksegment", status);
+ }
+
+ addr = s.ss_sp;
+ size = s.ss_size;
+ KA_TRACE(60, ("__kmp_set_stack_info: T#%d thr_stksegment returned size:"
+ " %lu, low addr: %p\n",
+ gtid, size, addr));
+#else
+ pthread_attr_t attr;
/* Fetch the real thread attributes */
status = pthread_attr_init(&attr);
KMP_CHECK_SYSFAIL("pthread_attr_init", status);
@@ -438,6 +506,7 @@ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
gtid, size, addr));
status = pthread_attr_destroy(&attr);
KMP_CHECK_SYSFAIL("pthread_attr_destroy", status);
+#endif
}
if (size != 0 && addr != 0) { // was stack parameter determination successful?
@@ -448,7 +517,7 @@ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
return TRUE;
}
#endif /* KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD \
- || KMP_OS_HURD */
+ || KMP_OS_HURD || KMP_OS_SOLARIS */
/* Use incremental refinement starting from initial conservative estimate */
TCW_PTR(th->th.th_info.ds.ds_stacksize, 0);
TCW_PTR(th->th.th_info.ds.ds_stackbase, &stack_data);
@@ -463,7 +532,7 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* KMP_BLOCK_SIGNALS */
void *exit_val;
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX
void *volatile padding = 0;
#endif
int gtid;
@@ -486,7 +555,7 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* USE_ITT_BUILD */
#if KMP_AFFINITY_SUPPORTED
- __kmp_affinity_set_init_mask(gtid, FALSE);
+ __kmp_affinity_bind_init_mask(gtid);
#endif
#ifdef KMP_CANCEL_THREADS
@@ -512,7 +581,7 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* KMP_BLOCK_SIGNALS */
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD
+ KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX
if (__kmp_stkoffset > 0 && gtid > 0) {
padding = KMP_ALLOCA(gtid * __kmp_stkoffset);
(void)padding;
@@ -765,13 +834,6 @@ void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) {
and also gives the user the stack space they requested for all threads */
stack_size += gtid * __kmp_stkoffset * 2;
-#if defined(__ANDROID__) && __ANDROID_API__ < 19
- // Round the stack size to a multiple of the page size. Older versions of
- // Android (until KitKat) would fail pthread_attr_setstacksize with EINVAL
- // if the stack size was not a multiple of the page size.
- stack_size = (stack_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
-#endif
-
KA_TRACE(10, ("__kmp_create_worker: T#%d, default stacksize = %lu bytes, "
"__kmp_stksize = %lu bytes, final stacksize = %lu bytes\n",
gtid, KMP_DEFAULT_STKSIZE, __kmp_stksize, stack_size));
@@ -819,6 +881,19 @@ void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) {
KMP_SYSFAIL("pthread_create", status);
}
+ // Rename worker threads for improved debuggability
+ if (!KMP_UBER_GTID(gtid)) {
+#if defined(LIBOMP_HAVE_PTHREAD_SET_NAME_NP)
+ pthread_set_name_np(handle, "openmp_worker");
+#elif defined(LIBOMP_HAVE_PTHREAD_SETNAME_NP) && !KMP_OS_DARWIN
+#if KMP_OS_NETBSD
+ pthread_setname_np(handle, "%s", const_cast<char *>("openmp_worker"));
+#else
+ pthread_setname_np(handle, "openmp_worker");
+#endif
+#endif
+ }
+
th->th.th_info.ds.ds_thread = handle;
#ifdef KMP_THREAD_ATTR
@@ -981,13 +1056,17 @@ retry:
#endif // KMP_USE_MONITOR
void __kmp_exit_thread(int exit_status) {
+#if KMP_OS_WASI
+// TODO: the wasm32-wasi-threads target does not yet support pthread_exit.
+#else
pthread_exit((void *)(intptr_t)exit_status);
+#endif
} // __kmp_exit_thread
#if KMP_USE_MONITOR
void __kmp_resume_monitor();
-void __kmp_reap_monitor(kmp_info_t *th) {
+extern "C" void __kmp_reap_monitor(kmp_info_t *th) {
int status;
void *exit_val;
@@ -1029,6 +1108,10 @@ void __kmp_reap_monitor(kmp_info_t *th) {
KMP_MB(); /* Flush all pending memory write invalidates. */
}
+#else
+// Empty symbol to export (see exports_so.txt) when
+// monitor thread feature is disabled
+extern "C" void __kmp_reap_monitor(kmp_info_t *th) { (void)th; }
#endif // KMP_USE_MONITOR
void __kmp_reap_worker(kmp_info_t *th) {
@@ -1229,7 +1312,8 @@ static void __kmp_atfork_child(void) {
++__kmp_fork_count;
#if KMP_AFFINITY_SUPPORTED
-#if KMP_OS_LINUX || KMP_OS_FREEBSD
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX
// reset the affinity in the child to the initial thread
// affinity in the parent
kmp_set_thread_affinity_mask_initial();
@@ -1237,12 +1321,14 @@ static void __kmp_atfork_child(void) {
// Set default not to bind threads tightly in the child (we're expecting
// over-subscription after the fork and this can improve things for
// scripting languages that use OpenMP inside process-parallel code).
- __kmp_affinity_type = affinity_none;
if (__kmp_nested_proc_bind.bind_types != NULL) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
}
- __kmp_affinity_masks = NULL;
- __kmp_affinity_num_masks = 0;
+ for (kmp_affinity_t *affinity : __kmp_affinities)
+ *affinity = KMP_AFFINITY_INIT(affinity->env_var);
+ __kmp_affin_fullMask = nullptr;
+ __kmp_affin_origMask = nullptr;
+ __kmp_topology = nullptr;
#endif // KMP_AFFINITY_SUPPORTED
#if KMP_USE_MONITOR
@@ -1318,9 +1404,11 @@ static void __kmp_atfork_child(void) {
void __kmp_register_atfork(void) {
if (__kmp_need_register_atfork) {
+#if !KMP_OS_WASI
int status = pthread_atfork(__kmp_atfork_prepare, __kmp_atfork_parent,
__kmp_atfork_child);
KMP_CHECK_SYSFAIL("pthread_atfork", status);
+#endif
__kmp_need_register_atfork = FALSE;
}
}
@@ -1762,6 +1850,7 @@ int __kmp_read_system_info(struct kmp_sys_info *info) {
status = getrusage(RUSAGE_SELF, &r_usage);
KMP_CHECK_SYSFAIL_ERRNO("getrusage", status);
+#if !KMP_OS_WASI
// The maximum resident set size utilized (in kilobytes)
info->maxrss = r_usage.ru_maxrss;
// The number of page faults serviced without any I/O
@@ -1778,6 +1867,7 @@ int __kmp_read_system_info(struct kmp_sys_info *info) {
info->nvcsw = r_usage.ru_nvcsw;
// The number of times a context switch was forced
info->nivcsw = r_usage.ru_nivcsw;
+#endif
return (status != 0);
}
@@ -1812,27 +1902,14 @@ static int __kmp_get_xproc(void) {
__kmp_type_convert(sysconf(_SC_NPROCESSORS_CONF), &(r));
#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || \
- KMP_OS_HURD
+ KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
__kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r));
#elif KMP_OS_DARWIN
- // Bug C77011 High "OpenMP Threads and number of active cores".
-
- // Find the number of available CPUs.
- kern_return_t rc;
- host_basic_info_data_t info;
- mach_msg_type_number_t num = HOST_BASIC_INFO_COUNT;
- rc = host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&info, &num);
- if (rc == 0 && num == HOST_BASIC_INFO_COUNT) {
- // Cannot use KA_TRACE() here because this code works before trace support
- // is initialized.
- r = info.avail_cpus;
- } else {
- KMP_WARNING(CantGetNumAvailCPU);
- KMP_INFORM(AssumedNumCPU);
- }
+ size_t len = sizeof(r);
+ sysctlbyname("hw.logicalcpu", &r, &len, NULL, 0);
#else
@@ -1850,10 +1927,13 @@ int __kmp_read_from_file(char const *path, char const *format, ...) {
va_start(args, format);
FILE *f = fopen(path, "rb");
- if (f == NULL)
+ if (f == NULL) {
+ va_end(args);
return 0;
+ }
result = vfscanf(f, format, args);
fclose(f);
+ va_end(args);
return result;
}
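A hypothetical call site, shown only to illustrate the calling convention of
this helper (the sysfs path is an example, not something this change reads):

    int khz = 0;
    if (__kmp_read_from_file(
            "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "%d",
            &khz) == 1) {
      // khz holds the parsed value; the helper returns the vfscanf() count.
    }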
@@ -1890,6 +1970,13 @@ void __kmp_runtime_initialize(void) {
/* Query the maximum number of threads */
__kmp_type_convert(sysconf(_SC_THREAD_THREADS_MAX), &(__kmp_sys_max_nth));
+#ifdef __ve__
+ if (__kmp_sys_max_nth == -1) {
+ // VE's pthread supports only up to 64 threads per a VE process.
+ // So we use that KMP_MAX_NTH (predefined as 64) here.
+ __kmp_sys_max_nth = KMP_MAX_NTH;
+ }
+#else
if (__kmp_sys_max_nth == -1) {
/* Unlimited threads for NPTL */
__kmp_sys_max_nth = INT_MAX;
@@ -1897,6 +1984,7 @@ void __kmp_runtime_initialize(void) {
/* Can't tell, just use PTHREAD_THREADS_MAX */
__kmp_sys_max_nth = KMP_MAX_NTH;
}
+#endif
/* Query the minimum stack size */
__kmp_sys_min_stksize = sysconf(_SC_THREAD_STACK_MIN);
@@ -1999,7 +2087,7 @@ kmp_uint64 __kmp_now_nsec() {
/* Measure clock ticks per millisecond */
void __kmp_initialize_system_tick() {
kmp_uint64 now, nsec2, diff;
- kmp_uint64 delay = 100000; // 50~100 usec on most machines.
+ kmp_uint64 delay = 1000000; // ~450 usec on most machines.
kmp_uint64 nsec = __kmp_now_nsec();
kmp_uint64 goal = __kmp_hardware_timestamp() + delay;
while ((now = __kmp_hardware_timestamp()) < goal)
@@ -2007,9 +2095,11 @@ void __kmp_initialize_system_tick() {
nsec2 = __kmp_now_nsec();
diff = nsec2 - nsec;
if (diff > 0) {
- kmp_uint64 tpms = ((kmp_uint64)1e6 * (delay + (now - goal)) / diff);
- if (tpms > 0)
- __kmp_ticks_per_msec = tpms;
+ double tpus = 1000.0 * (double)(delay + (now - goal)) / (double)diff;
+ if (tpus > 0.0) {
+ __kmp_ticks_per_msec = (kmp_uint64)(tpus * 1000.0);
+ __kmp_ticks_per_usec = (kmp_uint64)tpus;
+ }
}
}
#endif
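A rough numeric walk-through of the calibration above, with illustrative
figures only:

    // Suppose the busy-wait consumed 1,002,000 timestamp ticks (the requested
    // 1,000,000 plus a 2,000-tick overshoot) while __kmp_now_nsec() advanced
    // by 450,000 ns:
    double tpus = 1000.0 * 1002000 / 450000; // ~2226.67 ticks per microsecond
    // __kmp_ticks_per_usec = (kmp_uint64)tpus            == 2226
    // __kmp_ticks_per_msec = (kmp_uint64)(tpus * 1000.0) == 2226666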
@@ -2070,10 +2160,10 @@ int __kmp_is_address_mapped(void *addr) {
// We pass from number of vm entry's semantic
// to size of whole entry map list.
lstsz = lstsz * 4 / 3;
- buf = reinterpret_cast<char *>(kmpc_malloc(lstsz));
+ buf = reinterpret_cast<char *>(KMP_INTERNAL_MALLOC(lstsz));
rc = sysctl(mib, 4, buf, &lstsz, NULL, 0);
if (rc < 0) {
- kmpc_free(buf);
+ KMP_INTERNAL_FREE(buf);
return 0;
}
@@ -2097,8 +2187,96 @@ int __kmp_is_address_mapped(void *addr) {
}
lw += cursz;
}
- kmpc_free(buf);
+ KMP_INTERNAL_FREE(buf);
+#elif KMP_OS_DRAGONFLY
+ char err[_POSIX2_LINE_MAX];
+ kinfo_proc *proc;
+ vmspace sp;
+ vm_map *cur;
+ vm_map_entry entry, *c;
+ struct proc p;
+ kvm_t *fd;
+ uintptr_t uaddr;
+ int num;
+
+ fd = kvm_openfiles(nullptr, nullptr, nullptr, O_RDONLY, err);
+ if (!fd) {
+ return 0;
+ }
+
+ proc = kvm_getprocs(fd, KERN_PROC_PID, getpid(), &num);
+
+ if (kvm_read(fd, static_cast<uintptr_t>(proc->kp_paddr), &p, sizeof(p)) !=
+ sizeof(p) ||
+ kvm_read(fd, reinterpret_cast<uintptr_t>(p.p_vmspace), &sp, sizeof(sp)) !=
+ sizeof(sp)) {
+ kvm_close(fd);
+ return 0;
+ }
+
+ (void)rc;
+ cur = &sp.vm_map;
+ uaddr = reinterpret_cast<uintptr_t>(addr);
+ for (c = kvm_vm_map_entry_first(fd, cur, &entry); c;
+ c = kvm_vm_map_entry_next(fd, c, &entry)) {
+ if ((uaddr >= entry.ba.start) && (uaddr <= entry.ba.end)) {
+ if ((entry.protection & VM_PROT_READ) != 0 &&
+ (entry.protection & VM_PROT_WRITE) != 0) {
+ found = 1;
+ break;
+ }
+ }
+ }
+
+ kvm_close(fd);
+#elif KMP_OS_SOLARIS
+ prmap_t *cur, *map;
+ void *buf;
+ uintptr_t uaddr;
+ ssize_t rd;
+ int err;
+ int file;
+
+ pid_t pid = getpid();
+  struct ps_prochandle *fd = Pgrab(pid, PGRAB_RDONLY, &err);
+
+ if (!fd) {
+ return 0;
+ }
+
+ char *name = __kmp_str_format("/proc/%d/map", pid);
+ size_t sz = (1 << 20);
+ file = open(name, O_RDONLY);
+ if (file == -1) {
+ KMP_INTERNAL_FREE(name);
+ return 0;
+ }
+
+ buf = KMP_INTERNAL_MALLOC(sz);
+
+ while (sz > 0 && (rd = pread(file, buf, sz, 0)) == sz) {
+ void *newbuf;
+ sz <<= 1;
+ newbuf = KMP_INTERNAL_REALLOC(buf, sz);
+ buf = newbuf;
+ }
+
+ map = reinterpret_cast<prmap_t *>(buf);
+ uaddr = reinterpret_cast<uintptr_t>(addr);
+
+  for (cur = map; rd > 0; cur++, rd -= sizeof(*map)) {
+    if ((uaddr >= cur->pr_vaddr) && (uaddr < cur->pr_vaddr + cur->pr_size)) {
+ if ((cur->pr_mflags & MA_READ) != 0 && (cur->pr_mflags & MA_WRITE) != 0) {
+ found = 1;
+ break;
+ }
+ }
+ }
+ KMP_INTERNAL_FREE(map);
+ close(file);
+ KMP_INTERNAL_FREE(name);
#elif KMP_OS_DARWIN
/* On OS X*, /proc pseudo filesystem is not available. Try to read memory
@@ -2175,10 +2353,52 @@ int __kmp_is_address_mapped(void *addr) {
}
kiv.kve_start += 1;
}
-#elif KMP_OS_DRAGONFLY
+#elif KMP_OS_WASI
+ found = (int)addr < (__builtin_wasm_memory_size(0) * PAGESIZE);
+#elif KMP_OS_AIX
+
+ uint32_t loadQueryBufSize = 4096u; // Default loadquery buffer size.
+ char *loadQueryBuf;
- // FIXME(DragonFly): Implement this
- found = 1;
+ for (;;) {
+ loadQueryBuf = (char *)KMP_INTERNAL_MALLOC(loadQueryBufSize);
+ if (loadQueryBuf == NULL) {
+ return 0;
+ }
+
+ rc = loadquery(L_GETXINFO | L_IGNOREUNLOAD, loadQueryBuf, loadQueryBufSize);
+ if (rc < 0) {
+ KMP_INTERNAL_FREE(loadQueryBuf);
+ if (errno != ENOMEM) {
+ return 0;
+ }
+ // errno == ENOMEM; double the size.
+ loadQueryBufSize <<= 1;
+ continue;
+ }
+ // Obtained the load info successfully.
+ break;
+ }
+
+ struct ld_xinfo *curLdInfo = (struct ld_xinfo *)loadQueryBuf;
+
+ // Loop through the load info to find if there is a match.
+ for (;;) {
+ uintptr_t curDataStart = (uintptr_t)curLdInfo->ldinfo_dataorg;
+ uintptr_t curDataEnd = curDataStart + curLdInfo->ldinfo_datasize;
+
+ // The data segment is readable and writable.
+ if (curDataStart <= (uintptr_t)addr && (uintptr_t)addr < curDataEnd) {
+ found = 1;
+ break;
+ }
+ if (curLdInfo->ldinfo_next == 0u) {
+ // Reached the end of load info.
+ break;
+ }
+ curLdInfo = (struct ld_xinfo *)((char *)curLdInfo + curLdInfo->ldinfo_next);
+ }
+ KMP_INTERNAL_FREE(loadQueryBuf);
#else
@@ -2192,7 +2412,8 @@ int __kmp_is_address_mapped(void *addr) {
#ifdef USE_LOAD_BALANCE
-#if KMP_OS_DARWIN || KMP_OS_NETBSD
+#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
+ KMP_OS_OPENBSD || KMP_OS_SOLARIS
// The function returns the rounded value of the system load average
// during given time interval which depends on the value of
@@ -2223,6 +2444,79 @@ int __kmp_get_load_balance(int max) {
return ret_avg;
}
+#elif KMP_OS_AIX
+
+// The function returns number of running (not sleeping) threads, or -1 in case
+// of error.
+int __kmp_get_load_balance(int max) {
+
+ static int glb_running_threads = 0; // Saved count of the running threads for
+ // the thread balance algorithm.
+ static double glb_call_time = 0; // Thread balance algorithm call time.
+ int running_threads = 0; // Number of running threads in the system.
+
+ double call_time = 0.0;
+
+ __kmp_elapsed(&call_time);
+
+ if (glb_call_time &&
+ (call_time - glb_call_time < __kmp_load_balance_interval))
+ return glb_running_threads;
+
+ glb_call_time = call_time;
+
+ if (max <= 0) {
+ max = INT_MAX;
+ }
+
+ // Check how many perfstat_cpu_t structures are available.
+ int logical_cpus = perfstat_cpu(NULL, NULL, sizeof(perfstat_cpu_t), 0);
+ if (logical_cpus <= 0) {
+ glb_call_time = -1;
+ return -1;
+ }
+
+ perfstat_cpu_t *cpu_stat = (perfstat_cpu_t *)KMP_INTERNAL_MALLOC(
+ logical_cpus * sizeof(perfstat_cpu_t));
+ if (cpu_stat == NULL) {
+ glb_call_time = -1;
+ return -1;
+ }
+
+ // Set first CPU as the name of the first logical CPU for which the info is
+ // desired.
+ perfstat_id_t first_cpu_name;
+ strcpy(first_cpu_name.name, FIRST_CPU);
+
+ // Get the stat info of logical CPUs.
+ int rc = perfstat_cpu(&first_cpu_name, cpu_stat, sizeof(perfstat_cpu_t),
+ logical_cpus);
+ KMP_DEBUG_ASSERT(rc == logical_cpus);
+ if (rc <= 0) {
+ KMP_INTERNAL_FREE(cpu_stat);
+ glb_call_time = -1;
+ return -1;
+ }
+ for (int i = 0; i < logical_cpus; ++i) {
+ running_threads += cpu_stat[i].runque;
+ if (running_threads >= max)
+ break;
+ }
+
+ // There _might_ be a timing hole where the thread executing this
+ // code gets skipped in the load balance, and running_threads is 0.
+ // Assert in the debug builds only!!!
+ KMP_DEBUG_ASSERT(running_threads > 0);
+ if (running_threads <= 0)
+ running_threads = 1;
+
+ KMP_INTERNAL_FREE(cpu_stat);
+
+ glb_running_threads = running_threads;
+
+ return running_threads;
+}
+
#else // Linux* OS
// The function returns number of running (not sleeping) threads, or -1 in case
@@ -2249,8 +2543,9 @@ int __kmp_get_load_balance(int max) {
int stat_file = -1;
int stat_path_fixed_len;
+#ifdef KMP_DEBUG
int total_processes = 0; // Total number of processes in system.
- int total_threads = 0; // Total number of threads in system.
+#endif
double call_time = 0.0;
@@ -2280,7 +2575,7 @@ int __kmp_get_load_balance(int max) {
// Open "/proc/" directory.
proc_dir = opendir("/proc");
if (proc_dir == NULL) {
- // Cannot open "/prroc/". Probably the kernel does not support it. Return an
+ // Cannot open "/proc/". Probably the kernel does not support it. Return an
// error now and in subsequent calls.
running_threads = -1;
permanent_error = 1;
@@ -2297,7 +2592,9 @@ int __kmp_get_load_balance(int max) {
// process' directory.
if (proc_entry->d_type == DT_DIR && isdigit(proc_entry->d_name[0])) {
+#ifdef KMP_DEBUG
++total_processes;
+#endif
// Make sure init process is the very first in "/proc", so we can replace
// strcmp( proc_entry->d_name, "1" ) == 0 with simpler total_processes ==
// 1. We are going to check that total_processes == 1 => d_name == "1" is
@@ -2338,7 +2635,6 @@ int __kmp_get_load_balance(int max) {
while (task_entry != NULL) {
// It is a directory and name starts with a digit.
if (proc_entry->d_type == DT_DIR && isdigit(task_entry->d_name[0])) {
- ++total_threads;
// Construct complete stat file path. Easiest way would be:
// __kmp_str_buf_print( & stat_path, "%s/%s/stat", task_path.str,
@@ -2447,7 +2743,46 @@ finish: // Clean up and exit.
#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \
((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \
- KMP_ARCH_PPC64 || KMP_ARCH_RISCV64)
+ KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC_XCOFF || \
+ KMP_ARCH_AARCH64_32)
+
+// Because WebAssembly will use `call_indirect` to invoke the microtask and
+// WebAssembly indirect calls check that the called signature is a precise
+// match, we need to cast each microtask function pointer back from `void *` to
+// its original type.
+typedef void (*microtask_t0)(int *, int *);
+typedef void (*microtask_t1)(int *, int *, void *);
+typedef void (*microtask_t2)(int *, int *, void *, void *);
+typedef void (*microtask_t3)(int *, int *, void *, void *, void *);
+typedef void (*microtask_t4)(int *, int *, void *, void *, void *, void *);
+typedef void (*microtask_t5)(int *, int *, void *, void *, void *, void *,
+ void *);
+typedef void (*microtask_t6)(int *, int *, void *, void *, void *, void *,
+ void *, void *);
+typedef void (*microtask_t7)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *);
+typedef void (*microtask_t8)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *);
+typedef void (*microtask_t9)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *);
+typedef void (*microtask_t10)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *);
+typedef void (*microtask_t11)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *);
+typedef void (*microtask_t12)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *, void *);
+typedef void (*microtask_t13)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *, void *, void *);
+typedef void (*microtask_t14)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *, void *, void *, void *);
+typedef void (*microtask_t15)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *);
 // we really only need the case with 1 argument, because CLANG always builds
 // a struct of pointers to shared variables referenced in the outlined function
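To make the comment above concrete (the struct and function below are
hypothetical stand-ins for what Clang emits), the common single-argument case
looks like this:

    struct shared_vars { int *a; double *b; }; // captured shared variables
    static void outlined_region(int *gtid, int *tid, void *args) {
      struct shared_vars *sv = (struct shared_vars *)args;
      (void)gtid; (void)tid; (void)sv;
    }
    // The case 1 branch below invokes it as
    //   (*(microtask_t1)pkfn)(&gtid, &tid, p_argv[0]);
    // and that exact (int *, int *, void *) signature is what the WebAssembly
    // call_indirect check verifies.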
@@ -2468,66 +2803,76 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
fflush(stderr);
exit(-1);
case 0:
- (*pkfn)(&gtid, &tid);
+ (*(microtask_t0)pkfn)(&gtid, &tid);
break;
case 1:
- (*pkfn)(&gtid, &tid, p_argv[0]);
+ (*(microtask_t1)pkfn)(&gtid, &tid, p_argv[0]);
break;
case 2:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
+ (*(microtask_t2)pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
break;
case 3:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
+ (*(microtask_t3)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
break;
case 4:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
+ (*(microtask_t4)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3]);
break;
case 5:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
+ (*(microtask_t5)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4]);
break;
case 6:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5]);
+ (*(microtask_t6)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5]);
break;
case 7:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6]);
+ (*(microtask_t7)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6]);
break;
case 8:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7]);
+ (*(microtask_t8)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7]);
break;
case 9:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8]);
+ (*(microtask_t9)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7],
+ p_argv[8]);
break;
case 10:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9]);
+ (*(microtask_t10)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9]);
break;
case 11:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
+ (*(microtask_t11)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
break;
case 12:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11]);
+ (*(microtask_t12)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11]);
break;
case 13:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11], p_argv[12]);
+ (*(microtask_t13)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11], p_argv[12]);
break;
case 14:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11], p_argv[12], p_argv[13]);
+ (*(microtask_t14)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11], p_argv[12], p_argv[13]);
break;
case 15:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
+ (*(microtask_t15)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
break;
}
@@ -2735,4 +3080,28 @@ void __kmp_hidden_helper_threads_deinitz_release() {
}
#endif // KMP_OS_LINUX
+bool __kmp_detect_shm() {
+ DIR *dir = opendir("/dev/shm");
+ if (dir) { // /dev/shm exists
+ closedir(dir);
+ return true;
+ } else if (ENOENT == errno) { // /dev/shm does not exist
+ return false;
+ } else { // opendir() failed
+ return false;
+ }
+}
+
+bool __kmp_detect_tmp() {
+ DIR *dir = opendir("/tmp");
+ if (dir) { // /tmp exists
+ closedir(dir);
+ return true;
+ } else if (ENOENT == errno) { // /tmp does not exist
+ return false;
+ } else { // opendir() failed
+ return false;
+ }
+}
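A hypothetical way a caller might combine the two probes above (this is not
the runtime's actual policy, only an illustration of the return values):

    const char *base = __kmp_detect_shm() ? "/dev/shm"
                                          : (__kmp_detect_tmp() ? "/tmp" : ".");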
+
// end of file //