author     mikhnenko <[email protected]>  2025-07-15 20:05:43 +0300
committer  mikhnenko <[email protected]>  2025-07-15 20:52:16 +0300
commit     a40bd4f45bbc18fd95b1596e655b8942ceb2cf4b (patch)
tree       bce599ca02c778c277198de6d131d37db71997d0 /contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
parent     728e0eaef4dc1f1152d2c3a4cc1bbdf597f3ef3d (diff)
Update contrib/libs/cxxsupp/openmp to 20.1.7
commit_hash:722dd5fe79203d22ad4a0be288ac0caeb6b3dd68
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_runtime.cpp')
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_runtime.cpp | 1848
1 file changed, 1034 insertions, 814 deletions
diff --git a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp index bfbff03bd62..c26992ab98b 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp @@ -24,6 +24,7 @@ #include "kmp_wait_release.h" #include "kmp_wrapper_getpid.h" #include "kmp_dispatch.h" +#include "kmp_utils.h" #if KMP_USE_HIER_SCHED #error #include "kmp_dispatch_hier.h" #endif @@ -47,8 +48,9 @@ static char *ProfileTraceFile = nullptr; #include <process.h> #endif -#if KMP_OS_WINDOWS -// windows does not need include files as it doesn't use shared memory +#ifndef KMP_USE_SHM +// Windows and WASI do not need these include files as they don't use shared +// memory. #else #include <sys/mman.h> #include <sys/stat.h> @@ -111,6 +113,21 @@ void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, int new_nthreads); void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); +static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr, + int level) { + kmp_nested_nthreads_t *new_nested_nth = + (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC( + sizeof(kmp_nested_nthreads_t)); + int new_size = level + thr->th.th_set_nested_nth_sz; + new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int)); + for (int i = 0; i < level + 1; ++i) + new_nested_nth->nth[i] = 0; + for (int i = level + 1, j = 1; i < new_size; ++i, ++j) + new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j]; + new_nested_nth->size = new_nested_nth->used = new_size; + return new_nested_nth; +} + /* Calculate the identifier of the current thread */ /* fast (and somewhat portable) way to get unique identifier of executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ @@ -178,7 +195,12 @@ int __kmp_get_global_thread_id() { if (stack_diff <= stack_size) { /* The only way we can be closer than the allocated */ /* stack size is if we are running on this thread. */ - KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); + // __kmp_gtid_get_specific can return negative value because this + // function can be called by thread destructor. However, before the + // thread destructor is called, the value of the corresponding + // thread-specific data will be reset to NULL. + KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 || + __kmp_gtid_get_specific() == i); return i; } } @@ -196,6 +218,12 @@ int __kmp_get_global_thread_id() { if (i < 0) return i; + // other_threads[i] can be nullptr at this point because the corresponding + // thread could have already been destructed. It can happen when this function + // is called in end library routine. + if (!TCR_SYNC_PTR(other_threads[i])) + return i; + /* dynamically updated stack window for uber threads to avoid get_specific call */ if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { @@ -405,6 +433,8 @@ void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, } #endif /* KMP_PRINT_DATA_PLACEMENT */ __kmp_release_bootstrap_lock(&__kmp_stdio_lock); + + va_end(ap); } void __kmp_warn(char const *format, ...) { @@ -433,26 +463,26 @@ void __kmp_abort_process() { __kmp_dump_debug_buffer(); } - if (KMP_OS_WINDOWS) { - // Let other threads know of abnormal termination and prevent deadlock - // if abort happened during library initialization or shutdown - __kmp_global.g.g_abort = SIGABRT; - - /* On Windows* OS by default abort() causes pop-up error box, which stalls - nightly testing. Unfortunately, we cannot reliably suppress pop-up error - boxes. 
_set_abort_behavior() works well, but this function is not - available in VS7 (this is not problem for DLL, but it is a problem for - static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not - help, at least in some versions of MS C RTL. - - It seems following sequence is the only way to simulate abort() and - avoid pop-up error box. */ - raise(SIGABRT); - _exit(3); // Just in case, if signal ignored, exit anyway. - } else { - __kmp_unregister_library(); - abort(); - } +#if KMP_OS_WINDOWS + // Let other threads know of abnormal termination and prevent deadlock + // if abort happened during library initialization or shutdown + __kmp_global.g.g_abort = SIGABRT; + + /* On Windows* OS by default abort() causes pop-up error box, which stalls + nightly testing. Unfortunately, we cannot reliably suppress pop-up error + boxes. _set_abort_behavior() works well, but this function is not + available in VS7 (this is not problem for DLL, but it is a problem for + static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not + help, at least in some versions of MS C RTL. + + It seems following sequence is the only way to simulate abort() and + avoid pop-up error box. */ + raise(SIGABRT); + _exit(3); // Just in case, if signal ignored, exit anyway. +#else + __kmp_unregister_library(); + abort(); +#endif __kmp_infinite_loop(); __kmp_release_bootstrap_lock(&__kmp_exit_lock); @@ -553,6 +583,14 @@ static void __kmp_fini_allocator() { __kmp_fini_memkind(); } /* ------------------------------------------------------------------------ */ +#if ENABLE_LIBOMPTARGET +static void __kmp_init_omptarget() { + __kmp_init_target_task(); +} +#endif + +/* ------------------------------------------------------------------------ */ + #if KMP_DYNAMIC_LIB #if KMP_OS_WINDOWS @@ -907,6 +945,11 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, __kmp_get_gtid(), new_nthreads, set_nthreads)); } #endif // KMP_DEBUG + + if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) { + __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev, + this_thr->th.th_nt_msg); + } return new_nthreads; } @@ -1011,6 +1054,47 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, __kmp_partition_places(team); } #endif + + if (team->t.t_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + team->t.b->update_num_threads(team->t.t_nproc); + __kmp_add_threads_to_team(team, team->t.t_nproc); + } + } + + // Take care of primary thread's task state + if (__kmp_tasking_mode != tskm_immediate_exec) { + if (use_hot_team) { + KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th); + KA_TRACE( + 20, + ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team " + "%p, new task_team %p / team %p\n", + __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, + team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state], + team)); + + // Store primary thread's current task state on new team + KMP_CHECK_UPDATE(team->t.t_primary_task_state, + master_th->th.th_task_state); + + // Restore primary thread's task state to hot team's state + // by using thread 1's task state + if (team->t.t_nproc > 1) { + KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 || + team->t.t_threads[1]->th.th_task_state == 1); + KMP_CHECK_UPDATE(master_th->th.th_task_state, + team->t.t_threads[1]->th.th_task_state); + } else { + master_th->th.th_task_state = 0; + } + } else { + // Store primary thread's current task_state on new team + 
KMP_CHECK_UPDATE(team->t.t_primary_task_state, + master_th->th.th_task_state); + // Are not using hot team, so set task state to 0. + master_th->th.th_task_state = 0; + } } if (__kmp_display_affinity && team->t.t_display_affinity != 1) { @@ -1116,18 +1200,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { KMP_DEBUG_ASSERT(serial_team); KMP_MB(); - if (__kmp_tasking_mode != tskm_immediate_exec) { - KMP_DEBUG_ASSERT( - this_thr->th.th_task_team == - this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); - KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == - NULL); - KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " - "team %p, new task_team = NULL\n", - global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); - this_thr->th.th_task_team = NULL; - } - kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { proc_bind = proc_bind_false; @@ -1139,6 +1211,9 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { // Reset for next parallel region this_thr->th.th_set_proc_bind = proc_bind_default; + // Reset num_threads for next parallel region + this_thr->th.th_set_nproc = 0; + #if OMPT_SUPPORT ompt_data_t ompt_parallel_data = ompt_data_none; void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); @@ -1210,6 +1285,12 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { serial_team->t.t_serialized = 1; serial_team->t.t_nproc = 1; serial_team->t.t_parent = this_thr->th.th_team; + if (this_thr->th.th_team->t.t_nested_nth) + serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth; + else + serial_team->t.t_nested_nth = &__kmp_nested_nth; + // Save previous team's task state on serial team structure + serial_team->t.t_primary_task_state = this_thr->th.th_task_state; serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; this_thr->th.th_team = serial_team; serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; @@ -1229,9 +1310,11 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { // Thread value exists in the nested nthreads array for the next nested // level - if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { - this_thr->th.th_current_task->td_icvs.nproc = - __kmp_nested_nth.nth[level + 1]; + kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth; + if (this_thr->th.th_team->t.t_nested_nth) + nested_nth = this_thr->th.th_team->t.t_nested_nth; + if (nested_nth->used && (level + 1 < nested_nth->used)) { + this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1]; } if (__kmp_nested_proc_bind.used && @@ -1249,6 +1332,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { this_thr->th.th_team_nproc = 1; this_thr->th.th_team_master = this_thr; this_thr->th.th_team_serialized = 1; + this_thr->th.th_task_team = NULL; + this_thr->th.th_task_state = 0; serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; @@ -1280,10 +1365,14 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { int level = this_thr->th.th_team->t.t_level; // Thread value exists in the nested nthreads array for the next nested // level - if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { - this_thr->th.th_current_task->td_icvs.nproc = - __kmp_nested_nth.nth[level + 1]; + + kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth; + 
if (serial_team->t.t_nested_nth) + nested_nth = serial_team->t.t_nested_nth; + if (nested_nth->used && (level + 1 < nested_nth->used)) { + this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1]; } + serial_team->t.t_level++; KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " "of serial team %p to %d\n", @@ -1300,6 +1389,9 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { } this_thr->th.th_dispatch = serial_team->t.t_dispatch; + /* allocate/push task team stack */ + __kmp_push_task_team_node(this_thr, serial_team); + KMP_MB(); } KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); @@ -1350,6 +1442,486 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { #endif } +// Test if this fork is for a team closely nested in a teams construct +static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th, + microtask_t microtask, int level, + int teams_level, kmp_va_list ap) { + return (master_th->th.th_teams_microtask && ap && + microtask != (microtask_t)__kmp_teams_master && level == teams_level); +} + +// Test if this fork is for the teams construct, i.e. to form the outer league +// of teams +static inline bool __kmp_is_entering_teams(int active_level, int level, + int teams_level, kmp_va_list ap) { + return ((ap == NULL && active_level == 0) || + (ap && teams_level > 0 && teams_level == level)); +} + +// AC: This is start of parallel that is nested inside teams construct. +// The team is actual (hot), all workers are ready at the fork barrier. +// No lock needed to initialize the team a bit, then free workers. +static inline int +__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team, + kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root, + enum fork_context_e call_context, microtask_t microtask, + launch_t invoker, int master_set_numthreads, int level, +#if OMPT_SUPPORT + ompt_data_t ompt_parallel_data, void *return_address, +#endif + kmp_va_list ap) { + void **argv; + int i; + + parent_team->t.t_ident = loc; + __kmp_alloc_argv_entries(argc, parent_team, TRUE); + parent_team->t.t_argc = argc; + argv = (void **)parent_team->t.t_argv; + for (i = argc - 1; i >= 0; --i) { + *argv++ = va_arg(kmp_va_deref(ap), void *); + } + // Increment our nested depth levels, but not increase the serialization + if (parent_team == master_th->th.th_serial_team) { + // AC: we are in serialized parallel + __kmpc_serialized_parallel(loc, gtid); + KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); + + if (call_context == fork_context_gnu) { + // AC: need to decrement t_serialized for enquiry functions to work + // correctly, will restore at join time + parent_team->t.t_serialized--; + return TRUE; + } + +#if OMPD_SUPPORT + parent_team->t.t_pkfn = microtask; +#endif + +#if OMPT_SUPPORT + void *dummy; + void **exit_frame_p; + ompt_data_t *implicit_task_data; + ompt_lw_taskteam_t lw_taskteam; + + if (ompt_enabled.enabled) { + __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, + &ompt_parallel_data, return_address); + exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); + + __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); + // Don't use lw_taskteam after linking. Content was swapped. 
+ + /* OMPT implicit task begin */ + implicit_task_data = OMPT_CUR_TASK_DATA(master_th); + if (ompt_enabled.ompt_callback_implicit_task) { + OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data, + 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + + /* OMPT state */ + master_th->th.ompt_thread_info.state = ompt_state_work_parallel; + } else { + exit_frame_p = &dummy; + } +#endif + + // AC: need to decrement t_serialized for enquiry functions to work + // correctly, will restore at join time + parent_team->t.t_serialized--; + + { + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); + __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv +#if OMPT_SUPPORT + , + exit_frame_p +#endif + ); + } + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + *exit_frame_p = NULL; + OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, implicit_task_data, 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); + __ompt_lw_taskteam_unlink(master_th); + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), + OMPT_INVOKER(call_context) | ompt_parallel_team, return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + return TRUE; + } + + parent_team->t.t_pkfn = microtask; + parent_team->t.t_invoke = invoker; + KMP_ATOMIC_INC(&root->r.r_in_parallel); + parent_team->t.t_active_level++; + parent_team->t.t_level++; + parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save + + // If the threads allocated to the team are less than the thread limit, update + // the thread limit here. th_teams_size.nth is specific to this team nested + // in a teams construct, the team is fully created, and we're about to do + // the actual fork. Best to do this here so that the subsequent uses below + // and in the join have the correct value. + master_th->th.th_teams_size.nth = parent_team->t.t_nproc; + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_lw_taskteam_t lw_taskteam; + __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data, + return_address); + __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); + } +#endif + + /* Change number of threads in the team if requested */ + if (master_set_numthreads) { // The parallel has num_threads clause + if (master_set_numthreads <= master_th->th.th_teams_size.nth) { + // AC: only can reduce number of threads dynamically, can't increase + kmp_info_t **other_threads = parent_team->t.t_threads; + // NOTE: if using distributed barrier, we need to run this code block + // even when the team size appears not to have changed from the max. 
+ int old_proc = master_th->th.th_teams_size.nth; + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads); + __kmp_add_threads_to_team(parent_team, master_set_numthreads); + } + parent_team->t.t_nproc = master_set_numthreads; + for (i = 0; i < master_set_numthreads; ++i) { + other_threads[i]->th.th_team_nproc = master_set_numthreads; + } + } + // Keep extra threads hot in the team for possible next parallels + master_th->th.th_set_nproc = 0; + } + +#if USE_DEBUGGER + if (__kmp_debugging) { // Let debugger override number of threads. + int nth = __kmp_omp_num_threads(loc); + if (nth > 0) { // 0 means debugger doesn't want to change num threads + master_set_numthreads = nth; + } + } +#endif + + // Figure out the proc_bind policy for the nested parallel within teams + kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; + // proc_bind_default means don't update + kmp_proc_bind_t proc_bind_icv = proc_bind_default; + if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { + proc_bind = proc_bind_false; + } else { + // No proc_bind clause specified; use current proc-bind-var + if (proc_bind == proc_bind_default) { + proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; + } + /* else: The proc_bind policy was specified explicitly on parallel clause. + This overrides proc-bind-var for this parallel region, but does not + change proc-bind-var. */ + // Figure the value of proc-bind-var for the child threads. + if ((level + 1 < __kmp_nested_proc_bind.used) && + (__kmp_nested_proc_bind.bind_types[level + 1] != + master_th->th.th_current_task->td_icvs.proc_bind)) { + proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; + } + } + KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); + // Need to change the bind-var ICV to correct value for each implicit task + if (proc_bind_icv != proc_bind_default && + master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { + kmp_info_t **other_threads = parent_team->t.t_threads; + for (i = 0; i < master_th->th.th_team_nproc; ++i) { + other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv; + } + } + // Reset for next parallel region + master_th->th.th_set_proc_bind = proc_bind_default; + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || + KMP_ITT_DEBUG) && + __kmp_forkjoin_frames_mode == 3 && + parent_team->t.t_active_level == 1 // only report frames at level 1 + && master_th->th.th_teams_size.nteams == 1) { + kmp_uint64 tmp_time = __itt_get_timestamp(); + master_th->th.th_frame_time = tmp_time; + parent_team->t.t_region_time = tmp_time; + } + if (__itt_stack_caller_create_ptr) { + KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); + // create new stack stitching id before entering fork barrier + parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ +#if KMP_AFFINITY_SUPPORTED + __kmp_partition_places(parent_team); +#endif + + KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, " + "master_th=%p, gtid=%d\n", + root, parent_team, master_th, gtid)); + __kmp_internal_fork(loc, gtid, parent_team); + KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, " + "master_th=%p, gtid=%d\n", + root, parent_team, master_th, gtid)); + + if (call_context == fork_context_gnu) + return TRUE; + + /* Invoke microtask for PRIMARY thread */ + KA_TRACE(20, 
("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid, + parent_team->t.t_id, parent_team->t.t_pkfn)); + + if (!parent_team->t.t_invoke(gtid)) { + KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); + } + KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid, + parent_team->t.t_id, parent_team->t.t_pkfn)); + KMP_MB(); /* Flush all pending memory write invalidates. */ + + KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid)); + + return TRUE; +} + +// Create a serialized parallel region +static inline int +__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context, + kmp_int32 argc, microtask_t microtask, launch_t invoker, + kmp_info_t *master_th, kmp_team_t *parent_team, +#if OMPT_SUPPORT + ompt_data_t *ompt_parallel_data, void **return_address, + ompt_data_t **parent_task_data, +#endif + kmp_va_list ap) { + kmp_team_t *team; + int i; + void **argv; + +/* josh todo: hypothetical question: what do we do for OS X*? */ +#if KMP_OS_LINUX && \ + (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) + SimpleVLA<void *> args(argc); +#else + void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); +#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ + KMP_ARCH_AARCH64) */ + + KA_TRACE( + 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid)); + + __kmpc_serialized_parallel(loc, gtid); + +#if OMPD_SUPPORT + master_th->th.th_serial_team->t.t_pkfn = microtask; +#endif + + if (call_context == fork_context_intel) { + /* TODO this sucks, use the compiler itself to pass args! :) */ + master_th->th.th_serial_team->t.t_ident = loc; + if (!ap) { + // revert change made in __kmpc_serialized_parallel() + master_th->th.th_serial_team->t.t_level--; +// Get args from parent team for teams construct + +#if OMPT_SUPPORT + void *dummy; + void **exit_frame_p; + ompt_task_info_t *task_info; + ompt_lw_taskteam_t lw_taskteam; + + if (ompt_enabled.enabled) { + __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, + ompt_parallel_data, *return_address); + + __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); + // don't use lw_taskteam after linking. 
content was swaped + task_info = OMPT_CUR_TASK_INFO(master_th); + exit_frame_p = &(task_info->frame.exit_frame.ptr); + if (ompt_enabled.ompt_callback_implicit_task) { + OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), + &(task_info->task_data), 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + + /* OMPT state */ + master_th->th.ompt_thread_info.state = ompt_state_work_parallel; + } else { + exit_frame_p = &dummy; + } +#endif + + { + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); + __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv +#if OMPT_SUPPORT + , + exit_frame_p +#endif + ); + } + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + *exit_frame_p = NULL; + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); + __ompt_lw_taskteam_unlink(master_th); + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + ompt_parallel_data, *parent_task_data, + OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + } else if (microtask == (microtask_t)__kmp_teams_master) { + KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team); + team = master_th->th.th_team; + // team->t.t_pkfn = microtask; + team->t.t_invoke = invoker; + __kmp_alloc_argv_entries(argc, team, TRUE); + team->t.t_argc = argc; + argv = (void **)team->t.t_argv; + for (i = argc - 1; i >= 0; --i) + *argv++ = va_arg(kmp_va_deref(ap), void *); + // AC: revert change made in __kmpc_serialized_parallel() + // because initial code in teams should have level=0 + team->t.t_level--; + // AC: call special invoker for outer "parallel" of teams construct + invoker(gtid); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), 0, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); + } + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + ompt_parallel_data, *parent_task_data, + OMPT_INVOKER(call_context) | ompt_parallel_league, + *return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + } else { + argv = args; + for (i = argc - 1; i >= 0; --i) + *argv++ = va_arg(kmp_va_deref(ap), void *); + KMP_MB(); + +#if OMPT_SUPPORT + void *dummy; + void **exit_frame_p; + ompt_task_info_t *task_info; + ompt_lw_taskteam_t lw_taskteam; + ompt_data_t *implicit_task_data; + + if (ompt_enabled.enabled) { + __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, + ompt_parallel_data, *return_address); + __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); + // don't use lw_taskteam after linking. 
content was swaped + task_info = OMPT_CUR_TASK_INFO(master_th); + exit_frame_p = &(task_info->frame.exit_frame.ptr); + + /* OMPT implicit task begin */ + implicit_task_data = OMPT_CUR_TASK_DATA(master_th); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), + implicit_task_data, 1, __kmp_tid_from_gtid(gtid), + ompt_task_implicit); + OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); + } + + /* OMPT state */ + master_th->th.ompt_thread_info.state = ompt_state_work_parallel; + } else { + exit_frame_p = &dummy; + } +#endif + + { + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); + __kmp_invoke_microtask(microtask, gtid, 0, argc, args +#if OMPT_SUPPORT + , + exit_frame_p +#endif + ); + } + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + *exit_frame_p = NULL; + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + + *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); + __ompt_lw_taskteam_unlink(master_th); + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + ompt_parallel_data, *parent_task_data, + OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + } + } else if (call_context == fork_context_gnu) { +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_lw_taskteam_t lwt; + __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data, + *return_address); + + lwt.ompt_task_info.frame.exit_frame = ompt_data_none; + __ompt_lw_taskteam_link(&lwt, master_th, 1); + } +// don't use lw_taskteam after linking. 
content was swaped +#endif + + // we were called from GNU native code + KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); + return FALSE; + } else { + KMP_ASSERT2(call_context < fork_context_last, + "__kmp_serial_fork_call: unknown fork_context parameter"); + } + + KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); + KMP_MB(); + return FALSE; +} + /* most of the work for a fork */ /* return true if we really went parallel, false if serialized */ int __kmp_fork_call(ident_t *loc, int gtid, @@ -1367,6 +1939,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, int nthreads; int master_active; int master_set_numthreads; + int task_thread_limit = 0; int level; int active_level; int teams_level; @@ -1395,20 +1968,23 @@ int __kmp_fork_call(ident_t *loc, int gtid, __kmp_resume_if_soft_paused(); /* setup current data */ - master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with - // shutdown + // AC: potentially unsafe, not in sync with library shutdown, + // __kmp_threads can be freed + master_th = __kmp_threads[gtid]; + parent_team = master_th->th.th_team; master_tid = master_th->th.th_info.ds.ds_tid; master_this_cons = master_th->th.th_local.this_construct; root = master_th->th.th_root; master_active = root->r.r_active; master_set_numthreads = master_th->th.th_set_nproc; + task_thread_limit = + master_th->th.th_current_task->td_icvs.task_thread_limit; #if OMPT_SUPPORT ompt_data_t ompt_parallel_data = ompt_data_none; - ompt_data_t *parent_task_data; - ompt_frame_t *ompt_frame; - ompt_data_t *implicit_task_data; + ompt_data_t *parent_task_data = NULL; + ompt_frame_t *ompt_frame = NULL; void *return_address = NULL; if (ompt_enabled.enabled) { @@ -1458,267 +2034,44 @@ int __kmp_fork_call(ident_t *loc, int gtid, master_th->th.th_ident = loc; - if (master_th->th.th_teams_microtask && ap && - microtask != (microtask_t)__kmp_teams_master && level == teams_level) { - // AC: This is start of parallel that is nested inside teams construct. - // The team is actual (hot), all workers are ready at the fork barrier. - // No lock needed to initialize the team a bit, then free workers. 
- parent_team->t.t_ident = loc; - __kmp_alloc_argv_entries(argc, parent_team, TRUE); - parent_team->t.t_argc = argc; - argv = (void **)parent_team->t.t_argv; - for (i = argc - 1; i >= 0; --i) - *argv++ = va_arg(kmp_va_deref(ap), void *); - // Increment our nested depth levels, but not increase the serialization - if (parent_team == master_th->th.th_serial_team) { - // AC: we are in serialized parallel - __kmpc_serialized_parallel(loc, gtid); - KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); - - if (call_context == fork_context_gnu) { - // AC: need to decrement t_serialized for enquiry functions to work - // correctly, will restore at join time - parent_team->t.t_serialized--; - return TRUE; - } - -#if OMPD_SUPPORT - parent_team->t.t_pkfn = microtask; -#endif - + // Parallel closely nested in teams construct: + if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) { + return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root, + call_context, microtask, invoker, + master_set_numthreads, level, #if OMPT_SUPPORT - void *dummy; - void **exit_frame_p; - - ompt_lw_taskteam_t lw_taskteam; - - if (ompt_enabled.enabled) { - __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, - &ompt_parallel_data, return_address); - exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); - - __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); - // don't use lw_taskteam after linking. content was swaped - - /* OMPT implicit task begin */ - implicit_task_data = OMPT_CUR_TASK_DATA(master_th); - if (ompt_enabled.ompt_callback_implicit_task) { - OMPT_CUR_TASK_INFO(master_th)->thread_num = - __kmp_tid_from_gtid(gtid); - ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), - implicit_task_data, 1, - OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); - } - - /* OMPT state */ - master_th->th.ompt_thread_info.state = ompt_state_work_parallel; - } else { - exit_frame_p = &dummy; - } -#endif - // AC: need to decrement t_serialized for enquiry functions to work - // correctly, will restore at join time - parent_team->t.t_serialized--; - - { - KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); - KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); - __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv -#if OMPT_SUPPORT - , - exit_frame_p -#endif - ); - } - -#if OMPT_SUPPORT - if (ompt_enabled.enabled) { - *exit_frame_p = NULL; - OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; - if (ompt_enabled.ompt_callback_implicit_task) { - ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_end, NULL, implicit_task_data, 1, - OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); - } - ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); - __ompt_lw_taskteam_unlink(master_th); - if (ompt_enabled.ompt_callback_parallel_end) { - ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( - &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), - OMPT_INVOKER(call_context) | ompt_parallel_team, - return_address); - } - master_th->th.ompt_thread_info.state = ompt_state_overhead; - } -#endif - return TRUE; - } - - parent_team->t.t_pkfn = microtask; - parent_team->t.t_invoke = invoker; - KMP_ATOMIC_INC(&root->r.r_in_parallel); - parent_team->t.t_active_level++; - parent_team->t.t_level++; - parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save - -#if OMPT_SUPPORT - if (ompt_enabled.enabled) { - ompt_lw_taskteam_t lw_taskteam; - __ompt_lw_taskteam_init(&lw_taskteam, 
master_th, gtid, - &ompt_parallel_data, return_address); - __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); - } -#endif - - /* Change number of threads in the team if requested */ - if (master_set_numthreads) { // The parallel has num_threads clause - if (master_set_numthreads <= master_th->th.th_teams_size.nth) { - // AC: only can reduce number of threads dynamically, can't increase - kmp_info_t **other_threads = parent_team->t.t_threads; - // NOTE: if using distributed barrier, we need to run this code block - // even when the team size appears not to have changed from the max. - int old_proc = master_th->th.th_teams_size.nth; - if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == - bp_dist_bar) { - __kmp_resize_dist_barrier(parent_team, old_proc, - master_set_numthreads); - __kmp_add_threads_to_team(parent_team, master_set_numthreads); - } - parent_team->t.t_nproc = master_set_numthreads; - for (i = 0; i < master_set_numthreads; ++i) { - other_threads[i]->th.th_team_nproc = master_set_numthreads; - } - } - // Keep extra threads hot in the team for possible next parallels - master_th->th.th_set_nproc = 0; - } - -#if USE_DEBUGGER - if (__kmp_debugging) { // Let debugger override number of threads. - int nth = __kmp_omp_num_threads(loc); - if (nth > 0) { // 0 means debugger doesn't want to change num threads - master_set_numthreads = nth; - } - } -#endif - - // Figure out the proc_bind policy for the nested parallel within teams - kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; - // proc_bind_default means don't update - kmp_proc_bind_t proc_bind_icv = proc_bind_default; - if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { - proc_bind = proc_bind_false; - } else { - // No proc_bind clause specified; use current proc-bind-var - if (proc_bind == proc_bind_default) { - proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; - } - /* else: The proc_bind policy was specified explicitly on parallel - clause. - This overrides proc-bind-var for this parallel region, but does not - change proc-bind-var. */ - // Figure the value of proc-bind-var for the child threads. 
- if ((level + 1 < __kmp_nested_proc_bind.used) && - (__kmp_nested_proc_bind.bind_types[level + 1] != - master_th->th.th_current_task->td_icvs.proc_bind)) { - proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; - } - } - KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); - // Need to change the bind-var ICV to correct value for each implicit task - if (proc_bind_icv != proc_bind_default && - master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { - kmp_info_t **other_threads = parent_team->t.t_threads; - for (i = 0; i < master_th->th.th_team_nproc; ++i) { - other_threads[i]->th.th_current_task->td_icvs.proc_bind = - proc_bind_icv; - } - } - // Reset for next parallel region - master_th->th.th_set_proc_bind = proc_bind_default; - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || - KMP_ITT_DEBUG) && - __kmp_forkjoin_frames_mode == 3 && - parent_team->t.t_active_level == 1 // only report frames at level 1 - && master_th->th.th_teams_size.nteams == 1) { - kmp_uint64 tmp_time = __itt_get_timestamp(); - master_th->th.th_frame_time = tmp_time; - parent_team->t.t_region_time = tmp_time; - } - if (__itt_stack_caller_create_ptr) { - KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); - // create new stack stitching id before entering fork barrier - parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); - } -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ -#if KMP_AFFINITY_SUPPORTED - __kmp_partition_places(parent_team); -#endif - - KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " - "master_th=%p, gtid=%d\n", - root, parent_team, master_th, gtid)); - __kmp_internal_fork(loc, gtid, parent_team); - KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " - "master_th=%p, gtid=%d\n", - root, parent_team, master_th, gtid)); - - if (call_context == fork_context_gnu) - return TRUE; - - /* Invoke microtask for PRIMARY thread */ - KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, - parent_team->t.t_id, parent_team->t.t_pkfn)); - - if (!parent_team->t.t_invoke(gtid)) { - KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); - } - KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, - parent_team->t.t_id, parent_team->t.t_pkfn)); - KMP_MB(); /* Flush all pending memory write invalidates. 
*/ - - KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); - - return TRUE; - } // Parallel closely nested in teams construct - -#if KMP_DEBUG - if (__kmp_tasking_mode != tskm_immediate_exec) { - KMP_DEBUG_ASSERT(master_th->th.th_task_team == - parent_team->t.t_task_team[master_th->th.th_task_state]); - } + ompt_parallel_data, return_address, #endif + ap); + } // End parallel closely nested in teams construct // Need this to happen before we determine the number of threads, not while // we are allocating the team //__kmp_push_current_task_to_thread(master_th, parent_team, 0); - int enter_teams = 0; - if (parent_team->t.t_active_level >= - master_th->th.th_current_task->td_icvs.max_active_levels) { + + KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th); + + // Determine the number of threads + int enter_teams = + __kmp_is_entering_teams(active_level, level, teams_level, ap); + if ((!enter_teams && + (parent_team->t.t_active_level >= + master_th->th.th_current_task->td_icvs.max_active_levels)) || + (__kmp_library == library_serial)) { + KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid)); nthreads = 1; } else { - enter_teams = ((ap == NULL && active_level == 0) || - (ap && teams_level > 0 && teams_level == level)); nthreads = master_set_numthreads ? master_set_numthreads // TODO: get nproc directly from current task : get__nproc_2(parent_team, master_tid); + // Use the thread_limit set for the current target task if exists, else go + // with the deduced nthreads + nthreads = task_thread_limit > 0 && task_thread_limit < nthreads + ? task_thread_limit + : nthreads; // Check if we need to take forkjoin lock? (no need for serialized - // parallel out of teams construct). This code moved here from - // __kmp_reserve_threads() to speedup nested serialized parallels. - if (nthreads > 1) { - if ((get__max_active_levels(master_th) == 1 && - (root->r.r_in_parallel && !enter_teams)) || - (__kmp_library == library_serial)) { - KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" - " threads\n", - gtid, nthreads)); - nthreads = 1; - } - } + // parallel out of teams construct). if (nthreads > 1) { /* determine how many new threads we can use */ __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); @@ -1741,232 +2094,14 @@ int __kmp_fork_call(ident_t *loc, int gtid, // If we temporarily changed the set number of threads then restore it now master_th->th.th_set_nproc = 0; - /* create a serialized parallel region? */ if (nthreads == 1) { -/* josh todo: hypothetical question: what do we do for OS X*? */ -#if KMP_OS_LINUX && \ - (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) - void *args[argc]; -#else - void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); -#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ - KMP_ARCH_AARCH64) */ - - KA_TRACE(20, - ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); - - __kmpc_serialized_parallel(loc, gtid); - -#if OMPD_SUPPORT - master_th->th.th_serial_team->t.t_pkfn = microtask; -#endif - - if (call_context == fork_context_intel) { - /* TODO this sucks, use the compiler itself to pass args! 
:) */ - master_th->th.th_serial_team->t.t_ident = loc; - if (!ap) { - // revert change made in __kmpc_serialized_parallel() - master_th->th.th_serial_team->t.t_level--; - // Get args from parent team for teams construct - -#if OMPT_SUPPORT - void *dummy; - void **exit_frame_p; - ompt_task_info_t *task_info; - - ompt_lw_taskteam_t lw_taskteam; - - if (ompt_enabled.enabled) { - __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, - &ompt_parallel_data, return_address); - - __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); - // don't use lw_taskteam after linking. content was swaped - - task_info = OMPT_CUR_TASK_INFO(master_th); - exit_frame_p = &(task_info->frame.exit_frame.ptr); - if (ompt_enabled.ompt_callback_implicit_task) { - OMPT_CUR_TASK_INFO(master_th)->thread_num = - __kmp_tid_from_gtid(gtid); - ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), - &(task_info->task_data), 1, - OMPT_CUR_TASK_INFO(master_th)->thread_num, - ompt_task_implicit); - } - - /* OMPT state */ - master_th->th.ompt_thread_info.state = ompt_state_work_parallel; - } else { - exit_frame_p = &dummy; - } -#endif - - { - KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); - KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); - __kmp_invoke_microtask(microtask, gtid, 0, argc, - parent_team->t.t_argv -#if OMPT_SUPPORT - , - exit_frame_p -#endif - ); - } - -#if OMPT_SUPPORT - if (ompt_enabled.enabled) { - *exit_frame_p = NULL; - if (ompt_enabled.ompt_callback_implicit_task) { - ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_end, NULL, &(task_info->task_data), 1, - OMPT_CUR_TASK_INFO(master_th)->thread_num, - ompt_task_implicit); - } - ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); - __ompt_lw_taskteam_unlink(master_th); - if (ompt_enabled.ompt_callback_parallel_end) { - ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( - &ompt_parallel_data, parent_task_data, - OMPT_INVOKER(call_context) | ompt_parallel_team, - return_address); - } - master_th->th.ompt_thread_info.state = ompt_state_overhead; - } -#endif - } else if (microtask == (microtask_t)__kmp_teams_master) { - KMP_DEBUG_ASSERT(master_th->th.th_team == - master_th->th.th_serial_team); - team = master_th->th.th_team; - // team->t.t_pkfn = microtask; - team->t.t_invoke = invoker; - __kmp_alloc_argv_entries(argc, team, TRUE); - team->t.t_argc = argc; - argv = (void **)team->t.t_argv; - if (ap) { - for (i = argc - 1; i >= 0; --i) - *argv++ = va_arg(kmp_va_deref(ap), void *); - } else { - for (i = 0; i < argc; ++i) - // Get args from parent team for teams construct - argv[i] = parent_team->t.t_argv[i]; - } - // AC: revert change made in __kmpc_serialized_parallel() - // because initial code in teams should have level=0 - team->t.t_level--; - // AC: call special invoker for outer "parallel" of teams construct - invoker(gtid); -#if OMPT_SUPPORT - if (ompt_enabled.enabled) { - ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); - if (ompt_enabled.ompt_callback_implicit_task) { - ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_end, NULL, &(task_info->task_data), 0, - OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); - } - if (ompt_enabled.ompt_callback_parallel_end) { - ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( - &ompt_parallel_data, parent_task_data, - OMPT_INVOKER(call_context) | ompt_parallel_league, - return_address); - } - master_th->th.ompt_thread_info.state = ompt_state_overhead; - } -#endif - } else { - argv = args; - for 
(i = argc - 1; i >= 0; --i) - *argv++ = va_arg(kmp_va_deref(ap), void *); - KMP_MB(); - -#if OMPT_SUPPORT - void *dummy; - void **exit_frame_p; - ompt_task_info_t *task_info; - - ompt_lw_taskteam_t lw_taskteam; - - if (ompt_enabled.enabled) { - __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, - &ompt_parallel_data, return_address); - __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); - // don't use lw_taskteam after linking. content was swaped - task_info = OMPT_CUR_TASK_INFO(master_th); - exit_frame_p = &(task_info->frame.exit_frame.ptr); - - /* OMPT implicit task begin */ - implicit_task_data = OMPT_CUR_TASK_DATA(master_th); - if (ompt_enabled.ompt_callback_implicit_task) { - ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), - implicit_task_data, 1, __kmp_tid_from_gtid(gtid), - ompt_task_implicit); - OMPT_CUR_TASK_INFO(master_th)->thread_num = - __kmp_tid_from_gtid(gtid); - } - - /* OMPT state */ - master_th->th.ompt_thread_info.state = ompt_state_work_parallel; - } else { - exit_frame_p = &dummy; - } -#endif - - { - KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); - KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); - __kmp_invoke_microtask(microtask, gtid, 0, argc, args + return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask, + invoker, master_th, parent_team, #if OMPT_SUPPORT - , - exit_frame_p + &ompt_parallel_data, &return_address, + &parent_task_data, #endif - ); - } - -#if OMPT_SUPPORT - if (ompt_enabled.enabled) { - *exit_frame_p = NULL; - if (ompt_enabled.ompt_callback_implicit_task) { - ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_end, NULL, &(task_info->task_data), 1, - OMPT_CUR_TASK_INFO(master_th)->thread_num, - ompt_task_implicit); - } - - ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); - __ompt_lw_taskteam_unlink(master_th); - if (ompt_enabled.ompt_callback_parallel_end) { - ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( - &ompt_parallel_data, parent_task_data, - OMPT_INVOKER(call_context) | ompt_parallel_team, - return_address); - } - master_th->th.ompt_thread_info.state = ompt_state_overhead; - } -#endif - } - } else if (call_context == fork_context_gnu) { -#if OMPT_SUPPORT - ompt_lw_taskteam_t lwt; - __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, - return_address); - - lwt.ompt_task_info.frame.exit_frame = ompt_data_none; - __ompt_lw_taskteam_link(&lwt, master_th, 1); -// don't use lw_taskteam after linking. content was swaped -#endif - - // we were called from GNU native code - KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); - return FALSE; - } else { - KMP_ASSERT2(call_context < fork_context_last, - "__kmp_fork_call: unknown fork_context parameter"); - } - - KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); - KMP_MB(); - return FALSE; + ap); } // if (nthreads == 1) // GEH: only modify the executing flag in the case when not serialized @@ -1988,9 +2123,18 @@ int __kmp_fork_call(ident_t *loc, int gtid, // See if we need to make a copy of the ICVs. 
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; - if ((level + 1 < __kmp_nested_nth.used) && - (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { - nthreads_icv = __kmp_nested_nth.nth[level + 1]; + kmp_nested_nthreads_t *nested_nth = NULL; + if (!master_th->th.th_set_nested_nth && + (level + 1 < parent_team->t.t_nested_nth->used) && + (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) { + nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1]; + } else if (master_th->th.th_set_nested_nth) { + nested_nth = __kmp_override_nested_nth(master_th, level); + if ((level + 1 < nested_nth->used) && + (nested_nth->nth[level + 1] != nthreads_icv)) + nthreads_icv = nested_nth->nth[level + 1]; + else + nthreads_icv = 0; // don't update } else { nthreads_icv = 0; // don't update } @@ -2099,6 +2243,24 @@ int __kmp_fork_call(ident_t *loc, int gtid, KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); + // Check if hot team has potentially outdated list, and if so, free it + if (team->t.t_nested_nth && + team->t.t_nested_nth != parent_team->t.t_nested_nth) { + KMP_INTERNAL_FREE(team->t.t_nested_nth->nth); + KMP_INTERNAL_FREE(team->t.t_nested_nth); + team->t.t_nested_nth = NULL; + } + team->t.t_nested_nth = parent_team->t.t_nested_nth; + if (master_th->th.th_set_nested_nth) { + if (!nested_nth) + nested_nth = __kmp_override_nested_nth(master_th, level); + team->t.t_nested_nth = nested_nth; + KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth); + master_th->th.th_set_nested_nth = NULL; + master_th->th.th_set_nested_nth_sz = 0; + master_th->th.th_nt_strict = false; + } + // Update the floating point rounding in the team if required. propagateFPControl(team); #if OMPD_SUPPORT @@ -2106,64 +2268,6 @@ int __kmp_fork_call(ident_t *loc, int gtid, ompd_bp_parallel_begin(); #endif - if (__kmp_tasking_mode != tskm_immediate_exec) { - // Set primary thread's task team to team's task team. Unless this is hot - // team, it should be NULL. 
- KMP_DEBUG_ASSERT(master_th->th.th_task_team == - parent_team->t.t_task_team[master_th->th.th_task_state]); - KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " - "%p, new task_team %p / team %p\n", - __kmp_gtid_from_thread(master_th), - master_th->th.th_task_team, parent_team, - team->t.t_task_team[master_th->th.th_task_state], team)); - - if (active_level || master_th->th.th_task_team) { - // Take a memo of primary thread's task_state - KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); - if (master_th->th.th_task_state_top >= - master_th->th.th_task_state_stack_sz) { // increase size - kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; - kmp_uint8 *old_stack, *new_stack; - kmp_uint32 i; - new_stack = (kmp_uint8 *)__kmp_allocate(new_size); - for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { - new_stack[i] = master_th->th.th_task_state_memo_stack[i]; - } - for (i = master_th->th.th_task_state_stack_sz; i < new_size; - ++i) { // zero-init rest of stack - new_stack[i] = 0; - } - old_stack = master_th->th.th_task_state_memo_stack; - master_th->th.th_task_state_memo_stack = new_stack; - master_th->th.th_task_state_stack_sz = new_size; - __kmp_free(old_stack); - } - // Store primary thread's task_state on stack - master_th->th - .th_task_state_memo_stack[master_th->th.th_task_state_top] = - master_th->th.th_task_state; - master_th->th.th_task_state_top++; -#if KMP_NESTED_HOT_TEAMS - if (master_th->th.th_hot_teams && - active_level < __kmp_hot_teams_max_level && - team == master_th->th.th_hot_teams[active_level].hot_team) { - // Restore primary thread's nested state if nested hot team - master_th->th.th_task_state = - master_th->th - .th_task_state_memo_stack[master_th->th.th_task_state_top]; - } else { -#endif - master_th->th.th_task_state = 0; -#if KMP_NESTED_HOT_TEAMS - } -#endif - } -#if !KMP_NESTED_HOT_TEAMS - KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || - (team == root->r.r_hot_team)); -#endif - } - KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", @@ -2371,8 +2475,7 @@ void __kmp_join_call(ident_t *loc, int gtid __kmp_gtid_from_thread(master_th), team, team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team)); - KMP_DEBUG_ASSERT(master_th->th.th_task_team == - team->t.t_task_team[master_th->th.th_task_state]); + KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th); } #endif @@ -2396,6 +2499,9 @@ void __kmp_join_call(ident_t *loc, int gtid #if OMPT_SUPPORT if (ompt_enabled.enabled) { + if (fork_context == fork_context_gnu) { + __ompt_lw_taskteam_unlink(master_th); + } __kmp_join_restore_state(master_th, parent_team); } #endif @@ -2430,12 +2536,6 @@ void __kmp_join_call(ident_t *loc, int gtid parent_team->t.t_stack_id = NULL; } #endif - - if (team->t.t_nproc > 1 && - __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { - team->t.b->update_num_threads(team->t.t_nproc); - __kmp_add_threads_to_team(team, team->t.t_nproc); - } } KMP_MB(); @@ -2613,18 +2713,11 @@ void __kmp_join_call(ident_t *loc, int gtid } if (__kmp_tasking_mode != tskm_immediate_exec) { - if (master_th->th.th_task_state_top > - 0) { // Restore task state from memo stack - KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); - // Remember primary thread's state if we re-use this nested hot team - master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = - master_th->th.th_task_state; - --master_th->th.th_task_state_top; // pop - // Now restore state at this level - 
master_th->th.th_task_state = - master_th->th - .th_task_state_memo_stack[master_th->th.th_task_state_top]; - } + // Restore primary thread's task state from team structure + KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 || + team->t.t_primary_task_state == 1); + master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state; + // Copy the task team from the parent team to the primary thread master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state]; @@ -2642,7 +2735,7 @@ void __kmp_join_call(ident_t *loc, int gtid __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); #if KMP_AFFINITY_SUPPORTED - if (master_th->th.th_team->t.t_level == 0 && __kmp_affin_reset) { + if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) { __kmp_reset_root_init_mask(gtid); } #endif @@ -3223,6 +3316,8 @@ static kmp_internal_control_t __kmp_get_global_icvs(void) { // next parallel region (per thread) // (use a max ub on value if __kmp_parallel_initialize not called yet) __kmp_cg_max_nth, // int thread_limit; + __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit + // on task. This is used in the case of target thread_limit __kmp_dflt_max_active_levels, // int max_active_levels; //internal control // for max_active_levels r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule @@ -3299,6 +3394,7 @@ static void __kmp_initialize_root(kmp_root_t *root) { root_team->t.t_serialized = 1; // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; root_team->t.t_sched.sched = r_sched.sched; + root_team->t.t_nested_nth = &__kmp_nested_nth; KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", @@ -3336,6 +3432,7 @@ static void __kmp_initialize_root(kmp_root_t *root) { // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; hot_team->t.t_sched.sched = r_sched.sched; hot_team->t.t_size_changed = 0; + hot_team->t.t_nested_nth = &__kmp_nested_nth; } #ifdef KMP_DEBUG @@ -3934,7 +4031,7 @@ int __kmp_register_root(int initial_thread) { __kmp_root_counter++; #if OMPT_SUPPORT - if (!initial_thread && ompt_enabled.enabled) { + if (ompt_enabled.enabled) { kmp_info_t *root_thread = ompt_get_thread(); @@ -4202,6 +4299,7 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, else // no tasking --> always safe to reap this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; this_thr->th.th_set_proc_bind = proc_bind_default; + #if KMP_AFFINITY_SUPPORTED this_thr->th.th_new_place = this_thr->th.th_current_place; #endif @@ -4311,17 +4409,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, this_thr->th.th_next_pool = NULL; - if (!this_thr->th.th_task_state_memo_stack) { - size_t i; - this_thr->th.th_task_state_memo_stack = - (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); - this_thr->th.th_task_state_top = 0; - this_thr->th.th_task_state_stack_sz = 4; - for (i = 0; i < this_thr->th.th_task_state_stack_sz; - ++i) // zero init the stack - this_thr->th.th_task_state_memo_stack[i] = 0; - } - KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); @@ -4346,8 +4433,10 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, #endif KMP_MB(); - /* first, try to get one from the thread pool */ - if (__kmp_thread_pool) { + /* first, try to get one from the thread pool unless allocating thread is + * the main hidden helper thread. 
The hidden helper team should always + * allocate new OS threads. */ + if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) { new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; if (new_thr == __kmp_thread_pool_insert_pt) { @@ -4376,8 +4465,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, TCW_4(__kmp_nth, __kmp_nth + 1); new_thr->th.th_task_state = 0; - new_thr->th.th_task_state_top = 0; - new_thr->th.th_task_state_stack_sz = 4; if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { // Make sure pool thread has transitioned to waiting on own thread struct @@ -4412,7 +4499,7 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, } /* no, well fork a new one */ - KMP_ASSERT(__kmp_nth == __kmp_all_nth); + KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth); KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); #if KMP_USE_MONITOR @@ -4465,6 +4552,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, /* allocate space for it. */ new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); + new_thr->th.th_nt_strict = false; + new_thr->th.th_nt_loc = NULL; + new_thr->th.th_nt_sev = severity_fatal; + new_thr->th.th_nt_msg = NULL; + TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG @@ -4575,6 +4667,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, new_thr->th.th_active_in_pool = FALSE; TCW_4(new_thr->th.th_active, TRUE); + new_thr->th.th_set_nested_nth = NULL; + new_thr->th.th_set_nested_nth_sz = 0; + /* adjust the global counters */ __kmp_all_nth++; __kmp_nth++; @@ -4603,6 +4698,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, } #endif /* KMP_ADJUST_BLOCKTIME */ +#if KMP_AFFINITY_SUPPORTED + // Set the affinity and topology information for new thread + __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE); +#endif + /* actually fork it and create the new worker thread */ KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); @@ -4695,26 +4795,20 @@ static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); } -#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED -/* Sets full mask for thread and returns old mask, no changes to structures. */ -static void -__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { - if (KMP_AFFINITY_CAPABLE()) { - int status; - if (old_mask != NULL) { - status = __kmp_get_system_affinity(old_mask, TRUE); - int error = errno; - if (status != 0) { - __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), - __kmp_msg_null); - } - } - __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); +#if KMP_AFFINITY_SUPPORTED +static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th, + int first, int last, int newp) { + th->th.th_first_place = first; + th->th.th_last_place = last; + th->th.th_new_place = newp; + if (newp != th->th.th_current_place) { + if (__kmp_display_affinity && team->t.t_display_affinity != 1) + team->t.t_display_affinity = 1; + // Copy topology information associated with the new place + th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place]; + th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place]; } } -#endif - -#if KMP_AFFINITY_SUPPORTED // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
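Every placement loop in the partitioning code below advances place around a ring: it wraps at the end of the team's partition [first_place, last_place] and, because a partition may itself straddle the end of the mask array, also at num_masks - 1. A minimal standalone sketch of that step, with a hypothetical helper name (the runtime inlines this logic at each use site):

    // Advance to the next place in a circular partition over num_masks masks.
    static int next_place(int place, int first, int last, int num_masks) {
      if (place == last)          // reached the end of the team's partition
        return first;             // wrap to the partition start
      if (place == num_masks - 1) // reached the end of the mask array
        return 0;                 // wrap around the global ring
      return place + 1;           // otherwise step to the adjacent place
    }

A wrapped partition such as [6, 2] on a machine with eight masks then visits 6, 7, 0, 1, 2, which matches the n_places = num_masks - first_place + last_place + 1 count computed below.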
// It calculates the worker + primary thread's partition based upon the parent @@ -4731,6 +4825,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { int first_place = master_th->th.th_first_place; int last_place = master_th->th.th_last_place; int masters_place = master_th->th.th_current_place; + int num_masks = __kmp_affinity.num_masks; team->t.t_first_place = first_place; team->t.t_last_place = last_place; @@ -4753,13 +4848,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { for (f = 1; f < n_th; f++) { kmp_info_t *th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th != NULL); - th->th.th_first_place = first_place; - th->th.th_last_place = last_place; - th->th.th_new_place = masters_place; - if (__kmp_display_affinity && masters_place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + __kmp_set_thread_place(team, th, first_place, last_place, masters_place); KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " "partition = [%d,%d]\n", @@ -4775,7 +4864,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { if (first_place <= last_place) { n_places = last_place - first_place + 1; } else { - n_places = __kmp_affinity_num_masks - first_place + last_place + 1; + n_places = num_masks - first_place + last_place + 1; } if (n_th <= n_places) { int place = masters_place; @@ -4785,18 +4874,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { if (place == last_place) { place = first_place; - } else if (place == (int)(__kmp_affinity_num_masks - 1)) { + } else if (place == (num_masks - 1)) { place = 0; } else { place++; } - th->th.th_first_place = first_place; - th->th.th_last_place = last_place; - th->th.th_new_place = place; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + __kmp_set_thread_place(team, th, first_place, last_place, place); KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " "partition = [%d,%d]\n", @@ -4815,13 +4898,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { kmp_info_t *th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th != NULL); - th->th.th_first_place = first_place; - th->th.th_last_place = last_place; - th->th.th_new_place = place; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + __kmp_set_thread_place(team, th, first_place, last_place, place); s_count++; if ((s_count == S) && rem && (gap_ct == gap)) { @@ -4830,7 +4907,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { // we added an extra thread to this place; move to next place if (place == last_place) { place = first_place; - } else if (place == (int)(__kmp_affinity_num_masks - 1)) { + } else if (place == (num_masks - 1)) { place = 0; } else { place++; @@ -4841,7 +4918,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { } else if (s_count == S) { // place full; don't add extra if (place == last_place) { place = first_place; - } else if (place == (int)(__kmp_affinity_num_masks - 1)) { + } else if (place == (num_masks - 1)) { place = 0; } else { place++; @@ -4868,12 +4945,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { if (first_place <= last_place) { n_places = last_place - first_place + 1; } else { 
- n_places = __kmp_affinity_num_masks - first_place + last_place + 1; + n_places = num_masks - first_place + last_place + 1; } if (n_th <= n_places) { int place = -1; - if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { + if (n_places != num_masks) { int S = n_places / n_th; int s_count, rem, gap, gap_ct; @@ -4888,17 +4965,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { kmp_info_t *th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th != NULL); - th->th.th_first_place = place; - th->th.th_new_place = place; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + int fplace = place, nplace = place; s_count = 1; while (s_count < S) { if (place == last_place) { place = first_place; - } else if (place == (int)(__kmp_affinity_num_masks - 1)) { + } else if (place == (num_masks - 1)) { place = 0; } else { place++; @@ -4908,7 +4980,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { if (rem && (gap_ct == gap)) { if (place == last_place) { place = first_place; - } else if (place == (int)(__kmp_affinity_num_masks - 1)) { + } else if (place == (num_masks - 1)) { place = 0; } else { place++; @@ -4916,12 +4988,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { rem--; gap_ct = 0; } - th->th.th_last_place = place; + __kmp_set_thread_place(team, th, fplace, place, nplace); gap_ct++; if (place == last_place) { place = first_place; - } else if (place == (int)(__kmp_affinity_num_masks - 1)) { + } else if (place == (num_masks - 1)) { place = 0; } else { place++; @@ -4929,10 +5001,10 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " - "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", + "partition = [%d,%d], num_masks: %u\n", __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, th->th.th_new_place, th->th.th_first_place, - th->th.th_last_place, __kmp_affinity_num_masks)); + th->th.th_last_place, num_masks)); } } else { /* Having uniform space of available computation places I can create @@ -4982,13 +5054,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { KMP_DEBUG_ASSERT(last_place >= first_place); th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th); - th->th.th_first_place = first; - th->th.th_new_place = place; - th->th.th_last_place = last; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + __kmp_set_thread_place(team, th, first, last, place); KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " "partition = [%d,%d], spacing = %.4f\n", @@ -5014,13 +5080,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { kmp_info_t *th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th != NULL); - th->th.th_first_place = place; - th->th.th_last_place = place; - th->th.th_new_place = place; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + __kmp_set_thread_place(team, th, place, place, place); s_count++; if ((s_count == S) && rem && (gap_ct == gap)) { @@ -5029,7 +5089,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { // we added an extra thread to this place; move on to next place if (place == last_place) { place = first_place; 
- } else if (place == (int)(__kmp_affinity_num_masks - 1)) { + } else if (place == (num_masks - 1)) { place = 0; } else { place++; @@ -5040,7 +5100,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { } else if (s_count == S) { // place is full; don't add extra thread if (place == last_place) { place = first_place; - } else if (place == (int)(__kmp_affinity_num_masks - 1)) { + } else if (place == (num_masks - 1)) { place = 0; } else { place++; @@ -5210,6 +5270,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, // Activate team threads via th_used_in_team __kmp_add_threads_to_team(team, new_nproc); } + // When decreasing team size, threads no longer in the team should + // unref task team. + if (__kmp_tasking_mode != tskm_immediate_exec) { + for (f = new_nproc; f < team->t.t_nproc; f++) { + kmp_info_t *th = team->t.t_threads[f]; + KMP_DEBUG_ASSERT(th); + th->th.th_task_team = NULL; + } + } #if KMP_NESTED_HOT_TEAMS if (__kmp_hot_teams_mode == 0) { // AC: saved number of threads should correspond to team's value in this @@ -5220,11 +5289,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, /* release the extra threads we don't need any more */ for (f = new_nproc; f < team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); - if (__kmp_tasking_mode != tskm_immediate_exec) { - // When decreasing team size, threads no longer in the team should - // unref task team. - team->t.t_threads[f]->th.th_task_team = NULL; - } __kmp_free_thread(team->t.t_threads[f]); team->t.t_threads[f] = NULL; } @@ -5278,12 +5342,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #endif } } else { // team->t.t_nproc < new_nproc -#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED - kmp_affin_mask_t *old_mask; - if (KMP_AFFINITY_CAPABLE()) { - KMP_CPU_ALLOC(old_mask); - } -#endif KA_TRACE(20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", @@ -5326,13 +5384,14 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, __kmp_reinitialize_team(team, new_icvs, NULL); } -#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED +#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \ + KMP_AFFINITY_SUPPORTED /* Temporarily set full mask for primary thread before creation of workers. The reason is that workers inherit the affinity from the primary thread, so if a lot of workers are created on the single core quickly, they don't get a chance to set their own affinity for a long time. */ - __kmp_set_thread_affinity_mask_full_tmp(old_mask); + kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask}; #endif /* allocate new threads for the hot team */ @@ -5362,12 +5421,10 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } } -#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED - if (KMP_AFFINITY_CAPABLE()) { - /* Restore initial primary thread's affinity mask */ - __kmp_set_system_affinity(old_mask, TRUE); - KMP_CPU_FREE(old_mask); - } +#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \ + KMP_AFFINITY_SUPPORTED + /* Restore initial primary thread's affinity mask */ + new_temp_affinity.restore(); #endif #if KMP_NESTED_HOT_TEAMS } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth @@ -5388,21 +5445,10 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, __kmp_initialize_info(team->t.t_threads[f], team, f, __kmp_gtid_from_tid(f, team)); - if (level) { // set th_task_state for new threads in nested hot team - // __kmp_initialize_info() no longer zeroes th_task_state, so we should - // only need to set the th_task_state for the new threads. th_task_state - // for primary thread will not be accurate until after this in - // __kmp_fork_call(), so we look to the primary thread's memo_stack to - // get the correct value. - for (f = old_nproc; f < team->t.t_nproc; ++f) - team->t.t_threads[f]->th.th_task_state = - team->t.t_threads[0]->th.th_task_state_memo_stack[level]; - } else { // set th_task_state for new threads in non-nested hot team - // copy primary thread's state - kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; - for (f = old_nproc; f < team->t.t_nproc; ++f) - team->t.t_threads[f]->th.th_task_state = old_state; - } + // set th_task_state for new threads in hot team with older thread's state + kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state; + for (f = old_nproc; f < team->t.t_nproc; ++f) + team->t.t_threads[f]->th.th_task_state = old_state; #ifdef KMP_DEBUG for (f = 0; f < team->t.t_nproc; ++f) { @@ -5420,7 +5466,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } } // Check changes in number of threads - kmp_info_t *master = team->t.t_threads[0]; if (master->th.th_teams_microtask) { for (f = 1; f < new_nproc; ++f) { // propagate teams construct specific info to workers @@ -5526,6 +5571,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, __ompt_team_assign_id(team, ompt_parallel_data); #endif + team->t.t_nested_nth = NULL; + KMP_MB(); return team; @@ -5597,6 +5644,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, KMP_MB(); + team->t.t_nested_nth = NULL; + KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id)); @@ -5672,9 +5721,8 @@ void __kmp_free_team(kmp_root_t *root, } #endif // first check if thread is sleeping - kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); - if (fl.is_sleeping()) - fl.resume(__kmp_gtid_from_thread(th)); + if (th->th.th_sleep_loc) + __kmp_null_resume_wrapper(th); KMP_CPU_PAUSE(); } } @@ -5700,6 +5748,14 @@ void __kmp_free_team(kmp_root_t *root, } } + // Before clearing parent pointer, check if nested_nth list should be freed + if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth && + team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) { + KMP_INTERNAL_FREE(team->t.t_nested_nth->nth); + KMP_INTERNAL_FREE(team->t.t_nested_nth); + } + team->t.t_nested_nth = NULL; + // Reset pointer to parent team only for non-hot teams. 
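The free path above encodes a simple ownership rule for t_nested_nth: the list is released only when the team actually owns it, i.e. it is neither the global __kmp_nested_nth default installed during root initialization nor a list shared with the parent team. A reduced sketch of that rule, with the kmp types replaced by a simplified stand-in and std::free standing in for KMP_INTERNAL_FREE:

    #include <cstdlib>

    struct nth_list { // simplified stand-in for kmp_nested_nthreads_t
      int *nth;
      int size, used;
    };

    // Release *slot only if this team owns it: not the process-wide default
    // and not borrowed from the parent. Always clear the slot afterwards.
    static void release_nested_nth(nth_list **slot, const nth_list *global_dflt,
                                   const nth_list *parent_list) {
      if (*slot && *slot != global_dflt && *slot != parent_list) {
        std::free((*slot)->nth); // the int array is a separate allocation
        std::free(*slot);
      }
      *slot = nullptr;
    }

Clearing the pointer unconditionally matters because freed teams are recycled through the team pool and must not carry a dangling list into their next use.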
team->t.t_parent = NULL; team->t.t_level = 0; @@ -5709,8 +5765,8 @@ void __kmp_free_team(kmp_root_t *root, for (f = 1; f < team->t.t_nproc; ++f) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { - KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), - 1, 2); + (void)KMP_COMPARE_AND_STORE_ACQ32( + &(team->t.t_threads[f]->th.th_used_in_team), 1, 2); } __kmp_free_thread(team->t.t_threads[f]); } @@ -6047,7 +6103,6 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) { __kmp_join_barrier(gtid); } } - TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); #if OMPD_SUPPORT if (ompd_state & OMPD_ENABLE_BP) @@ -6216,11 +6271,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { thread->th.th_pri_common = NULL; } - if (thread->th.th_task_state_memo_stack != NULL) { - __kmp_free(thread->th.th_task_state_memo_stack); - thread->th.th_task_state_memo_stack = NULL; - } - #if KMP_USE_BGET if (thread->th.th_local.bget_data != NULL) { __kmp_finalize_bget(thread); @@ -6683,6 +6733,13 @@ static inline char *__kmp_reg_status_name() { #endif } // __kmp_reg_status_get +#if defined(KMP_USE_SHM) +bool __kmp_shm_available = false; +bool __kmp_tmp_available = false; +// If /dev/shm is not accessible, we will create a temporary file under /tmp. +char *temp_reg_status_file_name = nullptr; +#endif + void __kmp_register_library_startup(void) { char *name = __kmp_reg_status_name(); // Name of the environment variable. @@ -6708,52 +6765,108 @@ void __kmp_register_library_startup(void) { char *value = NULL; // Actual value of the environment variable. #if defined(KMP_USE_SHM) - char *shm_name = __kmp_str_format("/%s", name); - int shm_preexist = 0; - char *data1; - int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); - if ((fd1 == -1) && (errno == EEXIST)) { - // file didn't open because it already exists. - // try opening existing file - fd1 = shm_open(shm_name, O_RDWR, 0666); - if (fd1 == -1) { // file didn't open - // error out here - __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), - __kmp_msg_null); - } else { - // able to open existing file - shm_preexist = 1; + char *shm_name = nullptr; + char *data1 = nullptr; + __kmp_shm_available = __kmp_detect_shm(); + if (__kmp_shm_available) { + int fd1 = -1; + shm_name = __kmp_str_format("/%s", name); + int shm_preexist = 0; + fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600); + if ((fd1 == -1) && (errno == EEXIST)) { + // file didn't open because it already exists. + // try opening existing file + fd1 = shm_open(shm_name, O_RDWR, 0600); + if (fd1 == -1) { // file didn't open + KMP_WARNING(FunctionError, "Can't open SHM"); + __kmp_shm_available = false; + } else { // able to open existing file + shm_preexist = 1; + } } - } else if (fd1 == -1) { // SHM didn't open; it was due to error other than - // already exists. - // error out here. 
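The registration logic above is built on the standard POSIX create-or-attach idiom: attempt an exclusive create, and on EEXIST attach to the segment that another process created first. A self-contained sketch of just that idiom (helper name hypothetical; on older Linux systems shm_open requires linking with -lrt):

    #include <errno.h>
    #include <fcntl.h>    // O_CREAT, O_EXCL, O_RDWR
    #include <sys/mman.h> // shm_open

    // Returns an fd for the named segment. *preexist is set to 1 when another
    // process created the segment first, 0 when this call created it; the
    // return value is -1 on any other failure.
    static int open_registration_segment(const char *shm_name, int *preexist) {
      *preexist = 0;
      int fd = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
      if (fd == -1 && errno == EEXIST) { // lost the creation race
        fd = shm_open(shm_name, O_RDWR, 0600);
        if (fd != -1)
          *preexist = 1;
      }
      return fd;
    }

The preexist flag is what lets the caller decide whether to size and initialize the segment (ftruncate plus writing the registration string) or to read the string some earlier instance stored there.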
- __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
- __kmp_msg_null);
- }
- if (shm_preexist == 0) {
- // we created SHM now set size
- if (ftruncate(fd1, SHM_SIZE) == -1) {
- // error occured setting size;
- __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
- KMP_ERR(errno), __kmp_msg_null);
+ if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
+ if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
+ KMP_WARNING(FunctionError, "Can't set size of SHM");
+ __kmp_shm_available = false;
+ }
}
+ if (__kmp_shm_available) { // SHM exists, now map it
+ data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd1, 0);
+ if (data1 == MAP_FAILED) { // failed to map shared memory
+ KMP_WARNING(FunctionError, "Can't map SHM");
+ __kmp_shm_available = false;
+ }
+ }
+ if (__kmp_shm_available) { // SHM mapped
+ if (shm_preexist == 0) { // set data to SHM, set value
+ KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
+ }
+ // Read value from either what we just wrote or existing file.
+ value = __kmp_str_format("%s", data1); // read value from SHM
+ munmap(data1, SHM_SIZE);
+ }
+ if (fd1 != -1)
+ close(fd1);
+ }
+ if (!__kmp_shm_available)
+ __kmp_tmp_available = __kmp_detect_tmp();
+ if (!__kmp_shm_available && __kmp_tmp_available) {
+ // SHM failed to work due to an error other than that the file already
+ // exists. Try to create a temp file under /tmp.
+ // If /tmp isn't accessible, fall back to using environment variable.
+ // TODO: /tmp might not always be the temporary directory. For now we will
+ // not consider TMPDIR.
+ int fd1 = -1;
+ temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
+ int tmp_preexist = 0;
+ fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
+ if ((fd1 == -1) && (errno == EEXIST)) {
+ // file didn't open because it already exists.
+ // try opening existing file
+ fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
+ if (fd1 == -1) { // file didn't open
+ KMP_WARNING(FunctionError, "Can't open TEMP");
+ __kmp_tmp_available = false;
+ } else {
+ tmp_preexist = 1;
+ }
+ }
+ if (__kmp_tmp_available && tmp_preexist == 0) {
+ // we created /tmp file now set size
+ if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
+ KMP_WARNING(FunctionError, "Can't set size of /tmp file");
+ __kmp_tmp_available = false;
+ }
+ }
+ if (__kmp_tmp_available) {
+ data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd1, 0);
+ if (data1 == MAP_FAILED) { // failed to map /tmp
+ KMP_WARNING(FunctionError, "Can't map /tmp");
+ __kmp_tmp_available = false;
+ }
+ }
+ if (__kmp_tmp_available) {
+ if (tmp_preexist == 0) { // set data to TMP, set value
+ KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
+ }
+ // Read value from either what we just wrote or existing file.
+ value = __kmp_str_format("%s", data1); // read value from temp file
+ munmap(data1, SHM_SIZE);
+ }
+ if (fd1 != -1)
+ close(fd1);
}
- data1 =
- (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
- if (data1 == MAP_FAILED) {
- // failed to map shared memory
- __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
- __kmp_msg_null);
- }
- if (shm_preexist == 0) { // set data to SHM, set value
- KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
+ if (!__kmp_shm_available && !__kmp_tmp_available) {
+ // no /dev/shm and no /tmp -- fall back to environment variable
+ // Set environment variable, but do not overwrite if it exists.
+ __kmp_env_set(name, __kmp_registration_str, 0);
+ // read value to see if it got set
+ value = __kmp_env_get(name);
}
- // Read value from either what we just wrote or existing file.
- value = __kmp_str_format("%s", data1); // read value from SHM
- munmap(data1, SHM_SIZE);
- close(fd1);
#else // Windows and unix with static library
- // Set environment variable, but do not overwrite if it is exist.
+ // Set environment variable, but do not overwrite if it exists.
__kmp_env_set(name, __kmp_registration_str, 0);
// read value to see if it got set
value = __kmp_env_get(name);
@@ -6813,8 +6926,14 @@ void __kmp_register_library_startup(void) {
case 2: { // Neighbor is dead.
#if defined(KMP_USE_SHM)
- // close shared memory.
- shm_unlink(shm_name); // this removes file in /dev/shm
+ if (__kmp_shm_available) { // close shared memory.
+ shm_unlink(shm_name); // this removes file in /dev/shm
+ } else if (__kmp_tmp_available) {
+ unlink(temp_reg_status_file_name); // this removes the temp file
+ } else {
+ // Clear the variable and try to register library again.
+ __kmp_env_unset(name);
+ }
#else
// Clear the variable and try to register library again.
__kmp_env_unset(name);
@@ -6827,7 +6946,8 @@
}
KMP_INTERNAL_FREE((void *)value);
#if defined(KMP_USE_SHM)
- KMP_INTERNAL_FREE((void *)shm_name);
+ if (shm_name)
+ KMP_INTERNAL_FREE((void *)shm_name);
#endif
} // while
KMP_INTERNAL_FREE((void *)name);
@@ -6840,18 +6960,32 @@ void __kmp_unregister_library(void) {
char *value = NULL;
#if defined(KMP_USE_SHM)
- char *shm_name = __kmp_str_format("/%s", name);
- int fd1 = shm_open(shm_name, O_RDONLY, 0666);
- if (fd1 == -1) {
- // file did not open. return.
- return;
- }
- char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
- if (data1 != MAP_FAILED) {
- value = __kmp_str_format("%s", data1); // read value from SHM
- munmap(data1, SHM_SIZE);
+ char *shm_name = nullptr;
+ int fd1;
+ if (__kmp_shm_available) {
+ shm_name = __kmp_str_format("/%s", name);
+ fd1 = shm_open(shm_name, O_RDONLY, 0600);
+ if (fd1 != -1) { // File opened successfully
+ char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
+ if (data1 != MAP_FAILED) {
+ value = __kmp_str_format("%s", data1); // read value from SHM
+ munmap(data1, SHM_SIZE);
+ }
+ close(fd1);
+ }
+ } else if (__kmp_tmp_available) { // try /tmp
+ fd1 = open(temp_reg_status_file_name, O_RDONLY);
+ if (fd1 != -1) { // File opened successfully
+ char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
+ if (data1 != MAP_FAILED) {
+ value = __kmp_str_format("%s", data1); // read value from /tmp
+ munmap(data1, SHM_SIZE);
+ }
+ close(fd1);
+ }
+ } else { // fall back to environment variable
+ value = __kmp_env_get(name);
}
- close(fd1);
#else
value = __kmp_env_get(name);
#endif
@@ -6861,14 +6995,23 @@ void __kmp_unregister_library(void) {
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
// Ok, this is our variable. Delete it.
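Both the dead-neighbor check and the unregister path above depend on reading the registration string back from whichever backing store was used. A minimal sketch of the read-only mapping step for the shared-memory case; SHM_SIZE here is an assumption standing in for the runtime's segment size, and the helper name is hypothetical:

    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define SHM_SIZE 1024 // assumed; must match the size used at registration

    // Map the segment read-only, copy out the NUL-terminated registration
    // string, unmap, and hand back a heap copy (caller frees), or NULL.
    static char *read_registration(const char *shm_name) {
      char *value = NULL;
      int fd = shm_open(shm_name, O_RDONLY, 0600);
      if (fd == -1)
        return NULL;
      char *data = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd, 0);
      if (data != MAP_FAILED) {
        value = strdup(data);
        munmap(data, SHM_SIZE);
      }
      close(fd);
      return value;
    }

Comparing the returned string with __kmp_registration_str is what distinguishes this process's registration from one left behind by a dead neighbor.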
#if defined(KMP_USE_SHM) - shm_unlink(shm_name); // this removes file in /dev/shm + if (__kmp_shm_available) { + shm_unlink(shm_name); // this removes file in /dev/shm + } else if (__kmp_tmp_available) { + unlink(temp_reg_status_file_name); // this removes the temp file + } else { + __kmp_env_unset(name); + } #else __kmp_env_unset(name); #endif } #if defined(KMP_USE_SHM) - KMP_INTERNAL_FREE(shm_name); + if (shm_name) + KMP_INTERNAL_FREE(shm_name); + if (temp_reg_status_file_name) + KMP_INTERNAL_FREE(temp_reg_status_file_name); #endif KMP_INTERNAL_FREE(__kmp_registration_str); @@ -6967,6 +7110,11 @@ static void __kmp_do_serial_initialize(void) { __kmp_validate_locks(); +#if ENABLE_LIBOMPTARGET + /* Initialize functions from libomptarget */ + __kmp_init_omptarget(); +#endif + /* Initialize internal memory allocator */ __kmp_init_allocator(); @@ -7192,10 +7340,12 @@ static void __kmp_do_serial_initialize(void) { __kmp_register_atfork(); #endif -#if !KMP_DYNAMIC_LIB +#if !KMP_DYNAMIC_LIB || \ + ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN) { /* Invoke the exit handler when the program finishes, only for static - library. For dynamic library, we already have _fini and DllMain. */ + library and macOS* dynamic. For other dynamic libraries, we already + have _fini and DllMain. */ int rc = atexit(__kmp_internal_end_atexit); if (rc != 0) { __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), @@ -7222,6 +7372,10 @@ static void __kmp_do_serial_initialize(void) { __kmp_init_serial = TRUE; + if (__kmp_version) { + __kmp_print_version_1(); + } + if (__kmp_settings) { __kmp_env_print(); } @@ -7275,7 +7429,7 @@ static void __kmp_do_middle_initialize(void) { #if KMP_AFFINITY_SUPPORTED // __kmp_affinity_initialize() will try to set __kmp_ncores to the // number of cores on the machine. - __kmp_affinity_initialize(); + __kmp_affinity_initialize(__kmp_affinity); #endif /* KMP_AFFINITY_SUPPORTED */ @@ -7461,6 +7615,14 @@ void __kmp_hidden_helper_initialize() { return; } +#if KMP_AFFINITY_SUPPORTED + // Initialize hidden helper affinity settings. + // The above __kmp_parallel_initialize() will initialize + // regular affinity (and topology) if not already done. + if (!__kmp_hh_affinity.flags.initialized) + __kmp_affinity_initialize(__kmp_hh_affinity); +#endif + // Set the count of hidden helper tasks to be executed to zero KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); @@ -7583,7 +7745,7 @@ int __kmp_invoke_task_func(int gtid) { ); #if OMPT_SUPPORT *exit_frame_p = NULL; - this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; + this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team; #endif #if KMP_STATS_ENABLED @@ -7681,7 +7843,7 @@ int __kmp_invoke_teams_master(int gtid) { #endif __kmp_teams_master(gtid); #if OMPT_SUPPORT - this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; + this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league; #endif __kmp_run_after_invoked_task(gtid, 0, this_thr, team); return 1; @@ -7691,7 +7853,6 @@ int __kmp_invoke_teams_master(int gtid) { encountered by this team. 
since this should be enclosed in the forkjoin critical section it should avoid race conditions with asymmetrical nested parallelism */ - void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { kmp_info_t *thr = __kmp_threads[gtid]; @@ -7699,6 +7860,39 @@ void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { thr->th.th_set_nproc = num_threads; } +void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length, + int *num_threads_list) { + kmp_info_t *thr = __kmp_threads[gtid]; + + KMP_DEBUG_ASSERT(list_length > 1); + + if (num_threads_list[0] > 0) + thr->th.th_set_nproc = num_threads_list[0]; + thr->th.th_set_nested_nth = + (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int)); + for (kmp_uint32 i = 0; i < list_length; ++i) + thr->th.th_set_nested_nth[i] = num_threads_list[i]; + thr->th.th_set_nested_nth_sz = list_length; +} + +void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev, + const char *msg) { + kmp_info_t *thr = __kmp_threads[gtid]; + thr->th.th_nt_strict = true; + thr->th.th_nt_loc = loc; + // if sev is unset make fatal + if (sev == severity_warning) + thr->th.th_nt_sev = sev; + else + thr->th.th_nt_sev = severity_fatal; + // if msg is unset, use an appropriate message + if (msg) + thr->th.th_nt_msg = msg; + else + thr->th.th_nt_msg = "Cannot form team with number of threads specified by " + "strict num_threads clause."; +} + static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, int num_threads) { KMP_DEBUG_ASSERT(thr); @@ -7932,8 +8126,10 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { __kmp_join_barrier(gtid); /* wait for everyone */ #if OMPT_SUPPORT + ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state; if (ompt_enabled.enabled && - this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { + (ompt_state == ompt_state_wait_barrier_teams || + ompt_state == ompt_state_wait_barrier_implicit_parallel)) { int ds_tid = this_thr->th.th_info.ds.ds_tid; ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); this_thr->th.ompt_thread_info.state = ompt_state_overhead; @@ -7944,15 +8140,16 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { ompt_callbacks.ompt_callback(ompt_callback_sync_region))) codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; + ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; + if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) + sync_kind = ompt_sync_region_barrier_teams; if (ompt_enabled.ompt_callback_sync_region_wait) { ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( - ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, - codeptr); + sync_kind, ompt_scope_end, NULL, task_data, codeptr); } if (ompt_enabled.ompt_callback_sync_region) { ompt_callbacks.ompt_callback(ompt_callback_sync_region)( - ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, - codeptr); + sync_kind, ompt_scope_end, NULL, task_data, codeptr); } #endif if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { @@ -8155,6 +8352,7 @@ void __kmp_cleanup(void) { __kmp_nested_nth.nth = NULL; __kmp_nested_nth.size = 0; __kmp_nested_nth.used = 0; + KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); __kmp_nested_proc_bind.bind_types = NULL; __kmp_nested_proc_bind.size = 0; @@ -8652,9 +8850,8 @@ void __kmp_aux_display_affinity(int gtid, const char *format) { } /* ------------------------------------------------------------------------ */ - void 
__kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
- int blocktime = arg; /* argument is in milliseconds */
+ int blocktime = arg; /* argument is in microseconds */
#if KMP_USE_MONITOR
int bt_intervals;
#endif
@@ -8730,7 +8927,6 @@ __kmp_determine_reduction_method(
int team_size;

- KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
@@ -8751,10 +8947,12 @@
int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
- KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
+ KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
+ KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
int teamsize_cutoff = 4;
@@ -8778,11 +8976,15 @@
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
- // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
+ // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
+ // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
-#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
+#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
+ KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
-#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
+#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
+ KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
+ KMP_OS_WASI || KMP_OS_AIX
// basic tuning
@@ -8930,7 +9132,8 @@ int __kmp_pause_resource(kmp_pause_status_t level) {
__kmp_soft_pause();
return 0;
}
- } else if (level == kmp_hard_paused) { // requesting hard pause
+ } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) {
+ // requesting hard pause or stop_tool pause
if (__kmp_pause_status != kmp_not_paused) {
// error message about already being paused
return 1;
@@ -9018,8 +9221,8 @@ void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
// to wake it up.
for (int f = 1; f < new_nthreads; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
- KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
- 3);
+ (void)KMP_COMPARE_AND_STORE_ACQ32(
+ &(team->t.t_threads[f]->th.th_used_in_team), 0, 3);
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
__kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
(kmp_flag_32<false, false> *)NULL);
@@ -9181,3 +9384,20 @@ void __kmp_set_nesting_mode_threads() {
if (__kmp_nesting_mode == 1) // turn on nesting for this case only
set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}
+
+// Empty symbols to export (see exports_so.txt) when feature is disabled
+extern "C" {
+#if !KMP_STATS_ENABLED
+void __kmp_reset_stats() {}
+#endif
+#if !USE_DEBUGGER
+int __kmp_omp_debug_struct_info = FALSE;
+int __kmp_debugging = FALSE;
+#endif
+#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
+void __kmp_itt_fini_ittlib() {}
+void __kmp_itt_init_ittlib() {}
+#endif
+}
+
+// end of file
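A closing note on the reduction-method hunk above: each architecture/OS branch reduces to the same shape of decision, a team-size cutoff that picks between the atomic and tree reduction paths, with the critical-section method as the conservative fallback. A schematic of that decision, with hypothetical enum and function names (the cutoff of 4 is the value visible in the 64-bit branch above):

    enum reduction_method { critical_reduce, atomic_reduce, tree_reduce };

    // Schematic of the size-based selection: small teams take the atomic
    // path when the compiler generated one; larger teams switch to a tree.
    static reduction_method pick_reduction(int team_size, bool atomic_available,
                                           bool tree_available) {
      const int teamsize_cutoff = 4;
      if (team_size <= teamsize_cutoff)
        return atomic_available ? atomic_reduce : critical_reduce;
      return tree_available ? tree_reduce : critical_reduce;
    }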