| author | thegeorg <[email protected]> | 2022-06-03 10:53:07 +0300 | 
|---|---|---|
| committer | thegeorg <[email protected]> | 2022-06-03 10:53:07 +0300 | 
| commit | a1d4361e379e2c72a469ad1bd64569cbc2db131f (patch) | |
| tree | 0caddb240a10132376e4653a31578e117d33f9fd /contrib/libs/cxxsupp/openmp/kmp_runtime.cpp | |
| parent | 41f55a521834080d9d703c099c0418cfff3a0546 (diff) | |
Update contrib/libs/cxxsupp/openmp to 14.0.4
ref:77c6cdda99b217d50c4deadca11f5611fa0dc168
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_runtime.cpp')
| -rw-r--r-- | contrib/libs/cxxsupp/openmp/kmp_runtime.cpp | 430 | 
1 file changed, 371 insertions, 59 deletions
diff --git a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
index fe931bb157e..34f8a017438 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
@@ -107,6 +107,10 @@ static int __kmp_unregister_root_other_thread(int gtid);
 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
+void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
+                               int new_nthreads);
+void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
+
 /* Calculate the identifier of the current thread */
 /* fast (and somewhat portable) way to get unique identifier of executing
    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
@@ -910,7 +914,8 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
    assured that there are enough threads available, because we checked on that
    earlier within critical section forkjoin */
 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
-                                    kmp_info_t *master_th, int master_gtid) {
+                                    kmp_info_t *master_th, int master_gtid,
+                                    int fork_teams_workers) {
   int i;
   int use_hot_team;
@@ -999,7 +1004,12 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
     }
 #if KMP_AFFINITY_SUPPORTED
-    __kmp_partition_places(team);
+    // Do not partition the places list for teams construct workers who
+    // haven't actually been forked to do real work yet. This partitioning
+    // will take place in the parallel region nested within the teams construct.
+    if (!fork_teams_workers) {
+      __kmp_partition_places(team);
+    }
 #endif
   }
@@ -1204,7 +1214,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     this_thr->th.th_team = serial_team;
     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
-    KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
+    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                   this_thr->th.th_current_task));
     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
     this_thr->th.th_current_task->td_flags.executing = 0;
@@ -1563,15 +1573,24 @@ int __kmp_fork_call(ident_t *loc, int gtid,
       /* Change number of threads in the team if requested */
       if (master_set_numthreads) { // The parallel has num_threads clause
-        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
+        if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
           // AC: only can reduce number of threads dynamically, can't increase
           kmp_info_t **other_threads = parent_team->t.t_threads;
+          // NOTE: if using distributed barrier, we need to run this code block
+          // even when the team size appears not to have changed from the max.
+          int old_proc = master_th->th.th_teams_size.nth;
+          if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
+              bp_dist_bar) {
+            __kmp_resize_dist_barrier(parent_team, old_proc,
+                                      master_set_numthreads);
+            __kmp_add_threads_to_team(parent_team, master_set_numthreads);
+          }
           parent_team->t.t_nproc = master_set_numthreads;
           for (i = 0; i < master_set_numthreads; ++i) {
             other_threads[i]->th.th_team_nproc = master_set_numthreads;
           }
-          // Keep extra threads hot in the team for possible next parallels
         }
+        // Keep extra threads hot in the team for possible next parallels
         master_th->th.th_set_nproc = 0;
       }
@@ -1584,6 +1603,41 @@ int __kmp_fork_call(ident_t *loc, int gtid,
       }
 #endif
+      // Figure out the proc_bind policy for the nested parallel within teams
+      kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
+      // proc_bind_default means don't update
+      kmp_proc_bind_t proc_bind_icv = proc_bind_default;
+      if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
+        proc_bind = proc_bind_false;
+      } else {
+        // No proc_bind clause specified; use current proc-bind-var
+        if (proc_bind == proc_bind_default) {
+          proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
+        }
+        /* else: The proc_bind policy was specified explicitly on parallel
+           clause.
+           This overrides proc-bind-var for this parallel region, but does not
+           change proc-bind-var. */
+        // Figure the value of proc-bind-var for the child threads.
+        if ((level + 1 < __kmp_nested_proc_bind.used) &&
+            (__kmp_nested_proc_bind.bind_types[level + 1] !=
+             master_th->th.th_current_task->td_icvs.proc_bind)) {
+          proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+        }
+      }
+      KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
+      // Need to change the bind-var ICV to correct value for each implicit task
+      if (proc_bind_icv != proc_bind_default &&
+          master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
+        kmp_info_t **other_threads = parent_team->t.t_threads;
+        for (i = 0; i < master_th->th.th_team_nproc; ++i) {
+          other_threads[i]->th.th_current_task->td_icvs.proc_bind =
+              proc_bind_icv;
+        }
+      }
+      // Reset for next parallel region
+      master_th->th.th_set_proc_bind = proc_bind_default;
+
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
            KMP_ITT_DEBUG) &&
@@ -1600,6 +1654,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
       }
 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+#if KMP_AFFINITY_SUPPORTED
+      __kmp_partition_places(parent_team);
+#endif
       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
                     "master_th=%p, gtid=%d\n",
@@ -1635,6 +1692,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
     }
 #endif
+    // Need this to happen before we determine the number of threads, not while
+    // we are allocating the team
+    //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
     int enter_teams = 0;
     if (parent_team->t.t_active_level >=
         master_th->th.th_current_task->td_icvs.max_active_levels) {
@@ -1642,13 +1702,10 @@ int __kmp_fork_call(ident_t *loc, int gtid,
     } else {
       enter_teams = ((ap == NULL && active_level == 0) ||
                      (ap && teams_level > 0 && teams_level == level));
-      nthreads =
-          master_set_numthreads
-              ? master_set_numthreads
-              : get__nproc_2(
-                    parent_team,
-                    master_tid); // TODO: get nproc directly from current task
-
+      nthreads = master_set_numthreads
+                     ? master_set_numthreads
+                     // TODO: get nproc directly from current task
+                     : get__nproc_2(parent_team, master_tid);
       // Check if we need to take forkjoin lock? (no need for serialized
       // parallel out of teams construct). This code moved here from
       // __kmp_reserve_threads() to speedup nested serialized parallels.
@@ -1940,16 +1997,21 @@ int __kmp_fork_call(ident_t *loc, int gtid,
     // Figure out the proc_bind_policy for the new team.
     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
-    kmp_proc_bind_t proc_bind_icv =
-        proc_bind_default; // proc_bind_default means don't update
+    // proc_bind_default means don't update
+    kmp_proc_bind_t proc_bind_icv = proc_bind_default;
     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
       proc_bind = proc_bind_false;
     } else {
+      // No proc_bind clause specified; use current proc-bind-var for this
+      // parallel region
       if (proc_bind == proc_bind_default) {
-        // No proc_bind clause specified; use current proc-bind-var for this
-        // parallel region
         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
       }
+      // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
+      if (master_th->th.th_teams_microtask &&
+          microtask == (microtask_t)__kmp_teams_master) {
+        proc_bind = __kmp_teams_proc_bind;
+      }
       /* else: The proc_bind policy was specified explicitly on parallel clause.
          This overrides proc-bind-var for this parallel region, but does not
         change proc-bind-var. */
@@ -1957,7 +2019,11 @@ int __kmp_fork_call(ident_t *loc, int gtid,
       if ((level + 1 < __kmp_nested_proc_bind.used) &&
           (__kmp_nested_proc_bind.bind_types[level + 1] !=
            master_th->th.th_current_task->td_icvs.proc_bind)) {
-        proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+        // Do not modify the proc bind icv for the two teams construct forks
+        // They just let the proc bind icv pass through
+        if (!master_th->th.th_teams_microtask ||
+            !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
+          proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
       }
     }
@@ -1983,6 +2049,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
 #endif
                                  proc_bind, &new_icvs,
                                  argc USE_NESTED_HOT_ARG(master_th));
+      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
+        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
     } else {
       /* allocate a new parallel team */
       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
@@ -1993,6 +2061,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
                                  proc_bind,
                                  &master_th->th.th_current_task->td_icvs,
                                  argc USE_NESTED_HOT_ARG(master_th));
+      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
+        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
+                  &master_th->th.th_current_task->td_icvs);
     }
     KF_TRACE(
         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
@@ -2124,7 +2195,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
       root->r.r_active = TRUE;
-    __kmp_fork_team_threads(root, team, master_th, gtid);
+    __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
     __kmp_setup_icv_copy(team, nthreads,
                          &master_th->th.th_current_task->td_icvs, loc);
@@ -2359,6 +2430,12 @@ void __kmp_join_call(ident_t *loc, int gtid
       parent_team->t.t_stack_id = NULL;
     }
 #endif
+
+    if (team->t.t_nproc > 1 &&
+        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+      team->t.b->update_num_threads(team->t.t_nproc);
+      __kmp_add_threads_to_team(team, team->t.t_nproc);
+    }
   }
   KMP_MB();
@@ -2387,6 +2464,14 @@ void __kmp_join_call(ident_t *loc, int gtid
   } // active_level == 1
 #endif /* USE_ITT_BUILD */
+#if KMP_AFFINITY_SUPPORTED
+  if (!exit_teams) {
+    // Restore master thread's partition.
+    master_th->th.th_first_place = team->t.t_first_place;
+    master_th->th.th_last_place = team->t.t_last_place;
+  }
+#endif // KMP_AFFINITY_SUPPORTED
+
   if (master_th->th.th_teams_microtask && !exit_teams &&
       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
       team->t.t_level == master_th->th.th_teams_level + 1) {
@@ -2494,11 +2579,6 @@ void __kmp_join_call(ident_t *loc, int gtid
                 master_th, team));
   __kmp_pop_current_task_from_thread(master_th);
-#if KMP_AFFINITY_SUPPORTED
-  // Restore master thread's partition.
-  master_th->th.th_first_place = team->t.t_first_place;
-  master_th->th.th_last_place = team->t.t_last_place;
-#endif // KMP_AFFINITY_SUPPORTED
   master_th->th.th_def_allocator = team->t.t_def_allocator;
 #if OMPD_SUPPORT
@@ -2646,6 +2726,9 @@ void __kmp_set_num_threads(int new_nth, int gtid) {
     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+    if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+      __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
+    }
     // Release the extra threads we don't need any more.
     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
@@ -2665,6 +2748,11 @@ void __kmp_set_num_threads(int new_nth, int gtid) {
     }
 #endif
+    if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+      hot_team->t.b->update_num_threads(new_nth);
+      __kmp_add_threads_to_team(hot_team, new_nth);
+    }
+
     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
     // Update the t_nproc field in the threads that are still active.
@@ -4018,7 +4106,8 @@ void __kmp_unregister_root_current_thread(int gtid) {
   kmp_task_team_t *task_team = thread->th.th_task_team;
   // we need to wait for the proxy tasks before finishing the thread
-  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
+  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
+                            task_team->tt.tt_hidden_helper_task_encountered)) {
 #if OMPT_SUPPORT
     // the runtime is shutting down so we won't report any events
     thread->th.ompt_thread_info.state = ompt_state_undefined;
@@ -4112,7 +4201,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
   this_thr->th.th_team_nproc = team->t.t_nproc;
   this_thr->th.th_team_master = master;
   this_thr->th.th_team_serialized = team->t.t_serialized;
-  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
@@ -4281,6 +4369,12 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
     new_thr->th.th_task_state_top = 0;
     new_thr->th.th_task_state_stack_sz = 4;
+    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+      // Make sure pool thread has transitioned to waiting on own thread struct
+      KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
+      // Thread activated in __kmp_allocate_team when increasing team size
+    }
+
 #ifdef KMP_ADJUST_BLOCKTIME
     /* Adjust blocktime back to zero if necessary */
     /* Middle initialization might not have occurred yet */
@@ -4448,6 +4542,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
     balign[b].bb.use_oncore_barrier = 0;
   }
+  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
+  new_thr->th.th_sleep_loc_type = flag_unset;
+
   new_thr->th.th_spin_here = FALSE;
   new_thr->th.th_next_waiting = 0;
 #if KMP_OS_UNIX
@@ -4976,6 +5073,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
   kmp_team_t *team;
   int use_hot_team = !root->r.r_active;
   int level = 0;
+  int do_place_partition = 1;
   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
@@ -4997,6 +5095,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
         ++level; // not increment if #teams==1, or for outer fork of the teams;
         // increment otherwise
       }
+      // Do not perform the place partition if inner fork of the teams
+      // Wait until nested parallel region encountered inside teams construct
+      if ((master->th.th_teams_size.nteams == 1 &&
+           master->th.th_teams_level >= team->t.t_level) ||
+          (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
+        do_place_partition = 0;
     }
     hot_teams = master->th.th_hot_teams;
     if (level < __kmp_hot_teams_max_level && hot_teams &&
@@ -5027,6 +5131,17 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
     }
 #endif
+    if (team->t.t_nproc != new_nproc &&
+        __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+      // Distributed barrier may need a resize
+      int old_nthr = team->t.t_nproc;
+      __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
+    }
+
+    // If not doing the place partition, then reset the team's proc bind
+    // to indicate that partitioning of all threads still needs to take place
+    if (do_place_partition == 0)
+      team->t.t_proc_bind = proc_bind_default;
     // Has the number of threads changed?
     /* Let's assume the most common case is that the number of threads is
        unchanged, and put that case first. */
@@ -5056,16 +5171,20 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
       if ((team->t.t_size_changed == 0) &&
           (team->t.t_proc_bind == new_proc_bind)) {
         if (new_proc_bind == proc_bind_spread) {
-          __kmp_partition_places(
-              team, 1); // add flag to update only master for spread
+          if (do_place_partition) {
+            // add flag to update only master for spread
+            __kmp_partition_places(team, 1);
+          }
         }
         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
                        "proc_bind = %d, partition = [%d,%d]\n",
                        team->t.t_id, new_proc_bind, team->t.t_first_place,
                        team->t.t_last_place));
       } else {
-        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
-        __kmp_partition_places(team);
+        if (do_place_partition) {
+          KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+          __kmp_partition_places(team);
+        }
       }
 #else
       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
@@ -5076,6 +5195,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
                 new_nproc));
       team->t.t_size_changed = 1;
+      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+        // Barrier size already reduced earlier in this function
+        // Activate team threads via th_used_in_team
+        __kmp_add_threads_to_team(team, new_nproc);
+      }
 #if KMP_NESTED_HOT_TEAMS
       if (__kmp_hot_teams_mode == 0) {
         // AC: saved number of threads should correspond to team's value in this
@@ -5137,10 +5261,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
       }
 #endif
-      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+      if (do_place_partition) {
+        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
 #if KMP_AFFINITY_SUPPORTED
-      __kmp_partition_places(team);
+        __kmp_partition_places(team);
 #endif
+      }
     } else { // team->t.t_nproc < new_nproc
 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
       kmp_affin_mask_t *old_mask;
@@ -5152,7 +5278,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
       KA_TRACE(20,
                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
                 new_nproc));
-
+      int old_nproc = team->t.t_nproc; // save old value and use to update only
       team->t.t_size_changed = 1;
 #if KMP_NESTED_HOT_TEAMS
@@ -5179,10 +5305,9 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
         team->t.t_nproc = new_nproc; // just get reserved threads involved
       } else {
-        // we may have some threads in reserve, but not enough
-        team->t.t_nproc =
-            hot_teams[level]
-                .hot_team_nth; // get reserved threads involved if any
+        // We may have some threads in reserve, but not enough;
+        // get reserved threads involved if any.
+        team->t.t_nproc = hot_teams[level].hot_team_nth;
         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
 #endif // KMP_NESTED_HOT_TEAMS
         if (team->t.t_max_nproc < new_nproc) {
@@ -5237,8 +5362,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
 #if KMP_NESTED_HOT_TEAMS
       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
 #endif // KMP_NESTED_HOT_TEAMS
+      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+        // Barrier size already increased earlier in this function
+        // Activate team threads via th_used_in_team
+        __kmp_add_threads_to_team(team, new_nproc);
+      }
       /* make sure everyone is syncronized */
-      int old_nproc = team->t.t_nproc; // save old value and use to update only
       // new threads below
       __kmp_initialize_team(team, new_nproc, new_icvs,
                             root->r.r_uber_thread->th.th_ident);
@@ -5273,10 +5402,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
       }
 #endif
-      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+      if (do_place_partition) {
+        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
 #if KMP_AFFINITY_SUPPORTED
-      __kmp_partition_places(team);
+        __kmp_partition_places(team);
 #endif
+      }
     } // Check changes in number of threads
     kmp_info_t *master = team->t.t_threads[0];
@@ -5342,6 +5473,13 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
       /* take this team from the team pool */
       __kmp_team_pool = team->t.t_next_pool;
+      if (max_nproc > 1 &&
+          __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+        if (!team->t.b) { // Allocate barrier structure
+          team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
+        }
+      }
+
       /* setup the team for fresh use */
       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
@@ -5397,6 +5535,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
   /* and set it up */
   team->t.t_max_nproc = max_nproc;
+  if (max_nproc > 1 &&
+      __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+    // Allocate barrier structure
+    team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
+  }
+
   /* NOTE well, for some reason allocating one big buffer and dividing it up
      seems to really hurt performance a lot on the P4, so, let's not use this */
   __kmp_allocate_team_arrays(team, max_nproc);
@@ -5469,7 +5613,6 @@ void __kmp_free_team(kmp_root_t *root,
   int use_hot_team = team == root->r.r_hot_team;
 #if KMP_NESTED_HOT_TEAMS
   int level;
-  kmp_hot_team_ptr_t *hot_teams;
   if (master) {
     level = team->t.t_active_level - 1;
     if (master->th.th_teams_microtask) { // in teams construct?
@@ -5483,7 +5626,9 @@ void __kmp_free_team(kmp_root_t *root,
         // team_of_workers before the parallel
       } // team->t.t_level will be increased inside parallel
     }
-    hot_teams = master->th.th_hot_teams;
+#if KMP_DEBUG
+    kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
+#endif
     if (level < __kmp_hot_teams_max_level) {
       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
      use_hot_team = 1;
@@ -5553,10 +5698,43 @@ void __kmp_free_team(kmp_root_t *root,
     /* free the worker threads */
     for (f = 1; f < team->t.t_nproc; ++f) {
       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
+      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+        KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
+                                    1, 2);
+      }
       __kmp_free_thread(team->t.t_threads[f]);
+    }
+
+    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+      if (team->t.b) {
+        // wake up thread at old location
+        team->t.b->go_release();
+        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+          for (f = 1; f < team->t.t_nproc; ++f) {
+            if (team->t.b->sleep[f].sleep) {
+              __kmp_atomic_resume_64(
+                  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
+                  (kmp_atomic_flag_64<> *)NULL);
+            }
+          }
+        }
+        // Wait for threads to be removed from team
+        for (int f = 1; f < team->t.t_nproc; ++f) {
+          while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
+            KMP_CPU_PAUSE();
+        }
+      }
+    }
+
+    for (f = 1; f < team->t.t_nproc; ++f) {
       team->t.t_threads[f] = NULL;
     }
+    if (team->t.t_max_nproc > 1 &&
+        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+      distributedBarrier::deallocate(team->t.b);
+      team->t.b = NULL;
+    }
     /* put the team back in the team pool */
     /* TODO limit size of team pool, call reap_team if pool too large */
     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
@@ -5955,11 +6133,18 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
       KA_TRACE(
           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
               gtid));
-      /* Need release fence here to prevent seg faults for tree forkjoin barrier
-       * (GEH) */
-      kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
-                         thread);
-      __kmp_release_64(&flag);
+      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+        while (
+            !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
+          KMP_CPU_PAUSE();
+        __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
+      } else {
+        /* Need release fence here to prevent seg faults for tree forkjoin
+           barrier (GEH) */
+        kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
+                           thread);
+        __kmp_release_64(&flag);
+      }
     }
     // Terminate OS thread.
@@ -6054,6 +6239,31 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
 } // __kmp_reap_thread
+static void __kmp_itthash_clean(kmp_info_t *th) {
+#if USE_ITT_NOTIFY
+  if (__kmp_itt_region_domains.count > 0) {
+    for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
+      kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
+      while (bucket) {
+        kmp_itthash_entry_t *next = bucket->next_in_bucket;
+        __kmp_thread_free(th, bucket);
+        bucket = next;
+      }
+    }
+  }
+  if (__kmp_itt_barrier_domains.count > 0) {
+    for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
+      kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
+      while (bucket) {
+        kmp_itthash_entry_t *next = bucket->next_in_bucket;
+        __kmp_thread_free(th, bucket);
+        bucket = next;
+      }
+    }
+  }
+#endif
+}
+
 static void __kmp_internal_end(void) {
   int i;
@@ -6240,6 +6450,7 @@ void __kmp_internal_end_library(int gtid_req) {
                   gtid));
         return;
       } else {
+        __kmp_itthash_clean(__kmp_threads[gtid]);
         KA_TRACE(
             10,
             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
@@ -6486,7 +6697,7 @@ void __kmp_register_library_startup(void) {
     char *value = NULL; // Actual value of the environment variable.
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
     char *shm_name = __kmp_str_format("/%s", name);
     int shm_preexist = 0;
     char *data1;
@@ -6591,7 +6802,7 @@ void __kmp_register_library_startup(void) {
       } break;
       case 2: { // Neighbor is dead.
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
         // close shared memory.
         shm_unlink(shm_name); // this removes file in /dev/shm
 #else
@@ -6605,7 +6816,7 @@ void __kmp_register_library_startup(void) {
       }
     }
     KMP_INTERNAL_FREE((void *)value);
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
     KMP_INTERNAL_FREE((void *)shm_name);
 #endif
   } // while
@@ -6618,7 +6829,7 @@ void __kmp_unregister_library(void) {
   char *name = __kmp_reg_status_name();
   char *value = NULL;
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
   char *shm_name = __kmp_str_format("/%s", name);
   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
   if (fd1 == -1) {
@@ -6639,14 +6850,14 @@ void __kmp_unregister_library(void) {
   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
 //  Ok, this is our variable. Delete it.
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
     shm_unlink(shm_name); // this removes file in /dev/shm
 #else
     __kmp_env_unset(name);
 #endif
   }
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
   KMP_INTERNAL_FREE(shm_name);
 #endif
@@ -6684,7 +6895,9 @@ static void __kmp_check_mic_type() {
 static void __kmp_user_level_mwait_init() {
   struct kmp_cpuid buf;
   __kmp_x86_cpuid(7, 0, &buf);
-  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
+  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
+  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
+  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
                 __kmp_umwait_enabled));
 }
@@ -6844,8 +7057,8 @@ static void __kmp_do_serial_initialize(void) {
 #if KMP_FAST_REDUCTION_BARRIER
 #define kmp_reduction_barrier_gather_bb ((int)1)
 #define kmp_reduction_barrier_release_bb ((int)1)
-#define kmp_reduction_barrier_gather_pat bp_hyper_bar
-#define kmp_reduction_barrier_release_pat bp_hyper_bar
+#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
+#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
 #endif // KMP_FAST_REDUCTION_BARRIER
   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
@@ -7500,6 +7713,11 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
       num_threads = 1;
     }
   } else {
+    if (num_threads < 0) {
+      __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
+                __kmp_msg_null);
+      num_threads = 1;
+    }
     // This thread will be the primary thread of the league primary threads
     // Store new thread limit; old limit is saved in th_cg_roots list
     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
@@ -7531,9 +7749,13 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
                           int num_threads) {
   kmp_info_t *thr = __kmp_threads[gtid];
-  KMP_DEBUG_ASSERT(num_teams >= 0);
-  KMP_DEBUG_ASSERT(num_threads >= 0);
-
+  if (num_teams < 0) {
+    // OpenMP specification requires requested values to be positive,
+    // but people can send us any value, so we'd better check
+    __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
+              __kmp_msg_null);
+    num_teams = 1;
+  }
   if (num_teams == 0) {
     if (__kmp_nteams > 0) {
       num_teams = __kmp_nteams;
@@ -7590,7 +7812,7 @@ void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
     num_teams = num_teams_ub;
   } else { // num_teams_lb <= num_teams <= num_teams_ub
-    if (num_threads == 0) {
+    if (num_threads <= 0) {
       if (num_teams_ub > __kmp_teams_max_nth) {
         num_teams = num_teams_lb;
       } else {
@@ -8702,6 +8924,96 @@ void __kmp_omp_display_env(int verbose) {
   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 }
+// The team size is changing, so distributed barrier must be modified
+void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
+                               int new_nthreads) {
+  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
+                   bp_dist_bar);
+  kmp_info_t **other_threads = team->t.t_threads;
+
+  // We want all the workers to stop waiting on the barrier while we adjust the
+  // size of the team.
+  for (int f = 1; f < old_nthreads; ++f) {
+    KMP_DEBUG_ASSERT(other_threads[f] != NULL);
+    // Ignore threads that are already inactive or not present in the team
+    if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
+      // teams construct causes thread_limit to get passed in, and some of
+      // those could be inactive; just ignore them
+      continue;
+    }
+    // If thread is transitioning still to in_use state, wait for it
+    if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
+      while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
+        KMP_CPU_PAUSE();
+    }
+    // The thread should be in_use now
+    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
+    // Transition to unused state
+    team->t.t_threads[f]->th.th_used_in_team.store(2);
+    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
+  }
+  // Release all the workers
+  kmp_uint64 new_value; // new value for go
+  new_value = team->t.b->go_release();
+
+  KMP_MFENCE();
+
+  // Workers should see transition status 2 and move to 0; but may need to be
+  // woken up first
+  size_t my_go_index;
+  int count = old_nthreads - 1;
+  while (count > 0) {
+    count = old_nthreads - 1;
+    for (int f = 1; f < old_nthreads; ++f) {
+      my_go_index = f / team->t.b->threads_per_go;
+      if (other_threads[f]->th.th_used_in_team.load() != 0) {
+        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
+          kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
+              void *, other_threads[f]->th.th_sleep_loc);
+          __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
+        }
+      } else {
+        KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
+        count--;
+      }
+    }
+  }
+  // Now update the barrier size
+  team->t.b->update_num_threads(new_nthreads);
+  team->t.b->go_reset();
+}
+
+void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
+  // Add the threads back to the team
+  KMP_DEBUG_ASSERT(team);
+  // Threads were paused and pointed at th_used_in_team temporarily during a
+  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
+  // the thread that it should transition itself back into the team. Then, if
+  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
+  // to wake it up.
+  for (int f = 1; f < new_nthreads; ++f) {
+    KMP_DEBUG_ASSERT(team->t.t_threads[f]);
+    KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
+                                3);
+    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
+      __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
+                      (kmp_flag_32<false, false> *)NULL);
+    }
+  }
+  // The threads should be transitioning to the team; when they are done, they
+  // should have set th_used_in_team to 1. This loop forces master to wait until
+  // all threads have moved into the team and are waiting in the barrier.
+  int count = new_nthreads - 1;
+  while (count > 0) {
+    count = new_nthreads - 1;
+    for (int f = 1; f < new_nthreads; ++f) {
+      if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
+        count--;
+      }
+    }
+  }
+}
+
 // Globals and functions for hidden helper task
 kmp_info_t **__kmp_hidden_helper_threads;
 kmp_info_t *__kmp_hidden_helper_main_thread;