author     thegeorg <[email protected]>  2022-06-03 10:53:07 +0300
committer  thegeorg <[email protected]>  2022-06-03 10:53:07 +0300
commit     a1d4361e379e2c72a469ad1bd64569cbc2db131f (patch)
tree       0caddb240a10132376e4653a31578e117d33f9fd /contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
parent     41f55a521834080d9d703c099c0418cfff3a0546 (diff)
Update contrib/libs/cxxsupp/openmp to 14.0.4
ref:77c6cdda99b217d50c4deadca11f5611fa0dc168
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_runtime.cpp')
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_runtime.cpp  430
1 file changed, 371 insertions(+), 59 deletions(-)
diff --git a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
index fe931bb157e..34f8a017438 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp
@@ -107,6 +107,10 @@ static int __kmp_unregister_root_other_thread(int gtid);
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
+void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
+ int new_nthreads);
+void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
+
/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
@@ -910,7 +914,8 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
assured that there are enough threads available, because we checked on that
earlier within critical section forkjoin */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
- kmp_info_t *master_th, int master_gtid) {
+ kmp_info_t *master_th, int master_gtid,
+ int fork_teams_workers) {
int i;
int use_hot_team;
@@ -999,7 +1004,12 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
}
#if KMP_AFFINITY_SUPPORTED
- __kmp_partition_places(team);
+ // Do not partition the places list for teams construct workers who
+ // haven't actually been forked to do real work yet. This partitioning
+ // will take place in the parallel region nested within the teams construct.
+ if (!fork_teams_workers) {
+ __kmp_partition_places(team);
+ }
#endif
}
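
The hunk above threads a new fork_teams_workers flag into __kmp_fork_team_threads so that workers forked for a teams construct keep the full place list; partitioning is deferred to the parallel region nested inside the construct. For illustration, a minimal user-level example (not part of this patch) of the pattern this change targets:

    #include <omp.h>
    #include <cstdio>

    int main() {
      // Workers forked for the teams construct are not yet partitioned.
    #pragma omp teams num_teams(2)
      {
        // Place partitioning happens here, when real work is forked.
    #pragma omp parallel num_threads(4) proc_bind(close)
        std::printf("team %d thread %d place %d\n", omp_get_team_num(),
                    omp_get_thread_num(), omp_get_place_num());
      }
      return 0;
    }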
@@ -1204,7 +1214,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
this_thr->th.th_team = serial_team;
serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
- KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
+ KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
this_thr->th.th_current_task));
KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
this_thr->th.th_current_task->td_flags.executing = 0;
@@ -1563,15 +1573,24 @@ int __kmp_fork_call(ident_t *loc, int gtid,
/* Change number of threads in the team if requested */
if (master_set_numthreads) { // The parallel has num_threads clause
- if (master_set_numthreads < master_th->th.th_teams_size.nth) {
+ if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
// AC: only can reduce number of threads dynamically, can't increase
kmp_info_t **other_threads = parent_team->t.t_threads;
+ // NOTE: if using distributed barrier, we need to run this code block
+ // even when the team size appears not to have changed from the max.
+ int old_proc = master_th->th.th_teams_size.nth;
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
+ bp_dist_bar) {
+ __kmp_resize_dist_barrier(parent_team, old_proc,
+ master_set_numthreads);
+ __kmp_add_threads_to_team(parent_team, master_set_numthreads);
+ }
parent_team->t.t_nproc = master_set_numthreads;
for (i = 0; i < master_set_numthreads; ++i) {
other_threads[i]->th.th_team_nproc = master_set_numthreads;
}
- // Keep extra threads hot in the team for possible next parallels
}
+ // Keep extra threads hot in the team for possible next parallels
master_th->th.th_set_nproc = 0;
}
@@ -1584,6 +1603,41 @@ int __kmp_fork_call(ident_t *loc, int gtid,
}
#endif
+ // Figure out the proc_bind policy for the nested parallel within teams
+ kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
+ // proc_bind_default means don't update
+ kmp_proc_bind_t proc_bind_icv = proc_bind_default;
+ if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
+ proc_bind = proc_bind_false;
+ } else {
+ // No proc_bind clause specified; use current proc-bind-var
+ if (proc_bind == proc_bind_default) {
+ proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
+ }
+ /* else: The proc_bind policy was specified explicitly on parallel
+ clause.
+ This overrides proc-bind-var for this parallel region, but does not
+ change proc-bind-var. */
+ // Figure the value of proc-bind-var for the child threads.
+ if ((level + 1 < __kmp_nested_proc_bind.used) &&
+ (__kmp_nested_proc_bind.bind_types[level + 1] !=
+ master_th->th.th_current_task->td_icvs.proc_bind)) {
+ proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+ }
+ }
+ KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
+ // Need to change the bind-var ICV to correct value for each implicit task
+ if (proc_bind_icv != proc_bind_default &&
+ master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
+ kmp_info_t **other_threads = parent_team->t.t_threads;
+ for (i = 0; i < master_th->th.th_team_nproc; ++i) {
+ other_threads[i]->th.th_current_task->td_icvs.proc_bind =
+ proc_bind_icv;
+ }
+ }
+ // Reset for next parallel region
+ master_th->th.th_set_proc_bind = proc_bind_default;
+
#if USE_ITT_BUILD && USE_ITT_NOTIFY
if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
KMP_ITT_DEBUG) &&
@@ -1600,6 +1654,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
}
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+#if KMP_AFFINITY_SUPPORTED
+ __kmp_partition_places(parent_team);
+#endif
KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
"master_th=%p, gtid=%d\n",
@@ -1635,6 +1692,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
}
#endif
+ // Need this to happen before we determine the number of threads, not while
+ // we are allocating the team
+ //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
int enter_teams = 0;
if (parent_team->t.t_active_level >=
master_th->th.th_current_task->td_icvs.max_active_levels) {
@@ -1642,13 +1702,10 @@ int __kmp_fork_call(ident_t *loc, int gtid,
} else {
enter_teams = ((ap == NULL && active_level == 0) ||
(ap && teams_level > 0 && teams_level == level));
- nthreads =
- master_set_numthreads
- ? master_set_numthreads
- : get__nproc_2(
- parent_team,
- master_tid); // TODO: get nproc directly from current task
-
+ nthreads = master_set_numthreads
+ ? master_set_numthreads
+ // TODO: get nproc directly from current task
+ : get__nproc_2(parent_team, master_tid);
// Check if we need to take the forkjoin lock (no need for serialized
// parallel out of teams construct). This code was moved here from
// __kmp_reserve_threads() to speed up nested serialized parallels.
@@ -1940,16 +1997,21 @@ int __kmp_fork_call(ident_t *loc, int gtid,
// Figure out the proc_bind_policy for the new team.
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
- kmp_proc_bind_t proc_bind_icv =
- proc_bind_default; // proc_bind_default means don't update
+ // proc_bind_default means don't update
+ kmp_proc_bind_t proc_bind_icv = proc_bind_default;
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
} else {
+ // No proc_bind clause specified; use current proc-bind-var for this
+ // parallel region
if (proc_bind == proc_bind_default) {
- // No proc_bind clause specified; use current proc-bind-var for this
- // parallel region
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
}
+ // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
+ if (master_th->th.th_teams_microtask &&
+ microtask == (microtask_t)__kmp_teams_master) {
+ proc_bind = __kmp_teams_proc_bind;
+ }
/* else: The proc_bind policy was specified explicitly on parallel clause.
This overrides proc-bind-var for this parallel region, but does not
change proc-bind-var. */
@@ -1957,7 +2019,11 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if ((level + 1 < __kmp_nested_proc_bind.used) &&
(__kmp_nested_proc_bind.bind_types[level + 1] !=
master_th->th.th_current_task->td_icvs.proc_bind)) {
- proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+ // Do not modify the proc bind icv for the two teams construct forks
+ // They just let the proc bind icv pass through
+ if (!master_th->th.th_teams_microtask ||
+ !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
+ proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
}
}
@@ -1983,6 +2049,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
#endif
proc_bind, &new_icvs,
argc USE_NESTED_HOT_ARG(master_th));
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
+ copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
} else {
/* allocate a new parallel team */
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
@@ -1993,6 +2061,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
proc_bind,
&master_th->th.th_current_task->td_icvs,
argc USE_NESTED_HOT_ARG(master_th));
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
+ copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
+ &master_th->th.th_current_task->td_icvs);
}
KF_TRACE(
10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
@@ -2124,7 +2195,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
root->r.r_active = TRUE;
- __kmp_fork_team_threads(root, team, master_th, gtid);
+ __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
__kmp_setup_icv_copy(team, nthreads,
&master_th->th.th_current_task->td_icvs, loc);
@@ -2359,6 +2430,12 @@ void __kmp_join_call(ident_t *loc, int gtid
parent_team->t.t_stack_id = NULL;
}
#endif
+
+ if (team->t.t_nproc > 1 &&
+ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ team->t.b->update_num_threads(team->t.t_nproc);
+ __kmp_add_threads_to_team(team, team->t.t_nproc);
+ }
}
KMP_MB();
@@ -2387,6 +2464,14 @@ void __kmp_join_call(ident_t *loc, int gtid
} // active_level == 1
#endif /* USE_ITT_BUILD */
+#if KMP_AFFINITY_SUPPORTED
+ if (!exit_teams) {
+ // Restore master thread's partition.
+ master_th->th.th_first_place = team->t.t_first_place;
+ master_th->th.th_last_place = team->t.t_last_place;
+ }
+#endif // KMP_AFFINITY_SUPPORTED
+
if (master_th->th.th_teams_microtask && !exit_teams &&
team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
team->t.t_level == master_th->th.th_teams_level + 1) {
@@ -2494,11 +2579,6 @@ void __kmp_join_call(ident_t *loc, int gtid
master_th, team));
__kmp_pop_current_task_from_thread(master_th);
-#if KMP_AFFINITY_SUPPORTED
- // Restore master thread's partition.
- master_th->th.th_first_place = team->t.t_first_place;
- master_th->th.th_last_place = team->t.t_last_place;
-#endif // KMP_AFFINITY_SUPPORTED
master_th->th.th_def_allocator = team->t.t_def_allocator;
#if OMPD_SUPPORT
@@ -2646,6 +2726,9 @@ void __kmp_set_num_threads(int new_nth, int gtid) {
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
+ }
// Release the extra threads we don't need any more.
for (f = new_nth; f < hot_team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
@@ -2665,6 +2748,11 @@ void __kmp_set_num_threads(int new_nth, int gtid) {
}
#endif
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ hot_team->t.b->update_num_threads(new_nth);
+ __kmp_add_threads_to_team(hot_team, new_nth);
+ }
+
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
// Update the t_nproc field in the threads that are still active.
@@ -4018,7 +4106,8 @@ void __kmp_unregister_root_current_thread(int gtid) {
kmp_task_team_t *task_team = thread->th.th_task_team;
// we need to wait for the proxy tasks before finishing the thread
- if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
+ if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
+ task_team->tt.tt_hidden_helper_task_encountered)) {
#if OMPT_SUPPORT
// the runtime is shutting down so we won't report any events
thread->th.ompt_thread_info.state = ompt_state_undefined;
@@ -4112,7 +4201,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
this_thr->th.th_team_nproc = team->t.t_nproc;
this_thr->th.th_team_master = master;
this_thr->th.th_team_serialized = team->t.t_serialized;
- TCW_PTR(this_thr->th.th_sleep_loc, NULL);
KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
@@ -4281,6 +4369,12 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
new_thr->th.th_task_state_top = 0;
new_thr->th.th_task_state_stack_sz = 4;
+ if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Make sure pool thread has transitioned to waiting on own thread struct
+ KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
+ // Thread activated in __kmp_allocate_team when increasing team size
+ }
+
#ifdef KMP_ADJUST_BLOCKTIME
/* Adjust blocktime back to zero if necessary */
/* Middle initialization might not have occurred yet */
@@ -4448,6 +4542,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
balign[b].bb.use_oncore_barrier = 0;
}
+ TCW_PTR(new_thr->th.th_sleep_loc, NULL);
+ new_thr->th.th_sleep_loc_type = flag_unset;
+
new_thr->th.th_spin_here = FALSE;
new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
@@ -4976,6 +5073,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
kmp_team_t *team;
int use_hot_team = !root->r.r_active;
int level = 0;
+ int do_place_partition = 1;
KA_TRACE(20, ("__kmp_allocate_team: called\n"));
KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
@@ -4997,6 +5095,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
++level; // not increment if #teams==1, or for outer fork of the teams;
// increment otherwise
}
+ // Do not perform the place partition if inner fork of the teams
+ // Wait until nested parallel region encountered inside teams construct
+ if ((master->th.th_teams_size.nteams == 1 &&
+ master->th.th_teams_level >= team->t.t_level) ||
+ (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
+ do_place_partition = 0;
}
hot_teams = master->th.th_hot_teams;
if (level < __kmp_hot_teams_max_level && hot_teams &&
@@ -5027,6 +5131,17 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
#endif
+ if (team->t.t_nproc != new_nproc &&
+ __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Distributed barrier may need a resize
+ int old_nthr = team->t.t_nproc;
+ __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
+ }
+
+ // If not doing the place partition, then reset the team's proc bind
+ // to indicate that partitioning of all threads still needs to take place
+ if (do_place_partition == 0)
+ team->t.t_proc_bind = proc_bind_default;
// Has the number of threads changed?
/* Let's assume the most common case is that the number of threads is
unchanged, and put that case first. */
@@ -5056,16 +5171,20 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
if ((team->t.t_size_changed == 0) &&
(team->t.t_proc_bind == new_proc_bind)) {
if (new_proc_bind == proc_bind_spread) {
- __kmp_partition_places(
- team, 1); // add flag to update only master for spread
+ if (do_place_partition) {
+ // add flag to update only master for spread
+ __kmp_partition_places(team, 1);
+ }
}
KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
"proc_bind = %d, partition = [%d,%d]\n",
team->t.t_id, new_proc_bind, team->t.t_first_place,
team->t.t_last_place));
} else {
- KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
- __kmp_partition_places(team);
+ if (do_place_partition) {
+ KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+ __kmp_partition_places(team);
+ }
}
#else
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
@@ -5076,6 +5195,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
new_nproc));
team->t.t_size_changed = 1;
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Barrier size already reduced earlier in this function
+ // Activate team threads via th_used_in_team
+ __kmp_add_threads_to_team(team, new_nproc);
+ }
#if KMP_NESTED_HOT_TEAMS
if (__kmp_hot_teams_mode == 0) {
// AC: saved number of threads should correspond to team's value in this
@@ -5137,10 +5261,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
#endif
- KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+ if (do_place_partition) {
+ KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
- __kmp_partition_places(team);
+ __kmp_partition_places(team);
#endif
+ }
} else { // team->t.t_nproc < new_nproc
#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
kmp_affin_mask_t *old_mask;
@@ -5152,7 +5278,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
KA_TRACE(20,
("__kmp_allocate_team: increasing hot team thread count to %d\n",
new_nproc));
-
+ int old_nproc = team->t.t_nproc; // save old value and use to update only
team->t.t_size_changed = 1;
#if KMP_NESTED_HOT_TEAMS
@@ -5179,10 +5305,9 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
team->t.t_nproc = new_nproc; // just get reserved threads involved
} else {
- // we may have some threads in reserve, but not enough
- team->t.t_nproc =
- hot_teams[level]
- .hot_team_nth; // get reserved threads involved if any
+ // We may have some threads in reserve, but not enough;
+ // get reserved threads involved if any.
+ team->t.t_nproc = hot_teams[level].hot_team_nth;
hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
#endif // KMP_NESTED_HOT_TEAMS
if (team->t.t_max_nproc < new_nproc) {
@@ -5237,8 +5362,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
#if KMP_NESTED_HOT_TEAMS
} // end of check of t_nproc vs. new_nproc vs. hot_team_nth
#endif // KMP_NESTED_HOT_TEAMS
+ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Barrier size already increased earlier in this function
+ // Activate team threads via th_used_in_team
+ __kmp_add_threads_to_team(team, new_nproc);
+ }
/* make sure everyone is synchronized */
- int old_nproc = team->t.t_nproc; // save old value and use to update only
// new threads below
__kmp_initialize_team(team, new_nproc, new_icvs,
root->r.r_uber_thread->th.th_ident);
@@ -5273,10 +5402,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
#endif
- KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+ if (do_place_partition) {
+ KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
- __kmp_partition_places(team);
+ __kmp_partition_places(team);
#endif
+ }
} // Check changes in number of threads
kmp_info_t *master = team->t.t_threads[0];
@@ -5342,6 +5473,13 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
/* take this team from the team pool */
__kmp_team_pool = team->t.t_next_pool;
+ if (max_nproc > 1 &&
+ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ if (!team->t.b) { // Allocate barrier structure
+ team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
+ }
+ }
+
/* setup the team for fresh use */
__kmp_initialize_team(team, new_nproc, new_icvs, NULL);
@@ -5397,6 +5535,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
/* and set it up */
team->t.t_max_nproc = max_nproc;
+ if (max_nproc > 1 &&
+ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ // Allocate barrier structure
+ team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
+ }
+
/* NOTE well, for some reason allocating one big buffer and dividing it up
seems to really hurt performance a lot on the P4, so, let's not use this */
__kmp_allocate_team_arrays(team, max_nproc);
@@ -5469,7 +5613,6 @@ void __kmp_free_team(kmp_root_t *root,
int use_hot_team = team == root->r.r_hot_team;
#if KMP_NESTED_HOT_TEAMS
int level;
- kmp_hot_team_ptr_t *hot_teams;
if (master) {
level = team->t.t_active_level - 1;
if (master->th.th_teams_microtask) { // in teams construct?
@@ -5483,7 +5626,9 @@ void __kmp_free_team(kmp_root_t *root,
// team_of_workers before the parallel
} // team->t.t_level will be increased inside parallel
}
- hot_teams = master->th.th_hot_teams;
+#if KMP_DEBUG
+ kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
+#endif
if (level < __kmp_hot_teams_max_level) {
KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
use_hot_team = 1;
@@ -5553,10 +5698,43 @@ void __kmp_free_team(kmp_root_t *root,
/* free the worker threads */
for (f = 1; f < team->t.t_nproc; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
+ if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
+ 1, 2);
+ }
__kmp_free_thread(team->t.t_threads[f]);
+ }
+
+ if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ if (team->t.b) {
+ // wake up thread at old location
+ team->t.b->go_release();
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+ for (f = 1; f < team->t.t_nproc; ++f) {
+ if (team->t.b->sleep[f].sleep) {
+ __kmp_atomic_resume_64(
+ team->t.t_threads[f]->th.th_info.ds.ds_gtid,
+ (kmp_atomic_flag_64<> *)NULL);
+ }
+ }
+ }
+ // Wait for threads to be removed from team
+ for (int f = 1; f < team->t.t_nproc; ++f) {
+ while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
+ KMP_CPU_PAUSE();
+ }
+ }
+ }
+
+ for (f = 1; f < team->t.t_nproc; ++f) {
team->t.t_threads[f] = NULL;
}
+ if (team->t.t_max_nproc > 1 &&
+ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ distributedBarrier::deallocate(team->t.b);
+ team->t.b = NULL;
+ }
/* put the team back in the team pool */
/* TODO limit size of team pool, call reap_team if pool too large */
team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
@@ -5955,11 +6133,18 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
KA_TRACE(
20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
gtid));
- /* Need release fence here to prevent seg faults for tree forkjoin barrier
- * (GEH) */
- kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
- thread);
- __kmp_release_64(&flag);
+ if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+ while (
+ !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
+ KMP_CPU_PAUSE();
+ __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
+ } else {
+ /* Need release fence here to prevent seg faults for tree forkjoin
+ barrier (GEH) */
+ kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
+ thread);
+ __kmp_release_64(&flag);
+ }
}
// Terminate OS thread.
@@ -6054,6 +6239,31 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
} // __kmp_reap_thread
+static void __kmp_itthash_clean(kmp_info_t *th) {
+#if USE_ITT_NOTIFY
+ if (__kmp_itt_region_domains.count > 0) {
+ for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
+ kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
+ while (bucket) {
+ kmp_itthash_entry_t *next = bucket->next_in_bucket;
+ __kmp_thread_free(th, bucket);
+ bucket = next;
+ }
+ }
+ }
+ if (__kmp_itt_barrier_domains.count > 0) {
+ for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
+ kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
+ while (bucket) {
+ kmp_itthash_entry_t *next = bucket->next_in_bucket;
+ __kmp_thread_free(th, bucket);
+ bucket = next;
+ }
+ }
+ }
+#endif
+}
+
static void __kmp_internal_end(void) {
int i;
@@ -6240,6 +6450,7 @@ void __kmp_internal_end_library(int gtid_req) {
gtid));
return;
} else {
+ __kmp_itthash_clean(__kmp_threads[gtid]);
KA_TRACE(
10,
("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
@@ -6486,7 +6697,7 @@ void __kmp_register_library_startup(void) {
char *value = NULL; // Actual value of the environment variable.
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
char *shm_name = __kmp_str_format("/%s", name);
int shm_preexist = 0;
char *data1;
@@ -6591,7 +6802,7 @@ void __kmp_register_library_startup(void) {
} break;
case 2: { // Neighbor is dead.
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
// close shared memory.
shm_unlink(shm_name); // this removes file in /dev/shm
#else
@@ -6605,7 +6816,7 @@ void __kmp_register_library_startup(void) {
}
}
KMP_INTERNAL_FREE((void *)value);
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
KMP_INTERNAL_FREE((void *)shm_name);
#endif
} // while
@@ -6618,7 +6829,7 @@ void __kmp_unregister_library(void) {
char *name = __kmp_reg_status_name();
char *value = NULL;
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
char *shm_name = __kmp_str_format("/%s", name);
int fd1 = shm_open(shm_name, O_RDONLY, 0666);
if (fd1 == -1) {
@@ -6639,14 +6850,14 @@ void __kmp_unregister_library(void) {
KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
// Ok, this is our variable. Delete it.
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
shm_unlink(shm_name); // this removes file in /dev/shm
#else
__kmp_env_unset(name);
#endif
}
-#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
+#if defined(KMP_USE_SHM)
KMP_INTERNAL_FREE(shm_name);
#endif
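
These hunks replace the `KMP_OS_UNIX && KMP_DYNAMIC_LIB` condition with a dedicated `KMP_USE_SHM` macro at every site that touches the shared-memory registration record. A simplified sketch of the underlying POSIX technique (the runtime's actual record format, neighbor detection, and error handling are omitted):

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <cstring>

    bool register_marker(const char *shm_name, const char *value, size_t size) {
      // O_EXCL makes creation atomic: failure means a neighbor runtime
      // registered first.
      int fd = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
      if (fd == -1)
        return false;
      if (ftruncate(fd, (off_t)size) == -1) {
        close(fd);
        return false;
      }
      char *data = (char *)mmap(nullptr, size, PROT_READ | PROT_WRITE,
                                MAP_SHARED, fd, 0);
      strncpy(data, value, size - 1); // publish the registration string
      munmap(data, size);
      close(fd);
      return true;
    }

    void unregister_marker(const char *shm_name) {
      shm_unlink(shm_name); // removes the file under /dev/shm
    }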
@@ -6684,7 +6895,9 @@ static void __kmp_check_mic_type() {
static void __kmp_user_level_mwait_init() {
struct kmp_cpuid buf;
__kmp_x86_cpuid(7, 0, &buf);
- __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
+ __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
+ __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
+ __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
__kmp_umwait_enabled));
}
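
WAITPKG support is reported in CPUID.(EAX=7,ECX=0):ECX bit 5, which the new code caches in __kmp_waitpkg_enabled before deriving the umwait/tpause flags. An equivalent stand-alone check using the GCC/Clang <cpuid.h> helper (a sketch; the runtime uses its own __kmp_x86_cpuid wrapper):

    #include <cpuid.h>

    bool has_waitpkg() {
      unsigned eax, ebx, ecx, edx;
      if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return false;                // leaf 7 not supported
      return (ecx >> 5) & 1;         // CPUID.(EAX=7,ECX=0):ECX[5] = WAITPKG
    }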
@@ -6844,8 +7057,8 @@ static void __kmp_do_serial_initialize(void) {
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
-#define kmp_reduction_barrier_gather_pat bp_hyper_bar
-#define kmp_reduction_barrier_release_pat bp_hyper_bar
+#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
+#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
#endif // KMP_FAST_REDUCTION_BARRIER
for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
__kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
@@ -7500,6 +7713,11 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
num_threads = 1;
}
} else {
+ if (num_threads < 0) {
+ __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
+ __kmp_msg_null);
+ num_threads = 1;
+ }
// This thread will be the primary thread of the league's primary threads
// Store new thread limit; old limit is saved in th_cg_roots list
thr->th.th_current_task->td_icvs.thread_limit = num_threads;
@@ -7531,9 +7749,13 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
int num_threads) {
kmp_info_t *thr = __kmp_threads[gtid];
- KMP_DEBUG_ASSERT(num_teams >= 0);
- KMP_DEBUG_ASSERT(num_threads >= 0);
-
+ if (num_teams < 0) {
+ // OpenMP specification requires requested values to be positive,
+ // but people can send us any value, so we'd better check
+ __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
+ __kmp_msg_null);
+ num_teams = 1;
+ }
if (num_teams == 0) {
if (__kmp_nteams > 0) {
num_teams = __kmp_nteams;
@@ -7590,7 +7812,7 @@ void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
} else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
num_teams = num_teams_ub;
} else { // num_teams_lb <= num_teams <= num_teams_ub
- if (num_threads == 0) {
+ if (num_threads <= 0) {
if (num_teams_ub > __kmp_teams_max_nth) {
num_teams = num_teams_lb;
} else {
@@ -8702,6 +8924,96 @@ void __kmp_omp_display_env(int verbose) {
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
+// The team size is changing, so distributed barrier must be modified
+void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
+ int new_nthreads) {
+ KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
+ bp_dist_bar);
+ kmp_info_t **other_threads = team->t.t_threads;
+
+ // We want all the workers to stop waiting on the barrier while we adjust the
+ // size of the team.
+ for (int f = 1; f < old_nthreads; ++f) {
+ KMP_DEBUG_ASSERT(other_threads[f] != NULL);
+ // Ignore threads that are already inactive or not present in the team
+ if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
+ // teams construct causes thread_limit to get passed in, and some of
+ // those could be inactive; just ignore them
+ continue;
+ }
+ // If thread is transitioning still to in_use state, wait for it
+ if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
+ while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
+ KMP_CPU_PAUSE();
+ }
+ // The thread should be in_use now
+ KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
+ // Transition to unused state
+ team->t.t_threads[f]->th.th_used_in_team.store(2);
+ KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
+ }
+ // Release all the workers
+ kmp_uint64 new_value; // new value for go
+ new_value = team->t.b->go_release();
+
+ KMP_MFENCE();
+
+ // Workers should see transition status 2 and move to 0; but may need to be
+ // woken up first
+ size_t my_go_index;
+ int count = old_nthreads - 1;
+ while (count > 0) {
+ count = old_nthreads - 1;
+ for (int f = 1; f < old_nthreads; ++f) {
+ my_go_index = f / team->t.b->threads_per_go;
+ if (other_threads[f]->th.th_used_in_team.load() != 0) {
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
+ kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
+ void *, other_threads[f]->th.th_sleep_loc);
+ __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
+ }
+ } else {
+ KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
+ count--;
+ }
+ }
+ }
+ // Now update the barrier size
+ team->t.b->update_num_threads(new_nthreads);
+ team->t.b->go_reset();
+}
+
+void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
+ // Add the threads back to the team
+ KMP_DEBUG_ASSERT(team);
+ // Threads were paused and pointed at th_used_in_team temporarily during a
+ // resize of the team. We're going to set th_used_in_team to 3 to indicate to
+ // the thread that it should transition itself back into the team. Then, if
+ // blocktime isn't infinite, the thread could be sleeping, so we send a resume
+ // to wake it up.
+ for (int f = 1; f < new_nthreads; ++f) {
+ KMP_DEBUG_ASSERT(team->t.t_threads[f]);
+ KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
+ 3);
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
+ __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
+ (kmp_flag_32<false, false> *)NULL);
+ }
+ }
+ // The threads should be transitioning to the team; when they are done, they
+ // should have set th_used_in_team to 1. This loop forces master to wait until
+ // all threads have moved into the team and are waiting in the barrier.
+ int count = new_nthreads - 1;
+ while (count > 0) {
+ count = new_nthreads - 1;
+ for (int f = 1; f < new_nthreads; ++f) {
+ if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
+ count--;
+ }
+ }
+ }
+}
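
Taken together, __kmp_resize_dist_barrier and __kmp_add_threads_to_team drive th_used_in_team through four states: 0 (outside the team), 1 (active member), 2 (asked to leave), 3 (asked to join). A self-contained model of that state machine, illustrative only:

    #include <atomic>

    struct worker_t {
      std::atomic<int> used_in_team{0};
    };

    // Master side: ask an active worker to leave during a resize.
    inline void request_leave(worker_t &w) {
      int expected = 1;
      w.used_in_team.compare_exchange_strong(expected, 2);
    }

    // Master side: ask an idle worker to (re)join after the resize.
    inline void request_join(worker_t &w) {
      int expected = 0;
      w.used_in_team.compare_exchange_strong(expected, 3);
    }

    // Worker side: acknowledge whichever transition was requested;
    // the master spins until it observes the final state.
    inline void acknowledge(worker_t &w) {
      int s = w.used_in_team.load();
      if (s == 2)
        w.used_in_team.store(0); // finished leaving
      else if (s == 3)
        w.used_in_team.store(1); // finished joining
    }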
+
// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;