field | value | date
---|---|---
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:17 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:17 +0300
commit | d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (patch) |
tree | dd4bd3ca0f36b817e96812825ffaf10d645803f2 /contrib/libs/cxxsupp/openmp/kmp_barrier.cpp |
parent | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (diff) |
download | ydb-d3a398281c6fd1d3672036cb2d63f842d2cb28c5.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_barrier.cpp')
-rw-r--r-- | contrib/libs/cxxsupp/openmp/kmp_barrier.cpp | 3302 |
1 file changed, 1651 insertions, 1651 deletions
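
The first hunk below covers the linear (centralized) barrier: each worker bumps an "arrived" flag that the master polls, and the master then releases every worker through a per-thread "go" flag. As a rough orientation aid, here is a self-contained sketch of that gather/release shape using std::atomic; it is an illustration under assumed names (LinearBarrier, arrive_and_wait), not the runtime's actual kmp_flag_64 / b_arrived / b_go machinery.

```cpp
// Illustrative sketch only: a centralized (linear) barrier with the same
// gather/release structure as __kmp_linear_barrier_gather/_release in the
// diff below. Names (LinearBarrier, arrive_and_wait) are invented here.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

class LinearBarrier {
public:
    explicit LinearBarrier(int nthreads) : nproc_(nthreads), go_(nthreads) {
        for (auto &g : go_) g.store(0, std::memory_order_relaxed);
    }

    // tid 0 plays the "master" role: it gathers arrivals, then releases workers.
    void arrive_and_wait(int tid) {
        unsigned gen = generation_.load(std::memory_order_relaxed);
        if (tid == 0) {
            // Gather: wait until every worker has bumped 'arrived_'.
            while (arrived_.load(std::memory_order_acquire) != nproc_ - 1)
                std::this_thread::yield();
            arrived_.store(0, std::memory_order_relaxed);
            generation_.fetch_add(1, std::memory_order_relaxed);
            // Release: bump each worker's private go flag (like the b_go bumps).
            for (int i = 1; i < nproc_; ++i)
                go_[i].store(gen + 1, std::memory_order_release);
        } else {
            // Worker: signal arrival, then spin on our own go flag.
            arrived_.fetch_add(1, std::memory_order_acq_rel);
            while (go_[tid].load(std::memory_order_acquire) != gen + 1)
                std::this_thread::yield();
        }
    }

private:
    const int nproc_;
    std::atomic<int> arrived_{0};
    std::atomic<unsigned> generation_{0};
    std::vector<std::atomic<unsigned>> go_;
};

int main() {
    const int n = 4;
    LinearBarrier bar(n);
    std::vector<std::thread> pool;
    for (int tid = 0; tid < n; ++tid)
        pool.emplace_back([&, tid] {
            for (int iter = 0; iter < 3; ++iter) {
                bar.arrive_and_wait(tid);
                std::printf("T#%d passed barrier %d\n", tid, iter);
            }
        });
    for (auto &t : pool) t.join();
}
```

The real implementation layers sleep/wake handling, ITT instrumentation, ICV propagation, and reduction callbacks on top of this basic shape, and the tree, hyper, and hierarchical variants further in the diff replace the single arrived counter with per-subtree flags.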
diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp index 23986c73ba..6b66dabba2 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp @@ -1,226 +1,226 @@ -/* - * kmp_barrier.cpp - */ - - -//===----------------------------------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source Licenses. See LICENSE.txt for details. -// -//===----------------------------------------------------------------------===// - - -#include "kmp.h" -#include "kmp_wait_release.h" -#include "kmp_stats.h" -#include "kmp_itt.h" -#include "kmp_os.h" - - -#if KMP_MIC -#include <immintrin.h> -#define USE_NGO_STORES 1 -#endif // KMP_MIC - -#if KMP_MIC && USE_NGO_STORES -// ICV copying -#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) -#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) -#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) -#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory") -#else -#define ngo_load(src) ((void)0) -#define ngo_store_icvs(dst, src) copy_icvs((dst), (src)) -#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE) -#define ngo_sync() ((void)0) -#endif /* KMP_MIC && USE_NGO_STORES */ - -void __kmp_print_structure(void); // Forward declaration - -// ---------------------------- Barrier Algorithms ---------------------------- - -// Linear Barrier -static void -__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - void (*reduce)(void *, void *) - USE_ITT_BUILD_ARG(void * itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather); +/* + * kmp_barrier.cpp + */ + + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// + + +#include "kmp.h" +#include "kmp_wait_release.h" +#include "kmp_stats.h" +#include "kmp_itt.h" +#include "kmp_os.h" + + +#if KMP_MIC +#include <immintrin.h> +#define USE_NGO_STORES 1 +#endif // KMP_MIC + +#if KMP_MIC && USE_NGO_STORES +// ICV copying +#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) +#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) +#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) +#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory") +#else +#define ngo_load(src) ((void)0) +#define ngo_store_icvs(dst, src) copy_icvs((dst), (src)) +#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE) +#define ngo_sync() ((void)0) +#endif /* KMP_MIC && USE_NGO_STORES */ + +void __kmp_print_structure(void); // Forward declaration + +// ---------------------------- Barrier Algorithms ---------------------------- + +// Linear Barrier +static void +__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather); kmp_team_t *team = this_thr->th.th_team; kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb; kmp_info_t **other_threads = team->t.t_threads; - - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); - } -#endif - // We now perform a linear reduction to signal that all of the threads have arrived. - if (!KMP_MASTER_TID(tid)) { - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" - "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived, - thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); - // Mark arrival to master thread - /* After performing this write, a worker thread may not assume that the team is valid - any more - it could be deallocated by the master thread at any time. */ - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]); - flag.release(); - } else { + + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); + } +#endif + // We now perform a linear reduction to signal that all of the threads have arrived. 
+ if (!KMP_MASTER_TID(tid)) { + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" + "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived, + thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + // Mark arrival to master thread + /* After performing this write, a worker thread may not assume that the team is valid + any more - it could be deallocated by the master thread at any time. */ + kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]); + flag.release(); + } else { kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; int nproc = this_thr->th.th_team_nproc; int i; - // Don't have to worry about sleep bit here or atomic since team setting + // Don't have to worry about sleep bit here or atomic since team setting kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; - - // Collect all the worker team member threads. - for (i=1; i<nproc; ++i) { -#if KMP_CACHE_MANAGE - // Prefetch next thread's arrived count - if (i+1 < nproc) - KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived); -#endif /* KMP_CACHE_MANAGE */ - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " - "arrived(%p) == %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(i, team), team->t.t_id, i, - &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); - - // Wait for worker thread to arrive - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - write min of the thread time and the other thread time to the thread. - if (__kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, - other_threads[i]->th.th_bar_min_time); - } -#endif - if (reduce) { - KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid, - team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i)); - (*reduce)(this_thr->th.th_local.reduce_data, - other_threads[i]->th.th_local.reduce_data); - } - } - // Don't have to worry about sleep bit here or atomic since team setting - team_bar->b_arrived = new_state; - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", - gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state)); - } - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); -} - -static void -__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release); + + // Collect all the worker team member threads. 
+ for (i=1; i<nproc; ++i) { +#if KMP_CACHE_MANAGE + // Prefetch next thread's arrived count + if (i+1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " + "arrived(%p) == %llu\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(i, team), team->t.t_id, i, + &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); + + // Wait for worker thread to arrive + kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - write min of the thread time and the other thread time to the thread. + if (__kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + other_threads[i]->th.th_bar_min_time); + } +#endif + if (reduce) { + KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid, + team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i)); + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[i]->th.th_local.reduce_data); + } + } + // Don't have to worry about sleep bit here or atomic since team setting + team_bar->b_arrived = new_state; + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state)); + } + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void +__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release); kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; kmp_team_t *team; - - if (KMP_MASTER_TID(tid)) { + + if (KMP_MASTER_TID(tid)) { unsigned int i; kmp_uint32 nproc = this_thr->th.th_team_nproc; kmp_info_t **other_threads; - - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - other_threads = team->t.t_threads; - - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - - if (nproc > 1) { -#if KMP_BARRIER_ICV_PUSH - { - KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy); - if (propagate_icvs) { - ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); - for (i=1; i<nproc; ++i) { - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE); - ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, - &team->t.t_implicit_task_taskdata[0].td_icvs); - } - ngo_sync(); - } - } -#endif // KMP_BARRIER_ICV_PUSH - - // Now, release all of the worker threads - for (i=1; i<nproc; ++i) { -#if KMP_CACHE_MANAGE - // Prefetch next thread's go flag - if (i+1 < nproc) - KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go); -#endif /* KMP_CACHE_MANAGE */ - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " - "go(%p): %u => %u\n", gtid, team->t.t_id, tid, - other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i, - &other_threads[i]->th.th_bar[bt].bb.b_go, - other_threads[i]->th.th_bar[bt].bb.b_go, - other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]); - flag.release(); - } - } - } else { // Wait for the MASTER thread to release us - KA_TRACE(20, 
("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); - flag.wait(this_thr, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { - // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled) - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); - // Cancel wait on previous parallel region... - __kmp_itt_task_starting(itt_sync_obj); - - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - if (itt_sync_obj != NULL) - // Call prepare as early as possible for "new" barrier - __kmp_itt_task_finished(itt_sync_obj); - } else -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - // Early exit for reaping threads releasing forkjoin barrier - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - // The worker thread may now assume that the team is valid. -#ifdef KMP_DEBUG - tid = __kmp_tid_from_gtid(gtid); - team = __kmp_threads[gtid]->th.th_team; -#endif - KMP_DEBUG_ASSERT(team != NULL); - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); - KMP_MB(); // Flush all pending memory write invalidates. - } - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); -} - -// Tree barrier -static void -__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - void (*reduce)(void *, void *) - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather); + + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + other_threads = team->t.t_threads; + + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + + if (nproc > 1) { +#if KMP_BARRIER_ICV_PUSH + { + KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy); + if (propagate_icvs) { + ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); + for (i=1; i<nproc; ++i) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE); + ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, + &team->t.t_implicit_task_taskdata[0].td_icvs); + } + ngo_sync(); + } + } +#endif // KMP_BARRIER_ICV_PUSH + + // Now, release all of the worker threads + for (i=1; i<nproc; ++i) { +#if KMP_CACHE_MANAGE + // Prefetch next thread's go flag + if (i+1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " + "go(%p): %u => %u\n", gtid, team->t.t_id, tid, + other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i, + &other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); + kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]); + flag.release(); + } + } + } else { // Wait for the MASTER thread to release us + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", + gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + 
flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + // Early exit for reaping threads releasing forkjoin barrier + if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) + return; + // The worker thread may now assume that the team is valid. +#ifdef KMP_DEBUG + tid = __kmp_tid_from_gtid(gtid); + team = __kmp_threads[gtid]->th.th_team; +#endif + KMP_DEBUG_ASSERT(team != NULL); + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// Tree barrier +static void +__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather); kmp_team_t *team = this_thr->th.th_team; kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; kmp_info_t **other_threads = team->t.t_threads; @@ -230,92 +230,92 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, kmp_uint32 child; kmp_uint32 child_tid; kmp_uint64 new_state; - - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); - } -#endif - // Perform tree gather to wait until all threads have arrived; reduce any required data as we go - child_tid = (tid << branch_bits) + 1; - if (child_tid < nproc) { - // Parent threads wait for all their children to arrive - new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; - child = 1; - do { + + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); + } +#endif + // Perform tree gather to wait until all threads have arrived; reduce any required data as we go + child_tid = (tid << branch_bits) + 1; + if (child_tid < nproc) { + // Parent threads wait for all their children to arrive + new_state = team->t.t_bar[bt].b_arrived + 
KMP_BARRIER_STATE_BUMP; + child = 1; + do { kmp_info_t *child_thr = other_threads[child_tid]; kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; -#if KMP_CACHE_MANAGE - // Prefetch next thread's arrived count - if (child+1 <= branch_factor && child_tid+1 < nproc) - KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived); -#endif /* KMP_CACHE_MANAGE */ - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " - "arrived(%p) == %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, - &child_bar->b_arrived, new_state)); - // Wait for child to arrive - kmp_flag_64 flag(&child_bar->b_arrived, new_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - write min of the thread time and a child time to the thread. - if (__kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, - child_thr->th.th_bar_min_time); - } -#endif - if (reduce) { - KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); - } - child++; - child_tid++; - } - while (child <= branch_factor && child_tid < nproc); - } - - if (!KMP_MASTER_TID(tid)) { // Worker threads +#if KMP_CACHE_MANAGE + // Prefetch next thread's arrived count + if (child+1 <= branch_factor && child_tid+1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " + "arrived(%p) == %llu\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, + &child_bar->b_arrived, new_state)); + // Wait for child to arrive + kmp_flag_64 flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - write min of the thread time and a child time to the thread. + if (__kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + child_thr->th.th_bar_min_time); + } +#endif + if (reduce) { + KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + } + child++; + child_tid++; + } + while (child <= branch_factor && child_tid < nproc); + } + + if (!KMP_MASTER_TID(tid)) { // Worker threads kmp_int32 parent_tid = (tid - 1) >> branch_bits; - - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " - "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid, - &thr_bar->b_arrived, thr_bar->b_arrived, - thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); - - // Mark arrival to parent thread - /* After performing this write, a worker thread may not assume that the team is valid - any more - it could be deallocated by the master thread at any time. 
*/ - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); - flag.release(); - } else { - // Need to update the team arrived pointer if we are the master thread - if (nproc > 1) // New value was already computed above - team->t.t_bar[bt].b_arrived = new_state; - else - team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", - gtid, team->t.t_id, tid, team->t.t_id, - &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); - } - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); -} - -static void -__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release); + + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid, + &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + + // Mark arrival to parent thread + /* After performing this write, a worker thread may not assume that the team is valid + any more - it could be deallocated by the master thread at any time. */ + kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); + flag.release(); + } else { + // Need to update the team arrived pointer if we are the master thread + if (nproc > 1) // New value was already computed above + team->t.t_bar[bt].b_arrived = new_state; + else + team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void +__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release); kmp_team_t *team; kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; kmp_uint32 nproc; @@ -323,102 +323,102 @@ __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, kmp_uint32 branch_factor = 1 << branch_bits; kmp_uint32 child; kmp_uint32 child_tid; - - // Perform a tree release for all of the threads that have been gathered - if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); - // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); - flag.wait(this_thr, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { - // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled) - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); - // Cancel wait on previous parallel region... 
- __kmp_itt_task_starting(itt_sync_obj); - - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - if (itt_sync_obj != NULL) - // Call prepare as early as possible for "new" barrier - __kmp_itt_task_finished(itt_sync_obj); - } else -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - // Early exit for reaping threads releasing forkjoin barrier - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - - // The worker thread may now assume that the team is valid. - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - tid = __kmp_tid_from_gtid(gtid); - - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); - KMP_MB(); // Flush all pending memory write invalidates. - } else { - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - } - nproc = this_thr->th.th_team_nproc; - child_tid = (tid << branch_bits) + 1; - - if (child_tid < nproc) { + + // Perform a tree release for all of the threads that have been gathered + if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", + gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + // Wait for parent thread to release us + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. 
+ } else { + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + } + nproc = this_thr->th.th_team_nproc; + child_tid = (tid << branch_bits) + 1; + + if (child_tid < nproc) { kmp_info_t **other_threads = team->t.t_threads; - child = 1; - // Parent threads release all their children - do { + child = 1; + // Parent threads release all their children + do { kmp_info_t *child_thr = other_threads[child_tid]; kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; -#if KMP_CACHE_MANAGE - // Prefetch next thread's go count - if (child+1 <= branch_factor && child_tid+1 < nproc) - KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go); -#endif /* KMP_CACHE_MANAGE */ - -#if KMP_BARRIER_ICV_PUSH - { - KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy); - if (propagate_icvs) { - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid], - team, child_tid, FALSE); - copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, - &team->t.t_implicit_task_taskdata[0].td_icvs); - } - } -#endif // KMP_BARRIER_ICV_PUSH - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" - "go(%p): %u => %u\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(child_tid, team), team->t.t_id, - child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Release child from barrier - kmp_flag_64 flag(&child_bar->b_go, child_thr); - flag.release(); - child++; - child_tid++; - } - while (child <= branch_factor && child_tid < nproc); - } - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); -} - - -// Hyper Barrier -static void -__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - void (*reduce)(void *, void *) - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather); +#if KMP_CACHE_MANAGE + // Prefetch next thread's go count + if (child+1 <= branch_factor && child_tid+1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go); +#endif /* KMP_CACHE_MANAGE */ + +#if KMP_BARRIER_ICV_PUSH + { + KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy); + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid], + team, child_tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, + &team->t.t_implicit_task_taskdata[0].td_icvs); + } + } +#endif // KMP_BARRIER_ICV_PUSH + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" + "go(%p): %u => %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child from barrier + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + child++; + child_tid++; + } + while (child <= branch_factor && child_tid < nproc); + } + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + + +// Hyper Barrier +static void +__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather); kmp_team_t *team = this_thr->th.th_team; kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; kmp_info_t 
**other_threads = team->t.t_threads; @@ -428,103 +428,103 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, kmp_uint32 branch_factor = 1 << branch_bits; kmp_uint32 offset; kmp_uint32 level; - - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - - KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); - } -#endif - /* Perform a hypercube-embedded tree gather to wait until all of the threads have - arrived, and reduce any required data as we go. */ - kmp_flag_64 p_flag(&thr_bar->b_arrived); - for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) - { + + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); + } +#endif + /* Perform a hypercube-embedded tree gather to wait until all of the threads have + arrived, and reduce any required data as we go. */ + kmp_flag_64 p_flag(&thr_bar->b_arrived); + for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) + { kmp_uint32 child; kmp_uint32 child_tid; - - if (((tid >> level) & (branch_factor - 1)) != 0) { + + if (((tid >> level) & (branch_factor - 1)) != 0) { kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1); - - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " - "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid, - &thr_bar->b_arrived, thr_bar->b_arrived, - thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); - // Mark arrival to parent thread - /* After performing this write (in the last iteration of the enclosing for loop), - a worker thread may not assume that the team is valid any more - it could be - deallocated by the master thread at any time. */ - p_flag.set_waiter(other_threads[parent_tid]); - p_flag.release(); - break; - } - - // Parent threads wait for children to arrive - if (new_state == KMP_BARRIER_UNUSED_STATE) - new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; - for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads; - child++, child_tid+=(1 << level)) - { + + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid, + &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + // Mark arrival to parent thread + /* After performing this write (in the last iteration of the enclosing for loop), + a worker thread may not assume that the team is valid any more - it could be + deallocated by the master thread at any time. 
*/ + p_flag.set_waiter(other_threads[parent_tid]); + p_flag.release(); + break; + } + + // Parent threads wait for children to arrive + if (new_state == KMP_BARRIER_UNUSED_STATE) + new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads; + child++, child_tid+=(1 << level)) + { kmp_info_t *child_thr = other_threads[child_tid]; kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; -#if KMP_CACHE_MANAGE +#if KMP_CACHE_MANAGE kmp_uint32 next_child_tid = child_tid + (1 << level); - // Prefetch next thread's arrived count - if (child+1 < branch_factor && next_child_tid < num_threads) - KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); -#endif /* KMP_CACHE_MANAGE */ - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " - "arrived(%p) == %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, - &child_bar->b_arrived, new_state)); - // Wait for child to arrive - kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); - c_flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - write min of the thread time and a child time to the thread. - if (__kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, - child_thr->th.th_bar_min_time); - } -#endif - if (reduce) { - KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); - } - } - } - - if (KMP_MASTER_TID(tid)) { - // Need to update the team arrived pointer if we are the master thread - if (new_state == KMP_BARRIER_UNUSED_STATE) - team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; - else - team->t.t_bar[bt].b_arrived = new_state; - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", - gtid, team->t.t_id, tid, team->t.t_id, - &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); - } - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); -} - -// The reverse versions seem to beat the forward versions overall -#define KMP_REVERSE_HYPER_BAR -static void -__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release); + // Prefetch next thread's arrived count + if (child+1 < branch_factor && next_child_tid < num_threads) + KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " + "arrived(%p) == %llu\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, + &child_bar->b_arrived, new_state)); + // Wait for child to arrive + kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); + c_flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - write min of the thread time and a child time to the thread. 
+ if (__kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + child_thr->th.th_bar_min_time); + } +#endif + if (reduce) { + KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + } + } + } + + if (KMP_MASTER_TID(tid)) { + // Need to update the team arrived pointer if we are the master thread + if (new_state == KMP_BARRIER_UNUSED_STATE) + team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; + else + team->t.t_bar[bt].b_arrived = new_state; + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// The reverse versions seem to beat the forward versions overall +#define KMP_REVERSE_HYPER_BAR +static void +__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release); kmp_team_t *team; kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; kmp_info_t **other_threads; @@ -535,1208 +535,1208 @@ __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid kmp_uint32 child_tid; kmp_uint32 offset; kmp_uint32 level; - - /* Perform a hypercube-embedded tree release for all of the threads that have been gathered. - If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse - order of the corresponding gather, otherwise threads are released in the same order. */ - if (KMP_MASTER_TID(tid)) { // master - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); -#if KMP_BARRIER_ICV_PUSH - if (propagate_icvs) { // master already has ICVs in final destination; copy - copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); - } -#endif - } - else { // Handle fork barrier workers who aren't part of a team yet - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); - // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); - flag.wait(this_thr, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { - // In fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); - // Cancel wait on previous parallel region... 
- __kmp_itt_task_starting(itt_sync_obj); - - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - if (itt_sync_obj != NULL) - // Call prepare as early as possible for "new" barrier - __kmp_itt_task_finished(itt_sync_obj); - } else -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - // Early exit for reaping threads releasing forkjoin barrier - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - - // The worker thread may now assume that the team is valid. - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - tid = __kmp_tid_from_gtid(gtid); - - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); - KMP_MB(); // Flush all pending memory write invalidates. - } - num_threads = this_thr->th.th_team_nproc; - other_threads = team->t.t_threads; - -#ifdef KMP_REVERSE_HYPER_BAR - // Count up to correct level for parent - for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0); - level+=branch_bits, offset<<=branch_bits); - - // Now go down from there - for (level-=branch_bits, offset>>=branch_bits; offset != 0; - level-=branch_bits, offset>>=branch_bits) -#else - // Go down the tree, level by level - for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) -#endif // KMP_REVERSE_HYPER_BAR - { -#ifdef KMP_REVERSE_HYPER_BAR - /* Now go in reverse order through the children, highest to lowest. - Initial setting of child is conservative here. */ - child = num_threads >> ((level==0)?level:level-1); - for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level); - child>=1; child--, child_tid-=(1<<level)) -#else - if (((tid >> level) & (branch_factor - 1)) != 0) - // No need to go lower than this, since this is the level parent would be notified - break; - // Iterate through children on this level of the tree - for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads; - child++, child_tid+=(1<<level)) -#endif // KMP_REVERSE_HYPER_BAR - { - if (child_tid >= num_threads) continue; // Child doesn't exist so keep going - else { + + /* Perform a hypercube-embedded tree release for all of the threads that have been gathered. + If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse + order of the corresponding gather, otherwise threads are released in the same order. 
*/ + if (KMP_MASTER_TID(tid)) { // master + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { // master already has ICVs in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + } + else { // Handle fork barrier workers who aren't part of a team yet + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", + gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + // Wait for parent thread to release us + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + num_threads = this_thr->th.th_team_nproc; + other_threads = team->t.t_threads; + +#ifdef KMP_REVERSE_HYPER_BAR + // Count up to correct level for parent + for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0); + level+=branch_bits, offset<<=branch_bits); + + // Now go down from there + for (level-=branch_bits, offset>>=branch_bits; offset != 0; + level-=branch_bits, offset>>=branch_bits) +#else + // Go down the tree, level by level + for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) +#endif // KMP_REVERSE_HYPER_BAR + { +#ifdef KMP_REVERSE_HYPER_BAR + /* Now go in reverse order through the children, highest to lowest. + Initial setting of child is conservative here. */ + child = num_threads >> ((level==0)?level:level-1); + for (child=(child<branch_factor-1) ? 
child : branch_factor-1, child_tid=tid+(child<<level); + child>=1; child--, child_tid-=(1<<level)) +#else + if (((tid >> level) & (branch_factor - 1)) != 0) + // No need to go lower than this, since this is the level parent would be notified + break; + // Iterate through children on this level of the tree + for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads; + child++, child_tid+=(1<<level)) +#endif // KMP_REVERSE_HYPER_BAR + { + if (child_tid >= num_threads) continue; // Child doesn't exist so keep going + else { kmp_info_t *child_thr = other_threads[child_tid]; kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; -#if KMP_CACHE_MANAGE +#if KMP_CACHE_MANAGE kmp_uint32 next_child_tid = child_tid - (1 << level); - // Prefetch next thread's go count -# ifdef KMP_REVERSE_HYPER_BAR - if (child-1 >= 1 && next_child_tid < num_threads) -# else - if (child+1 < branch_factor && next_child_tid < num_threads) -# endif // KMP_REVERSE_HYPER_BAR - KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); -#endif /* KMP_CACHE_MANAGE */ - -#if KMP_BARRIER_ICV_PUSH - if (propagate_icvs) // push my fixed ICVs to my child - copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); -#endif // KMP_BARRIER_ICV_PUSH - - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" - "go(%p): %u => %u\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(child_tid, team), team->t.t_id, - child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Release child from barrier - kmp_flag_64 flag(&child_bar->b_go, child_thr); - flag.release(); - } - } - } -#if KMP_BARRIER_ICV_PUSH - if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs); - } -#endif - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); -} - -// Hierarchical Barrier - -// Initialize thread barrier data -/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the - minimum amount of initialization required based on how the team has changed. Returns true if - leaf children will require both on-core and traditional wake-up mechanisms. For example, if the - team size increases, threads already in the team will respond to on-core wakeup on their parent - thread, but threads newly added to the team will only be listening on the their local b_go. 
*/ -static bool -__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc, - int gtid, int tid, kmp_team_t *team) -{ - // Checks to determine if (re-)initialization is needed - bool uninitialized = thr_bar->team == NULL; - bool team_changed = team != thr_bar->team; - bool team_sz_changed = nproc != thr_bar->nproc; - bool tid_changed = tid != thr_bar->old_tid; - bool retval = false; - - if (uninitialized || team_sz_changed) { - __kmp_get_hierarchy(nproc, thr_bar); - } - - if (uninitialized || team_sz_changed || tid_changed) { - thr_bar->my_level = thr_bar->depth-1; // default for master - thr_bar->parent_tid = -1; // default for master - if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy - kmp_uint32 d=0; - while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level - kmp_uint32 rem; - if (d == thr_bar->depth-2) { // reached level right below the master - thr_bar->parent_tid = 0; - thr_bar->my_level = d; - break; - } - else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster? - // thread is not a subtree root at next level, so this is max - thr_bar->parent_tid = tid - rem; - thr_bar->my_level = d; - break; - } - ++d; - } - } - thr_bar->offset = 7-(tid-thr_bar->parent_tid-1); - thr_bar->old_tid = tid; - thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; - thr_bar->team = team; - thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; - } - if (uninitialized || team_changed || tid_changed) { - thr_bar->team = team; - thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; - retval = true; - } - if (uninitialized || team_sz_changed || tid_changed) { - thr_bar->nproc = nproc; - thr_bar->leaf_kids = thr_bar->base_leaf_kids; - if (thr_bar->my_level == 0) thr_bar->leaf_kids=0; - if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc) - thr_bar->leaf_kids = nproc - tid - 1; - thr_bar->leaf_state = 0; - for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1; - } - return retval; -} - -static void -__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, - int gtid, int tid, void (*reduce) (void *, void *) - USE_ITT_BUILD_ARG(void * itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather); + // Prefetch next thread's go count +# ifdef KMP_REVERSE_HYPER_BAR + if (child-1 >= 1 && next_child_tid < num_threads) +# else + if (child+1 < branch_factor && next_child_tid < num_threads) +# endif // KMP_REVERSE_HYPER_BAR + KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); +#endif /* KMP_CACHE_MANAGE */ + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) // push my fixed ICVs to my child + copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); +#endif // KMP_BARRIER_ICV_PUSH + + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" + "go(%p): %u => %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child from barrier + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + } + } + } +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 
&thr_bar->th_fixed_icvs); + } +#endif + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// Hierarchical Barrier + +// Initialize thread barrier data +/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the + minimum amount of initialization required based on how the team has changed. Returns true if + leaf children will require both on-core and traditional wake-up mechanisms. For example, if the + team size increases, threads already in the team will respond to on-core wakeup on their parent + thread, but threads newly added to the team will only be listening on the their local b_go. */ +static bool +__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc, + int gtid, int tid, kmp_team_t *team) +{ + // Checks to determine if (re-)initialization is needed + bool uninitialized = thr_bar->team == NULL; + bool team_changed = team != thr_bar->team; + bool team_sz_changed = nproc != thr_bar->nproc; + bool tid_changed = tid != thr_bar->old_tid; + bool retval = false; + + if (uninitialized || team_sz_changed) { + __kmp_get_hierarchy(nproc, thr_bar); + } + + if (uninitialized || team_sz_changed || tid_changed) { + thr_bar->my_level = thr_bar->depth-1; // default for master + thr_bar->parent_tid = -1; // default for master + if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy + kmp_uint32 d=0; + while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level + kmp_uint32 rem; + if (d == thr_bar->depth-2) { // reached level right below the master + thr_bar->parent_tid = 0; + thr_bar->my_level = d; + break; + } + else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster? + // thread is not a subtree root at next level, so this is max + thr_bar->parent_tid = tid - rem; + thr_bar->my_level = d; + break; + } + ++d; + } + } + thr_bar->offset = 7-(tid-thr_bar->parent_tid-1); + thr_bar->old_tid = tid; + thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; + thr_bar->team = team; + thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; + } + if (uninitialized || team_changed || tid_changed) { + thr_bar->team = team; + thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; + retval = true; + } + if (uninitialized || team_sz_changed || tid_changed) { + thr_bar->nproc = nproc; + thr_bar->leaf_kids = thr_bar->base_leaf_kids; + if (thr_bar->my_level == 0) thr_bar->leaf_kids=0; + if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc) + thr_bar->leaf_kids = nproc - tid - 1; + thr_bar->leaf_state = 0; + for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1; + } + return retval; +} + +static void +__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, + int gtid, int tid, void (*reduce) (void *, void *) + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather); kmp_team_t *team = this_thr->th.th_team; kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb; kmp_uint32 nproc = this_thr->th.th_team_nproc; kmp_info_t **other_threads = team->t.t_threads; kmp_uint64 new_state; - - int level = team->t.t_level; -#if OMP_40_ENABLED - if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct? 
- if (this_thr->th.th_teams_size.nteams > 1) - ++level; // level was not increased in teams construct for team_of_masters -#endif - if (level == 1) thr_bar->use_oncore_barrier = 1; - else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested - - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); - } -#endif - - (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team); - - if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) + + int level = team->t.t_level; +#if OMP_40_ENABLED + if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct? + if (this_thr->th.th_teams_size.nteams > 1) + ++level; // level was not increased in teams construct for team_of_masters +#endif + if (level == 1) thr_bar->use_oncore_barrier = 1; + else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested + + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); + } +#endif + + (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team); + + if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) kmp_int32 child_tid; - new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; - if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { - if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag - kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? 
thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n", - gtid, team->t.t_id, tid)); - kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - if (reduce) { - for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) { - KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data); - } - } - (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits - } - // Next, wait for higher level children on each child's b_arrived flag - for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0 - kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d]; - if (last > nproc) last = nproc; - for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { + new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { + if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag + kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n", + gtid, team->t.t_id, tid)); + kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + if (reduce) { + for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data); + } + } + (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits + } + // Next, wait for higher level children on each child's b_arrived flag + for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0 + kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d]; + if (last > nproc) last = nproc; + for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { kmp_info_t *child_thr = other_threads[child_tid]; kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " - "arrived(%p) == %llu\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - if (reduce) { - KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); - } - } - } - } - else { // Blocktime is not infinite - for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather 
lowest level threads first - kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d]; - if (last > nproc) last = nproc; - for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); + kmp_flag_64 flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + if (reduce) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + } + } + } + } + else { // Blocktime is not infinite + for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first + kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d]; + if (last > nproc) last = nproc; + for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { kmp_info_t *child_thr = other_threads[child_tid]; kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " - "arrived(%p) == %llu\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - if (reduce) { - KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); - } - } - } - } - } - // All subordinates are gathered; now release parent if not master thread - - if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " - "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid, - &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP)); - /* Mark arrival to parent: After performing this write, a worker thread may not assume that - the team is valid any more - it could be deallocated by the master thread at any time. 
*/ - if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME - || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); - flag.release(); - } - else { // Leaf does special release on the "offset" bits of parent's b_arrived flag - thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; - kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset); - flag.set_waiter(other_threads[thr_bar->parent_tid]); - flag.release(); - } - } else { // Master thread needs to update the team's b_arrived value - team->t.t_bar[bt].b_arrived = new_state; - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", - gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); - } - // Is the team access below unsafe or just technically invalid? - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); -} - -static void -__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void * itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release); + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); + kmp_flag_64 flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + if (reduce) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + } + } + } + } + } + // All subordinates are gathered; now release parent if not master thread + + if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid, + &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP)); + /* Mark arrival to parent: After performing this write, a worker thread may not assume that + the team is valid any more - it could be deallocated by the master thread at any time. 
*/ + if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME + || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it + kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); + flag.release(); + } + else { // Leaf does special release on the "offset" bits of parent's b_arrived flag + thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset); + flag.set_waiter(other_threads[thr_bar->parent_tid]); + flag.release(); + } + } else { // Master thread needs to update the team's b_arrived value + team->t.t_bar[bt].b_arrived = new_state; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + // Is the team access below unsafe or just technically invalid? + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void +__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release); kmp_team_t *team; kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; kmp_uint32 nproc; - bool team_change = false; // indicates on-core barrier shouldn't be used - - if (KMP_MASTER_TID(tid)) { - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - } - else { // Worker threads - // Wait for parent thread to release me - if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME - || thr_bar->my_level != 0 || thr_bar->team == NULL) { - // Use traditional method of waiting on my own b_go flag - thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); - flag.wait(this_thr, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time - } - else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested - // Wait on my "offset" bits on parent's b_go flag - thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; - kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset, - bt, this_thr - USE_ITT_BUILD_ARG(itt_sync_obj) ); - flag.wait(this_thr, TRUE); - if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go - TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time - } - else { // Reset my bits on parent's b_go flag - ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0; - } - } - thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; - // Early exit for reaping threads releasing forkjoin barrier - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - // The worker thread may now assume that the team is valid. - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - tid = __kmp_tid_from_gtid(gtid); - - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); - KMP_MB(); // Flush all pending memory write invalidates. 
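The "offset bits" release above, where a leaf signals arrival by touching only its own byte of the parent's 64-bit b_arrived word and the parent later clears those bytes with an atomic AND (the KMP_TEST_THEN_AND64 call in the gather path), can be sketched with plain C++11 atomics. The thread count and byte layout below are invented; this shows only the signalling shape, not the runtime's kmp_flag_oncore machinery.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    std::atomic<uint64_t> b_arrived{0};
    const int leaf_kids = 3;

    // Expected pattern once every leaf has checked in: one byte per child,
    // mirroring ((char *)&(thr_bar->leaf_state))[7-i] = 1.
    uint64_t leaf_state = 0;
    for (int i = 0; i < leaf_kids; ++i)
        ((unsigned char *)&leaf_state)[7 - i] = 1;

    std::vector<std::thread> kids;
    for (int i = 0; i < leaf_kids; ++i) {
        kids.emplace_back([&b_arrived, i] {
            uint64_t mine = 0;
            ((unsigned char *)&mine)[7 - i] = 1;   // child i owns byte offset 7-i
            b_arrived.fetch_or(mine, std::memory_order_release);
        });
    }

    // Parent: wait until every child's byte is set, then clear just those bytes,
    // analogous to KMP_TEST_THEN_AND64(&b_arrived, ~leaf_state).
    while ((b_arrived.load(std::memory_order_acquire) & leaf_state) != leaf_state)
        std::this_thread::yield();
    b_arrived.fetch_and(~leaf_state, std::memory_order_acq_rel);

    for (auto &t : kids) t.join();
    printf("all %d leaf children arrived; b_arrived now 0x%016llx\n",
           leaf_kids, (unsigned long long)b_arrived.load());
    return 0;
}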
- } - - nproc = this_thr->th.th_team_nproc; - int level = team->t.t_level; -#if OMP_40_ENABLED - if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct? - if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level) - ++level; // level was not increased in teams construct for team_of_workers - if( this_thr->th.th_teams_size.nteams > 1 ) - ++level; // level was not increased in teams construct for team_of_masters - } -#endif - if (level == 1) thr_bar->use_oncore_barrier = 1; - else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested - - // If the team size has increased, we still communicate with old leaves via oncore barrier. - unsigned short int old_leaf_kids = thr_bar->leaf_kids; - kmp_uint64 old_leaf_state = thr_bar->leaf_state; - team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team); - // But if the entire team changes, we won't use oncore barrier at all - if (team_change) old_leaf_kids = 0; - -#if KMP_BARRIER_ICV_PUSH - if (propagate_icvs) { - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); - if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy - copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); - } - else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime - if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) - // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, - &thr_bar->parent_bar->th_fixed_icvs); - // non-leaves will get ICVs piggybacked with b_go via NGO store - } - else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs - if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access - copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); - else // leaves copy parent's fixed ICVs directly to local ICV store - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, - &thr_bar->parent_bar->th_fixed_icvs); - } - } -#endif // KMP_BARRIER_ICV_PUSH - - // Now, release my children - if (thr_bar->my_level) { // not a leaf + bool team_change = false; // indicates on-core barrier shouldn't be used + + if (KMP_MASTER_TID(tid)) { + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + } + else { // Worker threads + // Wait for parent thread to release me + if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME + || thr_bar->my_level != 0 || thr_bar->team == NULL) { + // Use traditional method of waiting on my own b_go flag + thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + } + else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested + // Wait on my "offset" bits on parent's b_go flag + thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; + kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset, + bt, this_thr + USE_ITT_BUILD_ARG(itt_sync_obj) ); + flag.wait(this_thr, TRUE); + if (thr_bar->wait_flag == 
KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go + TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + } + else { // Reset my bits on parent's b_go flag + ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0; + } + } + thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + + nproc = this_thr->th.th_team_nproc; + int level = team->t.t_level; +#if OMP_40_ENABLED + if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct? + if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level) + ++level; // level was not increased in teams construct for team_of_workers + if( this_thr->th.th_teams_size.nteams > 1 ) + ++level; // level was not increased in teams construct for team_of_masters + } +#endif + if (level == 1) thr_bar->use_oncore_barrier = 1; + else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested + + // If the team size has increased, we still communicate with old leaves via oncore barrier. + unsigned short int old_leaf_kids = thr_bar->leaf_kids; + kmp_uint64 old_leaf_state = thr_bar->leaf_state; + team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team); + // But if the entire team changes, we won't use oncore barrier at all + if (team_change) old_leaf_kids = 0; + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); + if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); + } + else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime + if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) + // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->parent_bar->th_fixed_icvs); + // non-leaves will get ICVs piggybacked with b_go via NGO store + } + else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs + if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access + copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); + else // leaves copy parent's fixed ICVs directly to local ICV store + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->parent_bar->th_fixed_icvs); + } + } +#endif // KMP_BARRIER_ICV_PUSH + + // Now, release my children + if (thr_bar->my_level) { // not a leaf kmp_int32 child_tid; - kmp_uint32 last; - if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { - if (KMP_MASTER_TID(tid)) { // do a flat release - // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go. 
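The KMP_BARRIER_ICV_PUSH block above reduces to a small decision tree: who copies which ICV block, and when the copy rides along with the go flag instead. The helper below only re-encodes those branches with invented names so the four outcomes are visible side by side; it is not runtime code.

#include <cstdio>

enum icv_action {
    MASTER_PUBLISH_FIXED,   // master: task ICVs -> its own th_fixed_icvs
    LEAF_PULL_FROM_PARENT,  // leaf: parent's th_fixed_icvs -> local task ICVs
    PIGGYBACK_WITH_B_GO,    // non-leaf, infinite blocktime + oncore: ICVs arrive with the NGO b_go store
    NONLEAF_STAGE_FIXED     // non-leaf, finite blocktime: parent's fixed ICVs -> my fixed ICVs
};

static icv_action choose(bool is_master, bool is_leaf, bool oncore_inf_blocktime) {
    if (is_master) return MASTER_PUBLISH_FIXED;
    if (oncore_inf_blocktime) return is_leaf ? LEAF_PULL_FROM_PARENT : PIGGYBACK_WITH_B_GO;
    return is_leaf ? LEAF_PULL_FROM_PARENT : NONLEAF_STAGE_FIXED;
}

int main() {
    printf("master=%d leaf/oncore=%d nonleaf/oncore=%d nonleaf/finite=%d\n",
           choose(true, false, true), choose(false, true, true),
           choose(false, false, true), choose(false, false, false));
    return 0;
}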
- thr_bar->b_go = KMP_BARRIER_STATE_BUMP; - // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line - ngo_load(&thr_bar->th_fixed_icvs); - // This loops over all the threads skipping only the leaf nodes in the hierarchy - for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) { + kmp_uint32 last; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { + if (KMP_MASTER_TID(tid)) { // do a flat release + // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go. + thr_bar->b_go = KMP_BARRIER_STATE_BUMP; + // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line + ngo_load(&thr_bar->th_fixed_icvs); + // This loops over all the threads skipping only the leaf nodes in the hierarchy + for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) { kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)" - " go(%p): %u => %u\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Use ngo store (if available) to both store ICVs and release child via child's b_go - ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); - } - ngo_sync(); - } - TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time - // Now, release leaf children - if (thr_bar->leaf_kids) { // if there are any - // We test team_change on the off-chance that the level 1 team changed. - if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new - if (old_leaf_kids) { // release old leaf kids - thr_bar->b_go |= old_leaf_state; - } - // Release new leaf kids - last = tid+thr_bar->skip_per_level[1]; - if (last > nproc) last = nproc; - for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1 + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)" + " go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Use ngo store (if available) to both store ICVs and release child via child's b_go + ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); + } + ngo_sync(); + } + TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + // Now, release leaf children + if (thr_bar->leaf_kids) { // if there are any + // We test team_change on the off-chance that the level 1 team changed. 
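The flat release above works because th_fixed_icvs and b_go share one cache line, so a single line-sized store (ngo_store_go) both publishes the ICVs and flips the child's go flag in the last 8 bytes. The layout below is invented to make that idea concrete; a plain memcpy of the line stands in for the NGO store.

#include <cstdint>
#include <cstdio>
#include <cstring>

struct alignas(64) line_t {
    unsigned char icvs[56]; // stand-in for the fixed ICV block
    uint64_t b_go;          // last 8 bytes of the line: the child's go flag
};
static_assert(sizeof(line_t) == 64, "exactly one cache line");

int main() {
    line_t parent = {}, child = {};
    std::memset(parent.icvs, 0xAB, sizeof parent.icvs); // pretend these are the ICVs
    parent.b_go = 2;                                    // a bumped "go" value

    // One copy of the whole line delivers the ICVs and releases the child together.
    std::memcpy(&child, &parent, sizeof child);

    printf("child sees b_go=%llu and icvs[0]=0x%02x from the same store\n",
           (unsigned long long)child.b_go, child.icvs[0]);
    return 0;
}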
+ if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new + if (old_leaf_kids) { // release old leaf kids + thr_bar->b_go |= old_leaf_state; + } + // Release new leaf kids + last = tid+thr_bar->skip_per_level[1]; + if (last > nproc) last = nproc; + for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1 kmp_info_t *child_thr = team->t.t_threads[child_tid]; kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" - " T#%d(%d:%d) go(%p): %u => %u\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Release child using child's b_go flag - kmp_flag_64 flag(&child_bar->b_go, child_thr); - flag.release(); - } - } - else { // Release all children at once with leaf_state bits on my own b_go flag - thr_bar->b_go |= thr_bar->leaf_state; - } - } - } - else { // Blocktime is not infinite; do a simple hierarchical release - for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first - last = tid+thr_bar->skip_per_level[d+1]; - kmp_uint32 skip = thr_bar->skip_per_level[d]; - if (last > nproc) last = nproc; - for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" + " T#%d(%d:%d) go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child using child's b_go flag + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + } + } + else { // Release all children at once with leaf_state bits on my own b_go flag + thr_bar->b_go |= thr_bar->leaf_state; + } + } + } + else { // Blocktime is not infinite; do a simple hierarchical release + for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first + last = tid+thr_bar->skip_per_level[d+1]; + kmp_uint32 skip = thr_bar->skip_per_level[d]; + if (last > nproc) last = nproc; + for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { kmp_info_t *child_thr = team->t.t_threads[child_tid]; kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)" - " go(%p): %u => %u\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Release child using child's b_go flag - kmp_flag_64 flag(&child_bar->b_go, child_thr); - flag.release(); - } - } - } -#if KMP_BARRIER_ICV_PUSH - if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs); -#endif // KMP_BARRIER_ICV_PUSH - } - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); -} - -// ---------------------------- End of Barrier Algorithms ---------------------------- - -// Internal function to do a barrier. -/* If is_split is true, do a split barrier, otherwise, do a plain barrier - If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier - Returns 0 if master thread, 1 if worker thread. 
*/ -int -__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size, - void *reduce_data, void (*reduce)(void *, void *)) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_barrier); + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)" + " go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child using child's b_go flag + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + } + } + } +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs); +#endif // KMP_BARRIER_ICV_PUSH + } + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// ---------------------------- End of Barrier Algorithms ---------------------------- + +// Internal function to do a barrier. +/* If is_split is true, do a split barrier, otherwise, do a plain barrier + If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier + Returns 0 if master thread, 1 if worker thread. */ +int +__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size, + void *reduce_data, void (*reduce)(void *, void *)) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_barrier); int tid = __kmp_tid_from_gtid(gtid); kmp_info_t *this_thr = __kmp_threads[gtid]; kmp_team_t *team = this_thr->th.th_team; int status = 0; - ident_t *loc = __kmp_threads[gtid]->th.th_ident; -#if OMPT_SUPPORT - ompt_task_id_t my_task_id; - ompt_parallel_id_t my_parallel_id; -#endif - - KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", - gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); - -#if OMPT_SUPPORT - if (ompt_enabled) { -#if OMPT_BLAME - my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id; - my_parallel_id = team->t.ompt_team_info.parallel_id; - -#if OMPT_TRACE - if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) { - if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) { - ompt_callbacks.ompt_callback(ompt_event_single_others_end)( - my_parallel_id, my_task_id); - } - } -#endif - if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) { - ompt_callbacks.ompt_callback(ompt_event_barrier_begin)( - my_parallel_id, my_task_id); - } -#endif - // It is OK to report the barrier state after the barrier begin callback. - // According to the OMPT specification, a compliant implementation may - // even delay reporting this state until the barrier begins to wait. - this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; - } -#endif - - if (! team->t.t_serialized) { -#if USE_ITT_BUILD - // This value will be used in itt notify events below. - void *itt_sync_obj = NULL; -# if USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); -# endif -#endif /* USE_ITT_BUILD */ - if (__kmp_tasking_mode == tskm_extra_barrier) { - __kmp_tasking_barrier(team, this_thr, gtid); - KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", - gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); - } - - /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when - the team struct is not guaranteed to exist. 
*/ - // See note about the corresponding code in __kmp_join_barrier() being performance-critical. - if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { - this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; - this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; - } - -#if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_starting(gtid, itt_sync_obj); -#endif /* USE_ITT_BUILD */ -#if USE_DEBUGGER - // Let the debugger know: the thread arrived to the barrier and waiting. - if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure. - team->t.t_bar[bt].b_master_arrived += 1; - } else { - this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; - } // if -#endif /* USE_DEBUGGER */ - if (reduce != NULL) { - //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 - this_thr->th.th_local.reduce_data = reduce_data; - } - - if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) - __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1 - - switch (__kmp_barrier_gather_pattern[bt]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear - __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG(itt_sync_obj)); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear - __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - default: { - __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } - } - - KMP_MB(); - - if (KMP_MASTER_TID(tid)) { - status = 0; - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_wait(this_thr, team - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } -#if USE_DEBUGGER - // Let the debugger know: All threads are arrived and starting leaving the barrier. - team->t.t_bar[bt].b_team_arrived += 1; -#endif - -#if USE_ITT_BUILD - /* TODO: In case of split reduction barrier, master thread may send acquired event early, - before the final summation into the shared variable is done (final summation can be a - long operation for array reductions). 
*/ - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_middle(gtid, itt_sync_obj); -#endif /* USE_ITT_BUILD */ -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier - report frame end (only if active_level == 1) - if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode && -#if OMP_40_ENABLED - this_thr->th.th_teams_microtask == NULL && -#endif - team->t.t_active_level == 1) - { - kmp_uint64 cur_time = __itt_get_timestamp(); - kmp_info_t **other_threads = team->t.t_threads; - int nproc = this_thr->th.th_team_nproc; - int i; - switch(__kmp_forkjoin_frames_mode) { - case 1: - __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); - this_thr->th.th_frame_time = cur_time; - break; - case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed) - __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc); - break; - case 3: - if( __itt_metadata_add_ptr ) { - // Initialize with master's wait time - kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; - for (i=1; i<nproc; ++i) { - delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); - } - __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL)); - } - __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); - this_thr->th.th_frame_time = cur_time; - break; - } - } -#endif /* USE_ITT_BUILD */ - } else { - status = 1; -#if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_middle(gtid, itt_sync_obj); -#endif /* USE_ITT_BUILD */ - } - if (status == 1 || ! is_split) { - switch (__kmp_barrier_release_pattern[bt]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); - __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); - __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - default: { - __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } - } - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_sync(this_thr, team); - } - } - -#if USE_ITT_BUILD - /* GEH: TODO: Move this under if-condition above and also include in - __kmp_end_split_barrier(). This will more accurately represent the actual release time - of the threads for split barriers. */ - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_finished(gtid, itt_sync_obj); -#endif /* USE_ITT_BUILD */ - } else { // Team is serialized. 
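The split-barrier contract visible above (gather now, give the master status 0 so it can act on the reduced data, defer the release, which __kmp_end_split_barrier later performs) can be modelled with a toy barrier. Everything below is invented shorthand for that contract, not the runtime's flag/wait machinery.

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct SplitBarrier {
    std::mutex m;
    std::condition_variable cv;
    int nproc, arrived = 0;
    bool released = false;
    long sum = 0;                              // reduction target

    explicit SplitBarrier(int n) : nproc(n) {}

    // Gather half: returns 0 for the designated master, 1 for workers.
    int gather(int tid, long partial) {
        std::unique_lock<std::mutex> lk(m);
        sum += partial;
        if (++arrived == nproc) cv.notify_all();
        if (tid == 0) {                        // master only waits for everyone to arrive
            cv.wait(lk, [&] { return arrived == nproc; });
            return 0;                          // may use 'sum' before releasing the team
        }
        cv.wait(lk, [&] { return released; }); // workers block until the deferred release
        return 1;
    }

    // Release half: the __kmp_end_split_barrier analogue.
    void release() {
        std::lock_guard<std::mutex> lk(m);
        released = true;
        cv.notify_all();
    }
};

int main() {
    const int n = 4;
    SplitBarrier bar(n);
    std::vector<std::thread> pool;
    for (int tid = 0; tid < n; ++tid) {
        pool.emplace_back([&, tid] {
            long partial = (tid + 1) * 10;     // pretend per-thread partial result
            if (bar.gather(tid, partial) == 0) {
                printf("master sees reduced sum = %ld before the release\n", bar.sum);
                bar.release();
            }
        });
    }
    for (auto &t : pool) t.join();
    return 0;
}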
- status = 0; - if (__kmp_tasking_mode != tskm_immediate_exec) { -#if OMP_41_ENABLED - if ( this_thr->th.th_task_team != NULL ) { - void *itt_sync_obj = NULL; -#if USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { - itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); - __kmp_itt_barrier_starting(gtid, itt_sync_obj); - } -#endif - - KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE); - __kmp_task_team_wait(this_thr, team - USE_ITT_BUILD_ARG(itt_sync_obj)); - __kmp_task_team_setup(this_thr, team, 0); - -#if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_finished(gtid, itt_sync_obj); -#endif /* USE_ITT_BUILD */ - } -#else - // The task team should be NULL for serialized code (tasks will be executed immediately) - KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL); - KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL); -#endif - } - } - KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", - gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status)); - -#if OMPT_SUPPORT - if (ompt_enabled) { -#if OMPT_BLAME - if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) { - ompt_callbacks.ompt_callback(ompt_event_barrier_end)( - my_parallel_id, my_task_id); - } -#endif - this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; - } -#endif - - return status; -} - - -void -__kmp_end_split_barrier(enum barrier_type bt, int gtid) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier); - int tid = __kmp_tid_from_gtid(gtid); - kmp_info_t *this_thr = __kmp_threads[gtid]; - kmp_team_t *team = this_thr->th.th_team; - - if (!team->t.t_serialized) { - if (KMP_MASTER_GTID(gtid)) { - switch (__kmp_barrier_release_pattern[bt]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); - __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(NULL) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(NULL)); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); - __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(NULL) ); - break; - } - default: { - __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(NULL) ); - } - } - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_sync(this_thr, team); - } // if - } - } -} - - -void -__kmp_join_barrier(int gtid) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier); + ident_t *loc = __kmp_threads[gtid]->th.th_ident; +#if OMPT_SUPPORT + ompt_task_id_t my_task_id; + ompt_parallel_id_t my_parallel_id; +#endif + + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", + gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); + +#if OMPT_SUPPORT + if (ompt_enabled) { +#if OMPT_BLAME + my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id; + my_parallel_id = team->t.ompt_team_info.parallel_id; + +#if OMPT_TRACE + if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) { + if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) { + ompt_callbacks.ompt_callback(ompt_event_single_others_end)( + my_parallel_id, my_task_id); + } + } +#endif + if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) { + ompt_callbacks.ompt_callback(ompt_event_barrier_begin)( + my_parallel_id, my_task_id); + } +#endif + // It is OK to report the barrier state after the barrier begin 
callback. + // According to the OMPT specification, a compliant implementation may + // even delay reporting this state until the barrier begins to wait. + this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; + } +#endif + + if (! team->t.t_serialized) { +#if USE_ITT_BUILD + // This value will be used in itt notify events below. + void *itt_sync_obj = NULL; +# if USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); +# endif +#endif /* USE_ITT_BUILD */ + if (__kmp_tasking_mode == tskm_extra_barrier) { + __kmp_tasking_barrier(team, this_thr, gtid); + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", + gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); + } + + /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when + the team struct is not guaranteed to exist. */ + // See note about the corresponding code in __kmp_join_barrier() being performance-critical. + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; + } + +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_starting(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ +#if USE_DEBUGGER + // Let the debugger know: the thread arrived to the barrier and waiting. + if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure. + team->t.t_bar[bt].b_master_arrived += 1; + } else { + this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; + } // if +#endif /* USE_DEBUGGER */ + if (reduce != NULL) { + //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 + this_thr->th.th_local.reduce_data = reduce_data; + } + + if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) + __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1 + + switch (__kmp_barrier_gather_pattern[bt]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear + __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce + USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear + __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + default: { + __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } + } + + KMP_MB(); + + if (KMP_MASTER_TID(tid)) { + status = 0; + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_wait(this_thr, team + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } +#if USE_DEBUGGER + // Let the debugger know: All threads are arrived and starting leaving the barrier. + team->t.t_bar[bt].b_team_arrived += 1; +#endif + +#if USE_ITT_BUILD + /* TODO: In case of split reduction barrier, master thread may send acquired event early, + before the final summation into the shared variable is done (final summation can be a + long operation for array reductions). 
*/ + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier - report frame end (only if active_level == 1) + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode && +#if OMP_40_ENABLED + this_thr->th.th_teams_microtask == NULL && +#endif + team->t.t_active_level == 1) + { + kmp_uint64 cur_time = __itt_get_timestamp(); + kmp_info_t **other_threads = team->t.t_threads; + int nproc = this_thr->th.th_team_nproc; + int i; + switch(__kmp_forkjoin_frames_mode) { + case 1: + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed) + __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc); + break; + case 3: + if( __itt_metadata_add_ptr ) { + // Initialize with master's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; + for (i=1; i<nproc; ++i) { + delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); + } + __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL)); + } + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + } + } +#endif /* USE_ITT_BUILD */ + } else { + status = 1; +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + } + if (status == 1 || ! is_split) { + switch (__kmp_barrier_release_pattern[bt]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + default: { + __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } + } + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } + } + +#if USE_ITT_BUILD + /* GEH: TODO: Move this under if-condition above and also include in + __kmp_end_split_barrier(). This will more accurately represent the actual release time + of the threads for split barriers. */ + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + } else { // Team is serialized. 
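For frames mode 3 above, the reported imbalance is just the team-wide sum of the master's end-of-gather timestamp minus each thread's arrival timestamp, i.e. the total time the team spent waiting at the barrier. A worked example with made-up tick values:

#include <cstdint>
#include <cstdio>

int main() {
    const int nproc = 4;
    uint64_t arrive[nproc] = {100, 120, 180, 250}; // th_bar_arrive_time per thread
    uint64_t cur_time = 260;                       // timestamp taken by the master

    uint64_t delta = cur_time - arrive[0];         // start with the master's wait
    for (int i = 1; i < nproc; ++i)
        delta += cur_time - arrive[i];
    printf("aggregate wait (imbalance) = %llu ticks\n", (unsigned long long)delta);
    // Here: (260-100)+(260-120)+(260-180)+(260-250) = 160+140+80+10 = 390.
    return 0;
}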
+ status = 0; + if (__kmp_tasking_mode != tskm_immediate_exec) { +#if OMP_41_ENABLED + if ( this_thr->th.th_task_team != NULL ) { + void *itt_sync_obj = NULL; +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); + __kmp_itt_barrier_starting(gtid, itt_sync_obj); + } +#endif + + KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE); + __kmp_task_team_wait(this_thr, team + USE_ITT_BUILD_ARG(itt_sync_obj)); + __kmp_task_team_setup(this_thr, team, 0); + +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + } +#else + // The task team should be NULL for serialized code (tasks will be executed immediately) + KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL); + KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL); +#endif + } + } + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", + gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status)); + +#if OMPT_SUPPORT + if (ompt_enabled) { +#if OMPT_BLAME + if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) { + ompt_callbacks.ompt_callback(ompt_event_barrier_end)( + my_parallel_id, my_task_id); + } +#endif + this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; + } +#endif + + return status; +} + + +void +__kmp_end_split_barrier(enum barrier_type bt, int gtid) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier); + int tid = __kmp_tid_from_gtid(gtid); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; + + if (!team->t.t_serialized) { + if (KMP_MASTER_GTID(gtid)) { + switch (__kmp_barrier_release_pattern[bt]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(NULL) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(NULL)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(NULL) ); + break; + } + default: { + __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(NULL) ); + } + } + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } // if + } + } +} + + +void +__kmp_join_barrier(int gtid) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier); kmp_info_t *this_thr = __kmp_threads[gtid]; kmp_team_t *team; kmp_uint nproc; - kmp_info_t *master_thread; - int tid; -#ifdef KMP_DEBUG - int team_id; -#endif /* KMP_DEBUG */ -#if USE_ITT_BUILD - void *itt_sync_obj = NULL; -# if USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need - // Get object created at fork_barrier - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); -# endif -#endif /* USE_ITT_BUILD */ - KMP_MB(); - - // Get current info - team = this_thr->th.th_team; - nproc = this_thr->th.th_team_nproc; - KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); - tid = __kmp_tid_from_gtid(gtid); -#ifdef KMP_DEBUG - team_id = team->t.t_id; -#endif /* KMP_DEBUG */ - master_thread = this_thr->th.th_team_master; -#ifdef KMP_DEBUG - if (master_thread != team->t.t_threads[0]) { - __kmp_print_structure(); - } -#endif /* KMP_DEBUG */ - KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); - 
KMP_MB(); - - // Verify state - KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); - KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); - KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); - KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); - KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid)); - -#if OMPT_SUPPORT -#if OMPT_TRACE - if (ompt_enabled && - ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) { - ompt_callbacks.ompt_callback(ompt_event_barrier_begin)( - team->t.ompt_team_info.parallel_id, - team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); - } -#endif - this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; -#endif - - if (__kmp_tasking_mode == tskm_extra_barrier) { - __kmp_tasking_barrier(team, this_thr, gtid); - KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid)); - } -# ifdef KMP_DEBUG - if (__kmp_tasking_mode != tskm_immediate_exec) { - KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n", - __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state], - this_thr->th.th_task_team)); - KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]); - } -# endif /* KMP_DEBUG */ - - /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the - team struct is not guaranteed to exist. Doing these loads causes a cache miss slows - down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite, - since the values are not used by __kmp_wait_template() in that case. */ - if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { - this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; - this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; - } - -#if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_starting(gtid, itt_sync_obj); -#endif /* USE_ITT_BUILD */ - - switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); - __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); - __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - default: { - __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } - } - - /* From this point on, the team data structure may be deallocated at any time by the - master thread - it is unsafe to reference it in any of the worker threads. Any per-team - data items that need to be referenced before the end of the barrier should be moved to - the kmp_task_team_t structs. */ - if (KMP_MASTER_TID(tid)) { - if (__kmp_tasking_mode != tskm_immediate_exec) { - // Master shouldn't call decrease_load(). // TODO: enable master threads. - // Master should have th_may_decrease_load == 0. // TODO: enable master threads. 
- __kmp_task_team_wait(this_thr, team - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } -#if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_middle(gtid, itt_sync_obj); -#endif /* USE_ITT_BUILD */ - -# if USE_ITT_BUILD && USE_ITT_NOTIFY - // Join barrier - report frame end - if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode && -#if OMP_40_ENABLED - this_thr->th.th_teams_microtask == NULL && -#endif - team->t.t_active_level == 1) - { - kmp_uint64 cur_time = __itt_get_timestamp(); - ident_t * loc = team->t.t_ident; - kmp_info_t **other_threads = team->t.t_threads; - int nproc = this_thr->th.th_team_nproc; - int i; - switch(__kmp_forkjoin_frames_mode) { - case 1: - __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); - break; - case 2: - __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc); - break; - case 3: - if( __itt_metadata_add_ptr ) { - // Initialize with master's wait time - kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; - for (i=1; i<nproc; ++i) { - delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); - } - __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0); - } - __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); - this_thr->th.th_frame_time = cur_time; - break; - } - } -# endif /* USE_ITT_BUILD */ - } -#if USE_ITT_BUILD - else { - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_middle(gtid, itt_sync_obj); - } -#endif /* USE_ITT_BUILD */ - -#if KMP_DEBUG - if (KMP_MASTER_TID(tid)) { - KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", - gtid, team_id, tid, nproc)); - } -#endif /* KMP_DEBUG */ - - // TODO now, mark worker threads as done so they may be disbanded - KMP_MB(); // Flush all pending memory write invalidates. 
- KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); - + kmp_info_t *master_thread; + int tid; +#ifdef KMP_DEBUG + int team_id; +#endif /* KMP_DEBUG */ +#if USE_ITT_BUILD + void *itt_sync_obj = NULL; +# if USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need + // Get object created at fork_barrier + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); +# endif +#endif /* USE_ITT_BUILD */ + KMP_MB(); + + // Get current info + team = this_thr->th.th_team; + nproc = this_thr->th.th_team_nproc; + KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); + tid = __kmp_tid_from_gtid(gtid); +#ifdef KMP_DEBUG + team_id = team->t.t_id; +#endif /* KMP_DEBUG */ + master_thread = this_thr->th.th_team_master; +#ifdef KMP_DEBUG + if (master_thread != team->t.t_threads[0]) { + __kmp_print_structure(); + } +#endif /* KMP_DEBUG */ + KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); + KMP_MB(); + + // Verify state + KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); + KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); + KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); + KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid)); + #if OMPT_SUPPORT - if (ompt_enabled) { -#if OMPT_BLAME - if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) { - ompt_callbacks.ompt_callback(ompt_event_barrier_end)( - team->t.ompt_team_info.parallel_id, - team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); - } -#endif - - // return to default state - this_thr->th.ompt_thread_info.state = ompt_state_overhead; - } -#endif -} - - -// TODO release worker threads' fork barriers as we are ready instead of all at once -void -__kmp_fork_barrier(int gtid, int tid) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier); - kmp_info_t *this_thr = __kmp_threads[gtid]; - kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; -#if USE_ITT_BUILD - void * itt_sync_obj = NULL; -#endif /* USE_ITT_BUILD */ - - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", - gtid, (team != NULL) ? 
team->t.t_id : -1, tid)); - - // th_team pointer only valid for master thread here - if (KMP_MASTER_TID(tid)) { -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { - // Create itt barrier object - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); - __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing - } -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - -#ifdef KMP_DEBUG +#if OMPT_TRACE + if (ompt_enabled && + ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) { + ompt_callbacks.ompt_callback(ompt_event_barrier_begin)( + team->t.ompt_team_info.parallel_id, + team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); + } +#endif + this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; +#endif + + if (__kmp_tasking_mode == tskm_extra_barrier) { + __kmp_tasking_barrier(team, this_thr, gtid); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid)); + } +# ifdef KMP_DEBUG + if (__kmp_tasking_mode != tskm_immediate_exec) { + KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n", + __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state], + this_thr->th.th_task_team)); + KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]); + } +# endif /* KMP_DEBUG */ + + /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the + team struct is not guaranteed to exist. Doing these loads causes a cache miss slows + down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite, + since the values are not used by __kmp_wait_template() in that case. */ + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; + } + +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_starting(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + + switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); + __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); + __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + default: { + __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } + } + + /* From this point on, the team data structure may be deallocated at any time by the + master thread - it is unsafe to reference it in any of the worker threads. Any per-team + data items that need to be referenced before the end of the barrier should be moved to + the kmp_task_team_t structs. */ + if (KMP_MASTER_TID(tid)) { + if (__kmp_tasking_mode != tskm_immediate_exec) { + // Master shouldn't call decrease_load(). // TODO: enable master threads. + // Master should have th_may_decrease_load == 0. // TODO: enable master threads. 
+ __kmp_task_team_wait(this_thr, team + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + +# if USE_ITT_BUILD && USE_ITT_NOTIFY + // Join barrier - report frame end + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode && +#if OMP_40_ENABLED + this_thr->th.th_teams_microtask == NULL && +#endif + team->t.t_active_level == 1) + { + kmp_uint64 cur_time = __itt_get_timestamp(); + ident_t * loc = team->t.t_ident; + kmp_info_t **other_threads = team->t.t_threads; + int nproc = this_thr->th.th_team_nproc; + int i; + switch(__kmp_forkjoin_frames_mode) { + case 1: + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); + break; + case 2: + __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc); + break; + case 3: + if( __itt_metadata_add_ptr ) { + // Initialize with master's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; + for (i=1; i<nproc; ++i) { + delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); + } + __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0); + } + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + } + } +# endif /* USE_ITT_BUILD */ + } +#if USE_ITT_BUILD + else { + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); + } +#endif /* USE_ITT_BUILD */ + +#if KMP_DEBUG + if (KMP_MASTER_TID(tid)) { + KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", + gtid, team_id, tid, nproc)); + } +#endif /* KMP_DEBUG */ + + // TODO now, mark worker threads as done so they may be disbanded + KMP_MB(); // Flush all pending memory write invalidates. + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); + +#if OMPT_SUPPORT + if (ompt_enabled) { +#if OMPT_BLAME + if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) { + ompt_callbacks.ompt_callback(ompt_event_barrier_end)( + team->t.ompt_team_info.parallel_id, + team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); + } +#endif + + // return to default state + this_thr->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif +} + + +// TODO release worker threads' fork barriers as we are ready instead of all at once +void +__kmp_fork_barrier(int gtid, int tid) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; +#if USE_ITT_BUILD + void * itt_sync_obj = NULL; +#endif /* USE_ITT_BUILD */ + + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", + gtid, (team != NULL) ? 
team->t.t_id : -1, tid)); + + // th_team pointer only valid for master thread here + if (KMP_MASTER_TID(tid)) { +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + // Create itt barrier object + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); + __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + +#ifdef KMP_DEBUG kmp_info_t **other_threads = team->t.t_threads; int i; - - // Verify state - KMP_MB(); - - for(i=1; i<team->t.t_nproc; ++i) { - KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n", - gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, - team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, - other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); - KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) - & ~(KMP_BARRIER_SLEEP_STATE)) - == KMP_INIT_BARRIER_STATE); - KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); - } -#endif - - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_setup(this_thr, team, 0); // 0 indicates setup current task team if nthreads > 1 - } - - /* The master thread may have changed its blocktime between the join barrier and the - fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can - access it when the team struct is not guaranteed to exist. */ - // See note about the corresponding code in __kmp_join_barrier() being performance-critical - if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { - this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; - this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; - } - } // master - - switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); - __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); - __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - default: { - __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } - } - - // Early exit for reaping threads releasing forkjoin barrier - if (TCR_4(__kmp_global.g.g_done)) { - this_thr->th.th_task_team = NULL; - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { - if (!KMP_MASTER_TID(tid)) { - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - if (itt_sync_obj) - __kmp_itt_barrier_finished(gtid, itt_sync_obj); - } - } -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); - return; - } - - /* We can now assume that a valid team structure has been allocated by the master and - propagated to all worker threads. The current thread, however, may not be part of the - team, so we can't blindly assume that the team pointer is non-null. 
*/ - team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); - KMP_DEBUG_ASSERT(team != NULL); - tid = __kmp_tid_from_gtid(gtid); - - -#if KMP_BARRIER_ICV_PULL - /* Master thread's copy of the ICVs was set up on the implicit taskdata in - __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has - this data before this function is called. We cannot modify __kmp_fork_call() to look at - the fixed ICVs in the master's thread struct, because it is not always the case that the - threads arrays have been allocated when __kmp_fork_call() is executed. */ - { - KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy); - if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs - // Copy the initial ICVs from the master's thread struct to the implicit task for this tid. - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, - &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs); - } - } -#endif // KMP_BARRIER_ICV_PULL - - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_sync(this_thr, team); - } - -#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED - kmp_proc_bind_t proc_bind = team->t.t_proc_bind; - if (proc_bind == proc_bind_intel) { -#endif -#if KMP_AFFINITY_SUPPORTED - // Call dynamic affinity settings - if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) { - __kmp_balanced_affinity(tid, team->t.t_nproc); - } -#endif // KMP_AFFINITY_SUPPORTED -#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED - } - else if (proc_bind != proc_bind_false) { - if (this_thr->th.th_new_place == this_thr->th.th_current_place) { - KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", - __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place)); - } - else { - __kmp_affinity_set_place(gtid); - } - } -#endif - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { - if (!KMP_MASTER_TID(tid)) { - // Get correct barrier object - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired - } // (prepare called inside barrier_release) - } -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid)); -} - - -void -__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc ) -{ - KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy); - - KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); - KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); - - /* Master thread's copy of the ICVs was set up on the implicit taskdata in - __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has - this data before this function is called. */ -#if KMP_BARRIER_ICV_PULL - /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where - all of the worker threads can access them and make their own copies after the barrier. 
*/ - KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point - copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs); - KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", - 0, team->t.t_threads[0], team)); -#elif KMP_BARRIER_ICV_PUSH - // The ICVs will be propagated in the fork barrier, so nothing needs to be done here. - KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", - 0, team->t.t_threads[0], team)); -#else - // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time. - ngo_load(new_icvs); - KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point - for (int f=1; f<new_nproc; ++f) { // Skip the master thread - // TODO: GEH - pass in better source location info since usually NULL here - KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", - f, team->t.t_threads[f], team)); - __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); - ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); - KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", - f, team->t.t_threads[f], team)); - } - ngo_sync(); -#endif // KMP_BARRIER_ICV_PULL -} + + // Verify state + KMP_MB(); + + for(i=1; i<team->t.t_nproc; ++i) { + KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n", + gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, + team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, + other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); + KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) + & ~(KMP_BARRIER_SLEEP_STATE)) + == KMP_INIT_BARRIER_STATE); + KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); + } +#endif + + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_setup(this_thr, team, 0); // 0 indicates setup current task team if nthreads > 1 + } + + /* The master thread may have changed its blocktime between the join barrier and the + fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can + access it when the team struct is not guaranteed to exist. 
*/ + // See note about the corresponding code in __kmp_join_barrier() being performance-critical + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; + } + } // master + + switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); + __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); + __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + default: { + __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } + } + + // Early exit for reaping threads releasing forkjoin barrier + if (TCR_4(__kmp_global.g.g_done)) { + this_thr->th.th_task_team = NULL; + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + if (!KMP_MASTER_TID(tid)) { + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); + } + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); + return; + } + + /* We can now assume that a valid team structure has been allocated by the master and + propagated to all worker threads. The current thread, however, may not be part of the + team, so we can't blindly assume that the team pointer is non-null. */ + team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + +#if KMP_BARRIER_ICV_PULL + /* Master thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has + this data before this function is called. We cannot modify __kmp_fork_call() to look at + the fixed ICVs in the master's thread struct, because it is not always the case that the + threads arrays have been allocated when __kmp_fork_call() is executed. */ + { + KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy); + if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs + // Copy the initial ICVs from the master's thread struct to the implicit task for this tid. 
+ KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs); + } + } +#endif // KMP_BARRIER_ICV_PULL + + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } + +#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED + kmp_proc_bind_t proc_bind = team->t.t_proc_bind; + if (proc_bind == proc_bind_intel) { +#endif +#if KMP_AFFINITY_SUPPORTED + // Call dynamic affinity settings + if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) { + __kmp_balanced_affinity(tid, team->t.t_nproc); + } +#endif // KMP_AFFINITY_SUPPORTED +#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED + } + else if (proc_bind != proc_bind_false) { + if (this_thr->th.th_new_place == this_thr->th.th_current_place) { + KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", + __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place)); + } + else { + __kmp_affinity_set_place(gtid); + } + } +#endif + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + if (!KMP_MASTER_TID(tid)) { + // Get correct barrier object + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired + } // (prepare called inside barrier_release) + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid)); +} + + +void +__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc ) +{ + KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy); + + KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); + KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); + + /* Master thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has + this data before this function is called. */ +#if KMP_BARRIER_ICV_PULL + /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where + all of the worker threads can access them and make their own copies after the barrier. */ + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point + copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs); + KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", + 0, team->t.t_threads[0], team)); +#elif KMP_BARRIER_ICV_PUSH + // The ICVs will be propagated in the fork barrier, so nothing needs to be done here. + KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", + 0, team->t.t_threads[0], team)); +#else + // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time. 
+ ngo_load(new_icvs);
+ KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
+ for (int f=1; f<new_nproc; ++f) { // Skip the master thread
+ // TODO: GEH - pass in better source location info since usually NULL here
+ KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
+ f, team->t.t_threads[f], team));
+ __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
+ ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
+ KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
+ f, team->t.t_threads[f], team));
+ }
+ ngo_sync();
+#endif // KMP_BARRIER_ICV_PULL
+}
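
The three compile-time branches of __kmp_setup_icv_copy() above implement different ICV propagation strategies: under KMP_BARRIER_ICV_PULL the master publishes a single copy into its own th_fixed_icvs and each worker copies it for itself once the fork barrier releases it (the copy_icvs call in __kmp_fork_barrier() above); under KMP_BARRIER_ICV_PUSH the values travel with the fork-barrier release itself; otherwise the master writes every worker's implicit task up front, an O(nthreads) loop. Below is a minimal, self-contained sketch of the pull-versus-linear trade-off only; the icvs_t/thread_t types and function names are hypothetical stand-ins, not the real kmp_internal_control_t or kmp_info_t layout.

// Minimal sketch of the ICV propagation strategies described above (PULL vs. the
// O(nthreads) linear copy). All types and names here are hypothetical stand-ins,
// not the actual OpenMP runtime data structures.
#include <cstdio>
#include <vector>

struct icvs_t { int nproc; int blocktime; };

struct thread_t {
    icvs_t fixed_icvs;     // master publishes here under the PULL scheme
    icvs_t implicit_icvs;  // per-thread implicit-task copy
};

// PULL: the master stores one copy in its own thread struct; each worker copies
// it for itself after being released from the fork barrier.
static void setup_icv_copy_pull(std::vector<thread_t>& team, const icvs_t& new_icvs) {
    team[0].fixed_icvs = new_icvs;   // O(1) work for the master at setup time
}
static void worker_pull(std::vector<thread_t>& team, int tid) {
    team[tid].implicit_icvs = team[0].fixed_icvs;  // done by the worker itself
}

// LINEAR: the master writes every worker's implicit-task copy up front, O(nthreads).
static void setup_icv_copy_linear(std::vector<thread_t>& team, const icvs_t& new_icvs) {
    for (size_t f = 1; f < team.size(); ++f)
        team[f].implicit_icvs = new_icvs;
}

int main() {
    std::vector<thread_t> team(4);
    icvs_t icvs{4, 200};

    setup_icv_copy_pull(team, icvs);
    for (int tid = 1; tid < 4; ++tid) worker_pull(team, tid);
    std::printf("PULL:   T#1 blocktime=%d\n", team[1].implicit_icvs.blocktime);

    setup_icv_copy_linear(team, icvs);
    std::printf("LINEAR: T#3 nproc=%d\n", team[3].implicit_icvs.nproc);
    return 0;
}

Under the pull scheme the master does constant work here and the per-thread copies happen after release, which is why the KMP_BARRIER_ICV_PULL branch above needs only the single copy_icvs into th_fixed_icvs, while the fallback branch loops over all non-master threads.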