| author | thegeorg <thegeorg@yandex-team.ru> | 2022-06-03 10:53:07 +0300 |
|---|---|---|
| committer | thegeorg <thegeorg@yandex-team.ru> | 2022-06-03 10:53:07 +0300 |
| commit | a1d4361e379e2c72a469ad1bd64569cbc2db131f (patch) | |
| tree | 0caddb240a10132376e4653a31578e117d33f9fd /contrib/libs/cxxsupp/openmp/kmp_tasking.cpp | |
| parent | 41f55a521834080d9d703c099c0418cfff3a0546 (diff) | |
| download | ydb-a1d4361e379e2c72a469ad1bd64569cbc2db131f.tar.gz | |
Update contrib/libs/cxxsupp/openmp to 14.0.4
ref:77c6cdda99b217d50c4deadca11f5611fa0dc168
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_tasking.cpp')
-rw-r--r-- | contrib/libs/cxxsupp/openmp/kmp_tasking.cpp | 212 |
1 file changed, 129 insertions, 83 deletions
```diff
diff --git a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp
index 55e9c30763..e445438524 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp
@@ -324,10 +324,16 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
 
-  // We don't need to map to shadow gtid if it is already hidden helper thread
-  if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
-    gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
-    thread = __kmp_threads[gtid];
+  // If we encounter a hidden helper task, and the current thread is not a
+  // hidden helper thread, we have to give the task to any hidden helper thread
+  // starting from its shadow one.
+  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
+               !KMP_HIDDEN_HELPER_THREAD(gtid))) {
+    kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
+    __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
+    // Signal the hidden helper threads.
+    __kmp_hidden_helper_worker_thread_signal();
+    return TASK_SUCCESSFULLY_PUSHED;
   }
 
   kmp_task_team_t *task_team = thread->th.th_task_team;
@@ -434,16 +440,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
                 gtid, taskdata, thread_data->td.td_deque_ntasks,
                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
 
-  auto hidden_helper = taskdata->td_flags.hidden_helper;
-
   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
 
-  // Signal one worker thread to execute the task
-  if (UNLIKELY(hidden_helper)) {
-    // Wake hidden helper threads up if they're sleeping
-    __kmp_hidden_helper_worker_thread_signal();
-  }
-
   return TASK_SUCCESSFULLY_PUSHED;
 }
 
@@ -809,6 +807,24 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                 gtid, taskdata, children));
 }
 
+// Only need to keep track of child task counts if any of the following:
+// 1. team parallel and tasking not serialized;
+// 2. it is a proxy or detachable or hidden helper task
+// 3. the children counter of its parent task is greater than 0.
+// The reason for the 3rd one is for serialized team that found detached task,
+// hidden helper task, T. In this case, the execution of T is still deferred,
+// and it is also possible that a regular task depends on T. In this case, if we
+// don't track the children, task synchronization will be broken.
+static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
+  kmp_tasking_flags_t flags = taskdata->td_flags;
+  bool ret = !(flags.team_serial || flags.tasking_ser);
+  ret = ret || flags.proxy == TASK_PROXY ||
+        flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
+  ret = ret ||
+        KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
+  return ret;
+}
+
 // __kmp_task_finish: bookkeeping to do when a task finishes execution
 //
 // gtid: global thread ID for calling thread
@@ -825,8 +841,9 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_task_team_t *task_team =
       thread->th.th_task_team; // might be NULL for serial teams...
+#if KMP_DEBUG
   kmp_int32 children = 0;
-
+#endif
   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                 "task %p\n",
                 gtid, taskdata, resumed_task));
@@ -934,16 +951,15 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
     if (ompt)
       __ompt_task_finish(task, resumed_task, ompt_task_complete);
 #endif
-
-    // Only need to keep track of count if team parallel and tasking not
-    // serialized, or task is detachable and event has already been fulfilled
-    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
-        taskdata->td_flags.detachable == TASK_DETACHABLE ||
-        taskdata->td_flags.hidden_helper) {
+    // TODO: What would be the balance between the conditions in the function
+    // and an atomic operation?
+    if (__kmp_track_children_task(taskdata)) {
       __kmp_release_deps(gtid, taskdata);
       // Predecrement simulated by "- 1" calculation
-      children =
-          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
+#if KMP_DEBUG
+      children = -1 +
+#endif
+          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
       KMP_DEBUG_ASSERT(children >= 0);
       if (taskdata->td_taskgroup)
         KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
@@ -1189,7 +1205,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   kmp_task_t *task;
   kmp_taskdata_t *taskdata;
   kmp_info_t *thread = __kmp_threads[gtid];
-  kmp_info_t *encountering_thread = thread;
   kmp_team_t *team = thread->th.th_team;
   kmp_taskdata_t *parent_task = thread->th.th_current_task;
   size_t shareds_offset;
@@ -1201,15 +1216,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
     if (__kmp_enable_hidden_helper) {
       if (!TCR_4(__kmp_init_hidden_helper))
         __kmp_hidden_helper_initialize();
-
-      // For a hidden helper task encountered by a regular thread, we will push
-      // the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper
-      // thread.
-      if (!KMP_HIDDEN_HELPER_THREAD(gtid)) {
-        thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
-        // We don't change the parent-child relation for hidden helper task as
-        // we need that to do per-task-region synchronization.
-      }
     } else {
       // If the hidden helper task is not enabled, reset the flag to FALSE.
       flags->hidden_helper = FALSE;
     }
@@ -1232,8 +1238,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
     // Untied task encountered causes the TSC algorithm to check entire deque of
     // the victim thread. If no untied task encountered, then checking the head
     // of the deque should be enough.
-    KMP_CHECK_UPDATE(
-        encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
+    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
   }
 
   // Detachable tasks are not proxy tasks yet but could be in the future. Doing
@@ -1247,32 +1252,30 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   }
 
   /* are we running in a sequential parallel or tskm_immediate_exec... we need
      tasking support enabled */
-  if ((encountering_thread->th.th_task_team) == NULL) {
+  if ((thread->th.th_task_team) == NULL) {
     /* This should only happen if the team is serialized
         setup a task team and propagate it to the thread */
     KMP_DEBUG_ASSERT(team->t.t_serialized);
     KA_TRACE(30,
              ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
               gtid));
-    __kmp_task_team_setup(
-        encountering_thread, team,
-        1); // 1 indicates setup the current team regardless of nthreads
-    encountering_thread->th.th_task_team =
-        team->t.t_task_team[encountering_thread->th.th_task_state];
+    // 1 indicates setup the current team regardless of nthreads
+    __kmp_task_team_setup(thread, team, 1);
+    thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
   }
-  kmp_task_team_t *task_team = encountering_thread->th.th_task_team;
+  kmp_task_team_t *task_team = thread->th.th_task_team;
 
   /* tasking must be enabled now as the task might not be pushed */
   if (!KMP_TASKING_ENABLED(task_team)) {
     KA_TRACE(
         30,
         ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
-    __kmp_enable_tasking(task_team, encountering_thread);
-    kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid;
+    __kmp_enable_tasking(task_team, thread);
+    kmp_int32 tid = thread->th.th_info.ds.ds_tid;
     kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
     // No lock needed since only owner can allocate
     if (thread_data->td.td_deque == NULL) {
-      __kmp_alloc_task_deque(encountering_thread, thread_data);
+      __kmp_alloc_task_deque(thread, thread_data);
     }
   }
@@ -1297,11 +1300,11 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
 
   // Avoid double allocation here by combining shareds with taskdata
 #if USE_FAST_MEMORY
-  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(
-      encountering_thread, shareds_offset + sizeof_shareds);
+  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
+                                                               sizeof_shareds);
 #else /* ! USE_FAST_MEMORY */
-  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
-      encountering_thread, shareds_offset + sizeof_shareds);
+  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
+                                                               sizeof_shareds);
 #endif /* USE_FAST_MEMORY */
 
   task = KMP_TASKDATA_TO_TASK(taskdata);
@@ -1328,7 +1331,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
 
   taskdata->td_task_id = KMP_GEN_TASK_ID();
   taskdata->td_team = thread->th.th_team;
-  taskdata->td_alloc_thread = encountering_thread;
+  taskdata->td_alloc_thread = thread;
   taskdata->td_parent = parent_task;
   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
@@ -1342,10 +1345,16 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
 
   taskdata->td_flags = *flags;
-  taskdata->encountering_gtid = gtid;
   taskdata->td_task_team = thread->th.th_task_team;
   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
   taskdata->td_flags.tasktype = TASK_EXPLICIT;
+  // If it is hidden helper task, we need to set the team and task team
+  // correspondingly.
+  if (flags->hidden_helper) {
+    kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
+    taskdata->td_team = shadow_thread->th.th_team;
+    taskdata->td_task_team = shadow_thread->th.th_task_team;
+  }
 
   // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
   taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
@@ -1382,11 +1391,9 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   if (UNLIKELY(ompt_enabled.enabled))
     __ompt_task_init(taskdata, gtid);
 #endif
-  // Only need to keep track of child task counts if team parallel and tasking
-  // not serialized or if it is a proxy or detachable or hidden helper task
-  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
-      flags->hidden_helper ||
-      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
+  // TODO: What would be the balance between the conditions in the function and
+  // an atomic operation?
+  if (__kmp_track_children_task(taskdata)) {
     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
     if (parent_task->td_taskgroup)
       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
@@ -1438,11 +1445,12 @@ kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                          size_t sizeof_shareds,
                                          kmp_routine_entry_t task_entry,
                                          kmp_int64 device_id) {
-  if (__kmp_enable_hidden_helper) {
-    auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
+  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
+  // target task is untied defined in the specification
+  input_flags.tiedness = TASK_UNTIED;
+
+  if (__kmp_enable_hidden_helper)
     input_flags.hidden_helper = TRUE;
-    input_flags.tiedness = TASK_UNTIED;
-  }
 
   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                                sizeof_shareds, task_entry);
@@ -1613,13 +1621,15 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
     KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
 #endif
 
+    if (task->routine != NULL) {
 #ifdef KMP_GOMP_COMPAT
-    if (taskdata->td_flags.native) {
-      ((void (*)(void *))(*(task->routine)))(task->shareds);
-    } else
+      if (taskdata->td_flags.native) {
+        ((void (*)(void *))(*(task->routine)))(task->shareds);
+      } else
 #endif /* KMP_GOMP_COMPAT */
-    {
-      (*(task->routine))(gtid, task);
+      {
+        (*(task->routine))(gtid, task);
+      }
     }
     KMP_POP_PARTITIONED_TIMER();
@@ -2833,15 +2843,14 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
     // We need to un-mark this victim as a finished victim. This must be done
     // before releasing the lock, or else other threads (starting with the
     // primary thread victim) might be prematurely released from the barrier!!!
-    kmp_int32 count;
-
-    count = KMP_ATOMIC_INC(unfinished_threads);
-
+#if KMP_DEBUG
+    kmp_int32 count =
+#endif
+        KMP_ATOMIC_INC(unfinished_threads);
     KA_TRACE(
         20,
         ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
          gtid, count + 1, task_team));
-
     *thread_finished = FALSE;
   }
   TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
@@ -2948,8 +2957,7 @@ static inline int __kmp_execute_tasks_template(
               (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != NULL)) {
             asleep = 1;
-            __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
-                                      other_thread->th.th_sleep_loc);
+            __kmp_null_resume_wrapper(other_thread);
             // A sleeping thread should not have any tasks on it's queue.
             // There is a slight possibility that it resumes, steals a task
             // from another thread, which spawns more tasks, all in the time
@@ -3034,9 +3042,10 @@ static inline int __kmp_execute_tasks_template(
       // done. This decrement might be to the spin location, and result in the
       // termination condition being satisfied.
       if (!*thread_finished) {
-        kmp_int32 count;
-
-        count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
+#if KMP_DEBUG
+        kmp_int32 count = -1 +
+#endif
+            KMP_ATOMIC_DEC(unfinished_threads);
         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                       "unfinished_threads to %d task_team=%p\n",
                       gtid, count, task_team));
@@ -3065,6 +3074,18 @@ static inline int __kmp_execute_tasks_template(
       return FALSE;
     }
 
+    // Check the flag again to see if it has already done in case to be trapped
+    // into infinite loop when a if0 task depends on a hidden helper task
+    // outside any parallel region. Detached tasks are not impacted in this case
+    // because the only thread executing this function has to execute the proxy
+    // task so it is in another code path that has the same check.
+    if (flag == NULL || (!final_spin && flag->done_check())) {
+      KA_TRACE(15,
+               ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
+                gtid));
+      return TRUE;
+    }
+
     // We could be getting tasks from target constructs; if this is the only
     // thread, keep trying to execute tasks from own queue
     if (nthreads == 1 &&
@@ -3098,6 +3119,16 @@ int __kmp_execute_tasks_64(
       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
 }
 
+template <bool C, bool S>
+int __kmp_atomic_execute_tasks_64(
+    kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
+    int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+    kmp_int32 is_constrained) {
+  return __kmp_execute_tasks_template(
+      thread, gtid, flag, final_spin,
+      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+}
+
 int __kmp_execute_tasks_oncore(
     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
@@ -3124,6 +3155,14 @@ template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
                                                  int *USE_ITT_BUILD_ARG(void *),
                                                  kmp_int32);
 
+template int __kmp_atomic_execute_tasks_64<false, true>(
+    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
+    int *USE_ITT_BUILD_ARG(void *), kmp_int32);
+
+template int __kmp_atomic_execute_tasks_64<true, false>(
+    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
+    int *USE_ITT_BUILD_ARG(void *), kmp_int32);
+
 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
 // next barrier so they can assist in executing enqueued tasks.
 // First thread in allocates the task team atomically.
@@ -3162,7 +3201,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
     // tasks and execute them. In extra barrier mode, tasks do not sleep
     // at the separate tasking barrier, so this isn't a problem.
     for (i = 0; i < nthreads; i++) {
-      volatile void *sleep_loc;
+      void *sleep_loc;
       kmp_info_t *thread = threads_data[i].td.td_thr;
 
       if (i == this_thr->th.th_info.ds.ds_tid) {
@@ -3179,7 +3218,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                       __kmp_gtid_from_thread(this_thr),
                       __kmp_gtid_from_thread(thread)));
-        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
+        __kmp_null_resume_wrapper(thread);
       } else {
         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                       __kmp_gtid_from_thread(this_thr),
@@ -3451,6 +3490,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
 
   TCW_4(task_team->tt.tt_found_tasks, FALSE);
   TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
 
   KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
@@ -3512,9 +3552,11 @@ void __kmp_reap_task_teams(void) {
 void __kmp_wait_to_unref_task_teams(void) {
   kmp_info_t *thread;
   kmp_uint32 spins;
+  kmp_uint64 time;
   int done;
 
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
 
   for (;;) {
     done = TRUE;
@@ -3547,7 +3589,7 @@ void __kmp_wait_to_unref_task_teams(void) {
                     __kmp_gtid_from_thread(thread)));
 
       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
-        volatile void *sleep_loc;
+        void *sleep_loc;
         // If the thread is sleeping, awaken it.
        if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
            NULL) {
@@ -3555,7 +3597,7 @@ void __kmp_wait_to_unref_task_teams(void) {
               10,
               ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
                __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
-          __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
+          __kmp_null_resume_wrapper(thread);
         }
       }
     }
@@ -3564,7 +3606,7 @@ void __kmp_wait_to_unref_task_teams(void) {
     }
 
     // If oversubscribed or have waited a bit, yield.
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   }
 }
@@ -3613,6 +3655,7 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
         TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
         TCW_4(task_team->tt.tt_found_tasks, FALSE);
         TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+        TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
         KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
                           team->t.t_nproc);
         TCW_4(task_team->tt.tt_active, TRUE);
@@ -3705,8 +3748,10 @@ void __kmp_task_team_wait(
          "setting active to false, setting local and team's pointer to NULL\n",
          __kmp_gtid_from_thread(this_thr), task_team));
     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
-                     task_team->tt.tt_found_proxy_tasks == TRUE);
+                     task_team->tt.tt_found_proxy_tasks == TRUE ||
+                     task_team->tt.tt_hidden_helper_task_encountered == TRUE);
     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+    TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
     KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
     KMP_MB();
@@ -3869,11 +3914,12 @@ static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
 }
 
 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
+#if KMP_DEBUG
   kmp_int32 children = 0;
-
   // Predecrement simulated by "- 1" calculation
-  children =
-      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
+  children = -1 +
+#endif
+      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
   KMP_DEBUG_ASSERT(children >= 0);
 
   // Remove the imaginary children
@@ -3936,7 +3982,7 @@ void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
 
   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
   // but we cannot use __kmp_get_random here
-  kmp_int32 start_k = start;
+  kmp_int32 start_k = start % nthreads;
   kmp_int32 pass = 1;
   kmp_int32 k = start_k;
```