| | | |
|---|---|---|
| author | thegeorg <thegeorg@yandex-team.com> | 2022-10-20 12:16:22 +0300 |
| committer | thegeorg <thegeorg@yandex-team.com> | 2022-10-20 12:16:22 +0300 |
| commit | da5ee816c1598acf602c1c42845b544878400d34 (patch) | |
| tree | 47c0acdeae9bbd5ceb1019b6c8e94ada327d7776 | |
| parent | d37715ef865ba1c48ca505f8b96151ae6d417657 (diff) | |
| download | ydb-da5ee816c1598acf602c1c42845b544878400d34.tar.gz | |
Update contrib/libs/cxxsupp/openmp to 15.0.2
30 files changed, 1121 insertions, 204 deletions
diff --git a/contrib/libs/cxxsupp/openmp/kmp.h b/contrib/libs/cxxsupp/openmp/kmp.h index 9502167474..4b9602626a 100644 --- a/contrib/libs/cxxsupp/openmp/kmp.h +++ b/contrib/libs/cxxsupp/openmp/kmp.h @@ -100,18 +100,18 @@ class kmp_stats_list; #ifndef HWLOC_OBJ_PACKAGE #define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET #endif -#if HWLOC_API_VERSION >= 0x00020000 -// hwloc 2.0 changed type of depth of object from unsigned to int -typedef int kmp_hwloc_depth_t; -#else -typedef unsigned int kmp_hwloc_depth_t; -#endif #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #include <xmmintrin.h> #endif +// The below has to be defined before including "kmp_barrier.h". +#define KMP_INTERNAL_MALLOC(sz) malloc(sz) +#define KMP_INTERNAL_FREE(p) free(p) +#define KMP_INTERNAL_REALLOC(p, sz) realloc((p), (sz)) +#define KMP_INTERNAL_CALLOC(n, sz) calloc((n), (sz)) + #include "kmp_debug.h" #include "kmp_lock.h" #include "kmp_version.h" @@ -841,7 +841,9 @@ extern unsigned __kmp_affinity_num_masks; extern void __kmp_affinity_bind_thread(int which); extern kmp_affin_mask_t *__kmp_affin_fullMask; +extern kmp_affin_mask_t *__kmp_affin_origMask; extern char *__kmp_cpuinfo_file; +extern bool __kmp_affin_reset; #endif /* KMP_AFFINITY_SUPPORTED */ @@ -967,7 +969,6 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space; extern omp_memspace_handle_t const omp_const_mem_space; extern omp_memspace_handle_t const omp_high_bw_mem_space; extern omp_memspace_handle_t const omp_low_lat_mem_space; -// Preview of target memory support extern omp_memspace_handle_t const llvm_omp_target_host_mem_space; extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space; extern omp_memspace_handle_t const llvm_omp_target_device_mem_space; @@ -987,7 +988,6 @@ extern omp_allocator_handle_t const omp_low_lat_mem_alloc; extern omp_allocator_handle_t const omp_cgroup_mem_alloc; extern omp_allocator_handle_t const omp_pteam_mem_alloc; extern omp_allocator_handle_t const omp_thread_mem_alloc; -// Preview of target memory support extern omp_allocator_handle_t const llvm_omp_target_host_mem_alloc; extern omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc; extern omp_allocator_handle_t const llvm_omp_target_device_mem_alloc; @@ -1124,7 +1124,7 @@ extern void __kmp_init_target_mem(); #if KMP_OS_UNIX && (KMP_ARCH_X86 || KMP_ARCH_X86_64) // HW TSC is used to reduce overhead (clock tick instead of nanosecond). 
extern kmp_uint64 __kmp_ticks_per_msec; -#if KMP_COMPILER_ICC +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX #define KMP_NOW() ((kmp_uint64)_rdtsc()) #else #define KMP_NOW() __kmp_hardware_timestamp() @@ -1334,7 +1334,10 @@ static inline int __kmp_tpause(uint32_t hint, uint64_t counter) { char flag; __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n" "setb %0" - : "=r"(flag) + // The "=q" restraint means any register accessible as rl + // in 32-bit mode: a, b, c, and d; + // in 64-bit mode: any integer register + : "=q"(flag) : "a"(timeLo), "d"(timeHi), "c"(hint) :); return flag; @@ -1361,7 +1364,10 @@ static inline int __kmp_umwait(uint32_t hint, uint64_t counter) { char flag; __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n" "setb %0" - : "=r"(flag) + // The "=q" restraint means any register accessible as rl + // in 32-bit mode: a, b, c, and d; + // in 64-bit mode: any integer register + : "=q"(flag) : "a"(timeLo), "d"(timeHi), "c"(hint) :); return flag; @@ -2548,11 +2554,22 @@ typedef union KMP_ALIGN_CACHE kmp_thread_data { char td_pad[KMP_PAD(kmp_base_thread_data_t, CACHE_LINE)]; } kmp_thread_data_t; +typedef struct kmp_task_pri { + kmp_thread_data_t td; + kmp_int32 priority; + kmp_task_pri *next; +} kmp_task_pri_t; + // Data for task teams which are used when tasking is enabled for the team typedef struct kmp_base_task_team { kmp_bootstrap_lock_t tt_threads_lock; /* Lock used to allocate per-thread part of task team */ /* must be bootstrap lock since used at library shutdown*/ + + // TODO: check performance vs kmp_tas_lock_t + kmp_bootstrap_lock_t tt_task_pri_lock; /* Lock to access priority tasks */ + kmp_task_pri_t *tt_task_pri_list; + kmp_task_team_t *tt_next; /* For linking the task team free list */ kmp_thread_data_t *tt_threads_data; /* Array of per-thread structures for task team */ @@ -2564,6 +2581,7 @@ typedef struct kmp_base_task_team { kmp_int32 tt_max_threads; // # entries allocated for threads_data array kmp_int32 tt_found_proxy_tasks; // found proxy tasks since last barrier kmp_int32 tt_untied_task_encountered; + std::atomic<kmp_int32> tt_num_task_pri; // number of priority tasks enqueued // There is hidden helper thread encountered in this task team so that we must // wait when waiting on task team kmp_int32 tt_hidden_helper_task_encountered; @@ -2973,6 +2991,15 @@ struct fortran_inx_info { kmp_int32 data; }; +// This list type exists to hold old __kmp_threads arrays so that +// old references to them may complete while reallocation takes place when +// expanding the array. The items in this list are kept alive until library +// shutdown. 
+typedef struct kmp_old_threads_list_t { + kmp_info_t **threads; + struct kmp_old_threads_list_t *next; +} kmp_old_threads_list_t; + /* ------------------------------------------------------------------------ */ extern int __kmp_settings; @@ -3036,6 +3063,8 @@ extern int __kmp_storage_map_verbose_specified; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 extern kmp_cpuinfo_t __kmp_cpuinfo; static inline bool __kmp_is_hybrid_cpu() { return __kmp_cpuinfo.flags.hybrid; } +#elif KMP_OS_DARWIN && KMP_ARCH_AARCH64 +static inline bool __kmp_is_hybrid_cpu() { return true; } #else static inline bool __kmp_is_hybrid_cpu() { return false; } #endif @@ -3043,6 +3072,7 @@ static inline bool __kmp_is_hybrid_cpu() { return false; } extern volatile int __kmp_init_serial; extern volatile int __kmp_init_gtid; extern volatile int __kmp_init_common; +extern volatile int __kmp_need_register_serial; extern volatile int __kmp_init_middle; extern volatile int __kmp_init_parallel; #if KMP_USE_MONITOR @@ -3150,6 +3180,7 @@ extern int __kmp_tp_cached; /* whether threadprivate cache has been created (__kmpc_threadprivate_cached()) */ extern int __kmp_dflt_blocktime; /* number of milliseconds to wait before blocking (env setting) */ +extern bool __kmp_wpolicy_passive; /* explicitly set passive wait policy */ #if KMP_USE_MONITOR extern int __kmp_monitor_wakeups; /* number of times monitor wakes up per second */ @@ -3253,6 +3284,8 @@ extern int __kmp_teams_thread_limit; /* the following are protected by the fork/join lock */ /* write: lock read: anytime */ extern kmp_info_t **__kmp_threads; /* Descriptors for the threads */ +/* Holds old arrays of __kmp_threads until library shutdown */ +extern kmp_old_threads_list_t *__kmp_old_threads_list; /* read/write: lock */ extern volatile kmp_team_t *__kmp_team_pool; extern volatile kmp_info_t *__kmp_thread_pool; @@ -3451,11 +3484,6 @@ extern void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL); #define __kmp_thread_free(th, ptr) \ ___kmp_thread_free((th), (ptr)KMP_SRC_LOC_CURR) -#define KMP_INTERNAL_MALLOC(sz) malloc(sz) -#define KMP_INTERNAL_FREE(p) free(p) -#define KMP_INTERNAL_REALLOC(p, sz) realloc((p), (sz)) -#define KMP_INTERNAL_CALLOC(n, sz) calloc((n), (sz)) - extern void __kmp_push_num_threads(ident_t *loc, int gtid, int num_threads); extern void __kmp_push_proc_bind(ident_t *loc, int gtid, @@ -3601,8 +3629,18 @@ static inline void __kmp_assign_root_init_mask() { r->r.r_affinity_assigned = TRUE; } } +static inline void __kmp_reset_root_init_mask(int gtid) { + kmp_info_t *th = __kmp_threads[gtid]; + kmp_root_t *r = th->th.th_root; + if (r->r.r_uber_thread == th && r->r.r_affinity_assigned) { + __kmp_set_system_affinity(__kmp_affin_origMask, FALSE); + KMP_CPU_COPY(th->th.th_affin_mask, __kmp_affin_origMask); + r->r.r_affinity_assigned = FALSE; + } +} #else /* KMP_AFFINITY_SUPPORTED */ #define __kmp_assign_root_init_mask() /* Nothing */ +static inline void __kmp_reset_root_init_mask(int gtid) {} #endif /* KMP_AFFINITY_SUPPORTED */ // No need for KMP_AFFINITY_SUPPORTED guard as only one field in the // format string is for affinity, so platforms that do not support @@ -3865,6 +3903,11 @@ KMP_EXPORT kmp_int32 __kmpc_barrier_master_nowait(ident_t *, KMP_EXPORT kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_end_single(ident_t *, kmp_int32 global_tid); +KMP_EXPORT kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 global_tid); +KMP_EXPORT kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 global_tid, + kmp_int32 numberOfSections); 
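The tpause/umwait hunks above change the `setb` output constraint from `"=r"` to `"=q"`. Roughly why that matters, as a stand-alone sketch (hypothetical helper, not part of this patch): `setb` writes an 8-bit register, and with `"=r"` the compiler may pick `esi`/`edi`, which have no byte-addressable alias in 32-bit mode, so assembly fails; `"=q"` limits the choice to `a`/`b`/`c`/`d` there (any integer register in 64-bit mode).

```cpp
// Illustrative only (x86, GCC/Clang inline-asm syntax): the "=q" constraint
// guarantees a byte-addressable register for the setb result, which "=r"
// does not on 32-bit targets.
static inline int add_overflows(unsigned a, unsigned b) {
  char flag;
  __asm__ volatile("addl %2, %1\n\t"
                   "setb %0" // flag = carry flag after the add
                   : "=q"(flag), "+r"(a)
                   : "r"(b)
                   : "cc");
  return flag;
}
```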
+KMP_EXPORT void __kmpc_end_sections(ident_t *loc, kmp_int32 global_tid); + KMP_EXPORT void KMPC_FOR_STATIC_INIT(ident_t *loc, kmp_int32 global_tid, kmp_int32 schedtype, kmp_int32 *plastiter, kmp_int *plower, kmp_int *pupper, @@ -3878,6 +3921,9 @@ KMP_EXPORT void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid, void (*cpy_func)(void *, void *), kmp_int32 didit); +KMP_EXPORT void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, + void *cpy_data); + extern void KMPC_SET_NUM_THREADS(int arg); extern void KMPC_SET_DYNAMIC(int flag); extern void KMPC_SET_NESTED(int flag); diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp index 414a27fb05..b9a8d49d8d 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp @@ -138,6 +138,18 @@ const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { return "unknown"; } +#if KMP_AFFINITY_SUPPORTED +// If affinity is supported, check the affinity +// verbose and warning flags before printing warning +#define KMP_AFF_WARNING(...) \ + if (__kmp_affinity_verbose || \ + (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { \ + KMP_WARNING(__VA_ARGS__); \ + } +#else +#define KMP_AFF_WARNING KMP_WARNING +#endif + //////////////////////////////////////////////////////////////////////////////// // kmp_hw_thread_t methods int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { @@ -818,16 +830,16 @@ void kmp_topology_t::canonicalize() { // First try core, then thread, then package kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET}; for (auto g : gran_types) { - if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) { + if (get_equivalent_type(g) != KMP_HW_UNKNOWN) { gran_type = g; break; } } KMP_ASSERT(gran_type != KMP_HW_UNKNOWN); // Warn user what granularity setting will be used instead - KMP_WARNING(AffGranularityBad, "KMP_AFFINITY", - __kmp_hw_get_catalog_string(__kmp_affinity_gran), - __kmp_hw_get_catalog_string(gran_type)); + KMP_AFF_WARNING(AffGranularityBad, "KMP_AFFINITY", + __kmp_hw_get_catalog_string(__kmp_affinity_gran), + __kmp_hw_get_catalog_string(gran_type)); __kmp_affinity_gran = gran_type; } #if KMP_GROUP_AFFINITY @@ -839,12 +851,12 @@ void kmp_topology_t::canonicalize() { // processor groups that cover a socket, then the runtime must // restrict the granularity down to the processor group level. 
if (__kmp_num_proc_groups > 1) { - int gran_depth = __kmp_topology->get_level(gran_type); - int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP); + int gran_depth = get_level(gran_type); + int proc_group_depth = get_level(KMP_HW_PROC_GROUP); if (gran_depth >= 0 && proc_group_depth >= 0 && gran_depth < proc_group_depth) { - KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY", - __kmp_hw_get_catalog_string(__kmp_affinity_gran)); + KMP_AFF_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY", + __kmp_hw_get_catalog_string(__kmp_affinity_gran)); __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP; } } @@ -966,16 +978,16 @@ bool kmp_topology_t::filter_hw_subset() { if (equivalent_type != KMP_HW_UNKNOWN) { __kmp_hw_subset->at(i).type = equivalent_type; } else { - KMP_WARNING(AffHWSubsetNotExistGeneric, - __kmp_hw_get_catalog_string(type)); + KMP_AFF_WARNING(AffHWSubsetNotExistGeneric, + __kmp_hw_get_catalog_string(type)); return false; } // Check to see if current layer has already been // specified either directly or through an equivalent type if (specified[equivalent_type] != KMP_HW_UNKNOWN) { - KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), - __kmp_hw_get_catalog_string(specified[equivalent_type])); + KMP_AFF_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), + __kmp_hw_get_catalog_string(specified[equivalent_type])); return false; } specified[equivalent_type] = type; @@ -985,8 +997,8 @@ bool kmp_topology_t::filter_hw_subset() { if (max_count < 0 || (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { bool plural = (num > 1); - KMP_WARNING(AffHWSubsetManyGeneric, - __kmp_hw_get_catalog_string(type, plural)); + KMP_AFF_WARNING(AffHWSubsetManyGeneric, + __kmp_hw_get_catalog_string(type, plural)); return false; } @@ -1008,21 +1020,21 @@ bool kmp_topology_t::filter_hw_subset() { if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) { if (item.num_attrs == 1) { if (using_core_effs) { - KMP_WARNING(AffHWSubsetIgnoringAttr, "efficiency"); + KMP_AFF_WARNING(AffHWSubsetIgnoringAttr, "efficiency"); } else { - KMP_WARNING(AffHWSubsetIgnoringAttr, "core_type"); + KMP_AFF_WARNING(AffHWSubsetIgnoringAttr, "core_type"); } using_core_effs = false; using_core_types = false; } else { - KMP_WARNING(AffHWSubsetAttrsNonHybrid); + KMP_AFF_WARNING(AffHWSubsetAttrsNonHybrid); return false; } } // Check if using both core types and core efficiencies together if (using_core_types && using_core_effs) { - KMP_WARNING(AffHWSubsetIncompat, "core_type", "efficiency"); + KMP_AFF_WARNING(AffHWSubsetIncompat, "core_type", "efficiency"); return false; } @@ -1058,7 +1070,7 @@ bool kmp_topology_t::filter_hw_subset() { (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { kmp_str_buf_t buf; __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0); - KMP_WARNING(AffHWSubsetManyGeneric, buf.str); + KMP_AFF_WARNING(AffHWSubsetManyGeneric, buf.str); __kmp_str_buf_free(&buf); return false; } @@ -1080,8 +1092,8 @@ bool kmp_topology_t::filter_hw_subset() { } kmp_str_buf_t buf; __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0); - KMP_WARNING(AffHWSubsetIncompat, - __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str); + KMP_AFF_WARNING(AffHWSubsetIncompat, + __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str); __kmp_str_buf_free(&buf); return false; } @@ -1093,7 +1105,7 @@ bool kmp_topology_t::filter_hw_subset() { kmp_str_buf_t buf; __kmp_hw_get_catalog_core_string(item.attr[j], &buf, item.num[j] > 0); - 
KMP_WARNING(AffHWSubsetAttrRepeat, buf.str); + KMP_AFF_WARNING(AffHWSubsetAttrRepeat, buf.str); __kmp_str_buf_free(&buf); return false; } @@ -1201,7 +1213,7 @@ bool kmp_topology_t::filter_hw_subset() { // One last check that we shouldn't allow filtering entire machine if (num_filtered == num_hw_threads) { - KMP_WARNING(AffHWSubsetAllFiltered); + KMP_AFF_WARNING(AffHWSubsetAllFiltered); __kmp_free(filtered); return false; } @@ -1536,6 +1548,8 @@ int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { // internal topology object and set the layer ids for it. Each routine // returns a boolean on whether it was successful at doing so. kmp_affin_mask_t *__kmp_affin_fullMask = NULL; +// Original mask is a subset of full mask in multiple processor groups topology +kmp_affin_mask_t *__kmp_affin_origMask = NULL; #if KMP_USE_HWLOC static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) { @@ -1765,7 +1779,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hw_thread_index = 0; pu = NULL; - while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) { + while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) { int index = depth - 1; bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask); kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); @@ -3353,10 +3367,7 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); } if (__kmp_affinity_gran_levels >= (int)depth) { - if (__kmp_affinity_verbose || - (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { - KMP_WARNING(AffThreadsMayMigrate); - } + KMP_AFF_WARNING(AffThreadsMayMigrate); } // Run through the table, forming the masks for all threads on each core. @@ -3443,11 +3454,7 @@ static int nextNewMask; { \ if (((_osId) > _maxOsId) || \ (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ - if (__kmp_affinity_verbose || \ - (__kmp_affinity_warnings && \ - (__kmp_affinity_type != affinity_none))) { \ - KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ - } \ + KMP_AFF_WARNING(AffIgnoreInvalidProcID, _osId); \ } else { \ ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ } \ @@ -3498,11 +3505,7 @@ static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, // Copy the mask for that osId to the sum (union) mask. if ((num > maxOsId) || (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { - if (__kmp_affinity_verbose || - (__kmp_affinity_warnings && - (__kmp_affinity_type != affinity_none))) { - KMP_WARNING(AffIgnoreInvalidProcID, num); - } + KMP_AFF_WARNING(AffIgnoreInvalidProcID, num); KMP_CPU_ZERO(sumMask); } else { KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); @@ -3534,11 +3537,7 @@ static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, // Add the mask for that osId to the sum mask. 
if ((num > maxOsId) || (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { - if (__kmp_affinity_verbose || - (__kmp_affinity_warnings && - (__kmp_affinity_type != affinity_none))) { - KMP_WARNING(AffIgnoreInvalidProcID, num); - } + KMP_AFF_WARNING(AffIgnoreInvalidProcID, num); } else { KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); setSize++; @@ -3695,11 +3694,7 @@ static void __kmp_process_subplace_list(const char **scan, if (**scan == '}' || **scan == ',') { if ((start > maxOsId) || (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { - if (__kmp_affinity_verbose || - (__kmp_affinity_warnings && - (__kmp_affinity_type != affinity_none))) { - KMP_WARNING(AffIgnoreInvalidProcID, start); - } + KMP_AFF_WARNING(AffIgnoreInvalidProcID, start); } else { KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); (*setSize)++; @@ -3728,11 +3723,7 @@ static void __kmp_process_subplace_list(const char **scan, for (i = 0; i < count; i++) { if ((start > maxOsId) || (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { - if (__kmp_affinity_verbose || - (__kmp_affinity_warnings && - (__kmp_affinity_type != affinity_none))) { - KMP_WARNING(AffIgnoreInvalidProcID, start); - } + KMP_AFF_WARNING(AffIgnoreInvalidProcID, start); break; // don't proliferate warnings for large count } else { KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); @@ -3779,11 +3770,7 @@ static void __kmp_process_subplace_list(const char **scan, for (i = 0; i < count; i++) { if ((start > maxOsId) || (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { - if (__kmp_affinity_verbose || - (__kmp_affinity_warnings && - (__kmp_affinity_type != affinity_none))) { - KMP_WARNING(AffIgnoreInvalidProcID, start); - } + KMP_AFF_WARNING(AffIgnoreInvalidProcID, start); break; // don't proliferate warnings for large count } else { KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); @@ -3825,10 +3812,7 @@ static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, KMP_ASSERT(num >= 0); if ((num > maxOsId) || (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { - if (__kmp_affinity_verbose || - (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { - KMP_WARNING(AffIgnoreInvalidProcID, num); - } + KMP_AFF_WARNING(AffIgnoreInvalidProcID, num); } else { KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); (*setSize)++; @@ -3945,11 +3929,8 @@ void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || (!KMP_CPU_ISSET(j + stride, KMP_CPU_INDEX(osId2Mask, j + stride)))) { - if ((__kmp_affinity_verbose || - (__kmp_affinity_warnings && - (__kmp_affinity_type != affinity_none))) && - i < count - 1) { - KMP_WARNING(AffIgnoreInvalidProcID, j + stride); + if (i < count - 1) { + KMP_AFF_WARNING(AffIgnoreInvalidProcID, j + stride); } continue; } @@ -4072,8 +4053,13 @@ static void __kmp_aux_affinity_initialize(void) { if (__kmp_affin_fullMask == NULL) { KMP_CPU_ALLOC(__kmp_affin_fullMask); } + if (__kmp_affin_origMask == NULL) { + KMP_CPU_ALLOC(__kmp_affin_origMask); + } if (KMP_AFFINITY_CAPABLE()) { __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); + // Make a copy before possible expanding to the entire machine mask + __kmp_affin_origMask->copy(__kmp_affin_fullMask); if (__kmp_affinity_respect_mask) { // Count the number of available processors. 
unsigned i; @@ -4085,11 +4071,7 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_avail_proc++; } if (__kmp_avail_proc > __kmp_xproc) { - if (__kmp_affinity_verbose || - (__kmp_affinity_warnings && - (__kmp_affinity_type != affinity_none))) { - KMP_WARNING(ErrorInitializeAffinity); - } + KMP_AFF_WARNING(ErrorInitializeAffinity); __kmp_affinity_type = affinity_none; KMP_AFFINITY_DISABLE(); return; @@ -4111,6 +4093,10 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_avail_proc = __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); #if KMP_OS_WINDOWS + if (__kmp_num_proc_groups <= 1) { + // Copy expanded full mask if topology has single processor group + __kmp_affin_origMask->copy(__kmp_affin_fullMask); + } // Set the process affinity mask since threads' affinity // masks must be subset of process mask in Windows* OS __kmp_affin_fullMask->set_process_affinity(true); @@ -4254,10 +4240,8 @@ static void __kmp_aux_affinity_initialize(void) { // Early exit if topology could not be created if (!__kmp_topology) { - if (KMP_AFFINITY_CAPABLE() && - (__kmp_affinity_verbose || - (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { - KMP_WARNING(ErrorInitializeAffinity); + if (KMP_AFFINITY_CAPABLE()) { + KMP_AFF_WARNING(ErrorInitializeAffinity); } if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && __kmp_ncores > 0) { @@ -4283,6 +4267,13 @@ static void __kmp_aux_affinity_initialize(void) { if (__kmp_affinity_verbose) __kmp_topology->print("KMP_AFFINITY"); bool filtered = __kmp_topology->filter_hw_subset(); + if (filtered) { +#if KMP_OS_WINDOWS + // Copy filtered full mask if topology has single processor group + if (__kmp_num_proc_groups <= 1) +#endif + __kmp_affin_origMask->copy(__kmp_affin_fullMask); + } if (filtered && __kmp_affinity_verbose) __kmp_topology->print("KMP_HW_SUBSET"); machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); @@ -4321,10 +4312,7 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_affinity_proclist, osId2Mask, maxIndex); } if (__kmp_affinity_num_masks == 0) { - if (__kmp_affinity_verbose || - (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { - KMP_WARNING(AffNoValidProcID); - } + KMP_AFF_WARNING(AffNoValidProcID); __kmp_affinity_type = affinity_none; __kmp_create_affinity_none_places(); return; @@ -4374,9 +4362,7 @@ static void __kmp_aux_affinity_initialize(void) { case affinity_balanced: if (depth <= 1) { - if (__kmp_affinity_verbose || __kmp_affinity_warnings) { - KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); - } + KMP_AFF_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); __kmp_affinity_type = affinity_none; __kmp_create_affinity_none_places(); return; @@ -4393,9 +4379,7 @@ static void __kmp_aux_affinity_initialize(void) { int nproc = ncores * maxprocpercore; if ((nproc < 2) || (nproc < __kmp_avail_proc)) { - if (__kmp_affinity_verbose || __kmp_affinity_warnings) { - KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); - } + KMP_AFF_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); __kmp_affinity_type = affinity_none; return; } @@ -4506,6 +4490,10 @@ void __kmp_affinity_uninitialize(void) { KMP_CPU_FREE(__kmp_affin_fullMask); __kmp_affin_fullMask = NULL; } + if (__kmp_affin_origMask != NULL) { + KMP_CPU_FREE(__kmp_affin_origMask); + __kmp_affin_origMask = NULL; + } __kmp_affinity_num_masks = 0; __kmp_affinity_type = affinity_default; __kmp_affinity_num_places = 0; diff --git a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp index 
120cad17c2..e9aaedc538 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_alloc.cpp @@ -1254,7 +1254,6 @@ static void **mk_hbw_preferred_hugetlb; static void **mk_dax_kmem; static void **mk_dax_kmem_all; static void **mk_dax_kmem_preferred; -// Preview of target memory support static void *(*kmp_target_alloc_host)(size_t size, int device); static void *(*kmp_target_alloc_shared)(size_t size, int device); static void *(*kmp_target_alloc_device)(size_t size, int device); @@ -1269,7 +1268,7 @@ static bool __kmp_target_mem_available; MA == llvm_omp_target_shared_mem_alloc || \ MA == llvm_omp_target_device_mem_alloc) -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN static inline void chk_kind(void ***pkind) { KMP_DEBUG_ASSERT(pkind); if (*pkind) // symbol found @@ -1280,7 +1279,7 @@ static inline void chk_kind(void ***pkind) { void __kmp_init_memkind() { // as of 2018-07-31 memkind does not support Windows*, exclude it for now -#if KMP_OS_UNIX && KMP_DYNAMIC_LIB +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN // use of statically linked memkind is problematic, as it depends on libnuma kmp_mk_lib_name = "libmemkind.so"; h_memkind = dlopen(kmp_mk_lib_name, RTLD_LAZY); @@ -1364,7 +1363,7 @@ void __kmp_fini_memkind() { mk_dax_kmem_preferred = NULL; #endif } -// Preview of target memory support + void __kmp_init_target_mem() { *(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host"); *(void **)(&kmp_target_alloc_shared) = diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp index 0bd7b1a41a..21c2c60bfb 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.cpp @@ -2452,6 +2452,7 @@ ATOMIC_CMPXCHG_CPT(float8, mul_cpt, kmp_real64, 64, *, RTYPE, LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ TYPE new_value; \ + (void)new_value; \ OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_CPT(TYPE, BITS, OP) \ } @@ -2461,6 +2462,7 @@ ATOMIC_CMPXCHG_CPT(float8, mul_cpt, kmp_real64, 64, *, LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ TYPE new_value; \ + (void)new_value; \ OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) /* send assignment */ \ OP_UPDATE_CRITICAL_CPT(TYPE, OP, LCK_ID) /* send assignment */ \ } @@ -3162,6 +3164,7 @@ ATOMIC_CRITICAL_CPT_REV(cmplx16, div_a16_cpt_rev, kmp_cmplx128_a16_t, /, 32c, RTYPE, LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ TYPE new_value; \ + (void)new_value; \ OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_CPT_REV(TYPE, BITS, OP) \ } @@ -3171,6 +3174,7 @@ ATOMIC_CRITICAL_CPT_REV(cmplx16, div_a16_cpt_rev, kmp_cmplx128_a16_t, /, 32c, LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ TYPE new_value; \ + (void)new_value; \ OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) /* send assignment */ \ OP_CRITICAL_CPT_REV(TYPE, OP, LCK_ID) /* send assignment */ \ } diff --git a/contrib/libs/cxxsupp/openmp/kmp_atomic.h b/contrib/libs/cxxsupp/openmp/kmp_atomic.h index 079b917285..19c02e9d25 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_atomic.h +++ b/contrib/libs/cxxsupp/openmp/kmp_atomic.h @@ -251,6 +251,9 @@ struct KMP_DO_ALIGN(4) kmp_cmplx128_a4_t { kmp_cmplx128_a4_t() : q() {} +#if defined(__cplusplus) && (KMP_OS_WINDOWS) + kmp_cmplx128_a4_t(const std::complex<_Quad> &c128) : q(c128) {} +#endif kmp_cmplx128_a4_t(const 
kmp_cmplx128 &c128) : q(c128) {} kmp_cmplx128_a4_t operator+(const kmp_cmplx128_a4_t &b) { @@ -314,6 +317,9 @@ struct KMP_DO_ALIGN(16) kmp_cmplx128_a16_t { kmp_cmplx128_a16_t() : q() {} +#if defined(__cplusplus) && (KMP_OS_WINDOWS) + kmp_cmplx128_a16_t(const std::complex<_Quad> &c128) : q(c128) {} +#endif kmp_cmplx128_a16_t(const kmp_cmplx128 &c128) : q(c128) {} kmp_cmplx128_a16_t operator+(const kmp_cmplx128_a16_t &b) { diff --git a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp index ee05bb3587..1a718b45ff 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_barrier.cpp @@ -2163,7 +2163,6 @@ void __kmp_join_barrier(int gtid) { kmp_info_t *this_thr = __kmp_threads[gtid]; kmp_team_t *team; - kmp_uint nproc; int tid; #ifdef KMP_DEBUG int team_id; @@ -2176,12 +2175,14 @@ void __kmp_join_barrier(int gtid) { itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); #endif #endif /* USE_ITT_BUILD */ +#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG) + int nproc = this_thr->th.th_team_nproc; +#endif KMP_MB(); // Get current info team = this_thr->th.th_team; - nproc = this_thr->th.th_team_nproc; - KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); + KMP_DEBUG_ASSERT(nproc == team->t.t_nproc); tid = __kmp_tid_from_gtid(gtid); #ifdef KMP_DEBUG team_id = team->t.t_id; @@ -2354,7 +2355,7 @@ void __kmp_join_barrier(int gtid) { // Set arrive time to zero to be able to check it in // __kmp_invoke_task(); the same is done inside the loop below this_thr->th.th_bar_arrive_time = 0; - for (kmp_uint i = 1; i < nproc; ++i) { + for (int i = 1; i < nproc; ++i) { delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); other_threads[i]->th.th_bar_arrive_time = 0; } diff --git a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp index e263558517..c932d450c8 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_csupport.cpp @@ -354,9 +354,9 @@ void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, @ingroup PARALLEL @param loc source location information @param global_tid global thread number -@param num_teams_lo lower bound on number of teams requested for the teams +@param num_teams_lb lower bound on number of teams requested for the teams construct -@param num_teams_up upper bound on number of teams requested for the teams +@param num_teams_ub upper bound on number of teams requested for the teams construct @param num_threads number of threads per team requested for the teams construct @@ -632,6 +632,11 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { "team %p\n", global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); } +#if KMP_AFFINITY_SUPPORTED + if (this_thr->th.th_team->t.t_level == 0 && __kmp_affin_reset) { + __kmp_reset_root_init_mask(global_tid); + } +#endif } else { if (__kmp_tasking_mode != tskm_immediate_exec) { KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting " @@ -685,13 +690,13 @@ void __kmpc_flush(ident_t *loc) { if (!__kmp_cpuinfo.flags.sse2) { // CPU cannot execute SSE2 instructions. 
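In the kmp_atomic.cpp hunks above, `(void)new_value;` follows the declaration of `new_value`. A sketch of that idiom with a hypothetical macro (not from this patch): casting a variable to `void` marks it as intentionally unused, silencing `-Wunused-variable`/`-Wunused-but-set-variable` in expansions whose operation body never reads it.

```cpp
// Hypothetical macro: some OP_BODY expansions (e.g. a critical-section
// fallback) never touch new_value, so the cast keeps -Werror builds quiet
// without changing behavior.
#define DEMO_ATOMIC_CPT(TYPE, OP_BODY)                                         \
  do {                                                                         \
    TYPE new_value;                                                            \
    (void)new_value; /* may be unused, depending on OP_BODY */                 \
    OP_BODY                                                                    \
  } while (0)
```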
} else { -#if KMP_COMPILER_ICC +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX _mm_mfence(); #elif KMP_COMPILER_MSVC MemoryBarrier(); #else __sync_synchronize(); -#endif // KMP_COMPILER_ICC +#endif // KMP_COMPILER_ICC || KMP_COMPILER_ICX } #endif // KMP_MIC #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \ @@ -2021,6 +2026,11 @@ void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) { } __kmp_assign_root_init_mask(); gtid = __kmp_get_gtid(); +#if KMP_AFFINITY_SUPPORTED + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif __kmp_aux_display_affinity(gtid, format); } @@ -2034,6 +2044,11 @@ size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size, } __kmp_assign_root_init_mask(); gtid = __kmp_get_gtid(); +#if KMP_AFFINITY_SUPPORTED + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif __kmp_str_buf_init(&capture_buf); num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf); if (buffer && buf_size) { @@ -2224,6 +2239,61 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, } } +/* --------------------------------------------------------------------------*/ +/*! +@ingroup THREADPRIVATE +@param loc source location information +@param gtid global thread number +@param cpy_data pointer to the data to be saved/copied or 0 +@return the saved pointer to the data + +__kmpc_copyprivate_light is a lighter version of __kmpc_copyprivate: +__kmpc_copyprivate_light only saves the pointer it's given (if it's not 0, so +coming from single), and returns that pointer in all calls (for single thread +it's not needed). This version doesn't do any actual data copying. Data copying +has to be done somewhere else, e.g. inline in the generated code. Due to this, +this function doesn't have any barrier at the end of the function, like +__kmpc_copyprivate does, so generated code needs barrier after copying of all +data was done. 
+*/ +void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) { + void **data_ptr; + + KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid)); + + KMP_MB(); + + data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; + + if (__kmp_env_consistency_check) { + if (loc == 0) { + KMP_WARNING(ConstructIdentInvalid); + } + } + + // ToDo: Optimize the following barrier + + if (cpy_data) + *data_ptr = cpy_data; + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + OMPT_STORE_RETURN_ADDRESS(gtid); + } +#endif +/* This barrier is not a barrier region boundary */ +#if USE_ITT_NOTIFY + __kmp_threads[gtid]->th.th_ident = loc; +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + + return *data_ptr; +} + /* -------------------------------------------------------------------------- */ #define INIT_LOCK __kmp_init_user_lock_with_checks @@ -4348,7 +4418,7 @@ void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size, void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, omp_allocator_handle_t free_allocator) { return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator, - free_allocator); + free_allocator); } void omp_free(void *ptr, omp_allocator_handle_t allocator) { diff --git a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp index f3407bf889..8acf3d429e 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_dispatch.cpp @@ -1964,9 +1964,22 @@ int __kmp_dispatch_next_algorithm(int gtid, &(task_info->task_data), 0, codeptr); \ } \ } +#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \ + if (ompt_enabled.ompt_callback_dispatch && status) { \ + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ + ompt_dispatch_chunk_t chunk; \ + ompt_data_t instance = ompt_data_none; \ + OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \ + instance.ptr = &chunk; \ + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \ + &(team_info->parallel_data), &(task_info->task_data), \ + ompt_dispatch_ws_loop_chunk, instance); \ + } // TODO: implement count #else #define OMPT_LOOP_END // no-op +#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op #endif #if KMP_STATS_ENABLED @@ -2142,6 +2155,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, #if INCLUDE_SSC_MARKS SSC_MARK_DISPATCH_NEXT(); #endif + OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status); OMPT_LOOP_END; KMP_STATS_LOOP_END; return status; @@ -2265,11 +2279,225 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, #if INCLUDE_SSC_MARKS SSC_MARK_DISPATCH_NEXT(); #endif + OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status); OMPT_LOOP_END; KMP_STATS_LOOP_END; return status; } +/*! +@ingroup WORK_SHARING +@param loc source location information +@param global_tid global thread number +@return Zero if the parallel region is not active and this thread should execute +all sections, non-zero otherwise. + +Beginning of sections construct. +There are no implicit barriers in the "sections" calls, rather the compiler +should introduce an explicit barrier if it is required. 
+ +This implementation is based on __kmp_dispatch_init, using same constructs for +shared data (we can't have sections nested directly in omp for loop, there +should be a parallel region in between) +*/ +kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) { + + int active; + kmp_info_t *th; + kmp_team_t *team; + kmp_uint32 my_buffer_index; + dispatch_shared_info_template<kmp_int32> volatile *sh; + + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); + + /* setup data */ + th = __kmp_threads[gtid]; + team = th->th.th_team; + active = !team->t.t_serialized; + th->th.th_ident = loc; + + KMP_COUNT_BLOCK(OMP_SECTIONS); + KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid)); + + if (active) { + // Setup sections in the same way as dynamic scheduled loops. + // We need one shared data: which section is to execute next. + // (in case parallel is not active, all sections will be executed on the + // same thread) + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + + my_buffer_index = th->th.th_dispatch->th_disp_index++; + + // reuse shared data structures from dynamic sched loops: + sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( + &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); + KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid, + my_buffer_index)); + + th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; + th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; + + KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d " + "sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, + __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); + // Note: KMP_WAIT() cannot be used there: buffer index and + // my_buffer_index are *always* 32-bit integers. + KMP_MB(); + KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d " + "sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + + th->th.th_dispatch->th_dispatch_pr_current = + nullptr; // sections construct doesn't need private data + th->th.th_dispatch->th_dispatch_sh_current = + CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + KMP_PUSH_PARTITIONED_TIMER(OMP_sections); + + return active; +} + +/*! +@ingroup WORK_SHARING +@param loc source location information +@param global_tid global thread number +@param numberOfSections number of sections in the 'sections' construct +@return unsigned [from 0 to n) - number (id) of the section to execute next on +this thread. 
n (or any other number not in range) - nothing to execute on this +thread +*/ + +kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid, + kmp_int32 numberOfSections) { + + KMP_TIME_PARTITIONED_BLOCK(OMP_sections); + + kmp_info_t *th = __kmp_threads[gtid]; +#ifdef KMP_DEBUG + kmp_team_t *team = th->th.th_team; +#endif + + KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid, + numberOfSections)); + + // For serialized case we should not call this function: + KMP_DEBUG_ASSERT(!team->t.t_serialized); + + dispatch_shared_info_template<kmp_int32> volatile *sh; + + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + + KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current)); + sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( + th->th.th_dispatch->th_dispatch_sh_current); + KMP_DEBUG_ASSERT(sh); + + kmp_int32 sectionIndex = 0; + bool moreSectionsToExecute = true; + + // Find section to execute: + sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration); + if (sectionIndex >= numberOfSections) { + moreSectionsToExecute = false; + } + + // status == 0: no more sections to execute; + // OMPTODO: __kmpc_end_sections could be bypassed? + if (!moreSectionsToExecute) { + kmp_int32 num_done; + + num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done)); + + if (num_done == th->th.th_team_nproc - 1) { + /* NOTE: release this buffer to be reused */ + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + sh->u.s.num_done = 0; + sh->u.s.iteration = 0; + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + sh->buffer_index += __kmp_dispatch_num_buffers; + KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid, + sh->buffer_index)); + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + } // if + + th->th.th_dispatch->th_deo_fcn = NULL; + th->th.th_dispatch->th_dxo_fcn = NULL; + th->th.th_dispatch->th_dispatch_sh_current = NULL; + th->th.th_dispatch->th_dispatch_pr_current = NULL; + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dispatch) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + ompt_data_t instance = ompt_data_none; + instance.ptr = OMPT_GET_RETURN_ADDRESS(0); + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( + &(team_info->parallel_data), &(task_info->task_data), + ompt_dispatch_section, instance); + } +#endif + KMP_POP_PARTITIONED_TIMER(); + } + + return sectionIndex; +} + +/*! +@ingroup WORK_SHARING +@param loc source location information +@param global_tid global thread number + +End of "sections" construct. +Don't need to wait here: barrier is added separately when needed. 
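Taken together, the three new entry points form a dynamic dispatch loop for `sections`. A hypothetical lowering sketch based on the documented return values (illustrative only; not what any particular compiler emits): `__kmpc_sections_init` returns zero when the region is serialized and the thread must run every section itself, `__kmpc_next_section` hands out section ids until none remain, and the barrier is the caller's responsibility because the entry points add none.

```cpp
// Assumes kmp.h and the __kmpc_* prototypes are visible; run_section() is a
// hypothetical stand-in for the i-th section body.
extern void run_section(kmp_int32 id);

void lowered_sections(ident_t *loc, kmp_int32 gtid) {
  const kmp_int32 n = 3; // number of sections in the construct
  if (__kmpc_sections_init(loc, gtid)) {
    // Parallel region active: threads race to claim section ids.
    for (kmp_int32 id = __kmpc_next_section(loc, gtid, n); id < n;
         id = __kmpc_next_section(loc, gtid, n))
      run_section(id);
  } else {
    // Serialized: this thread executes all sections in order.
    for (kmp_int32 id = 0; id < n; ++id)
      run_section(id);
  }
  __kmpc_end_sections(loc, gtid);
  __kmpc_barrier(loc, gtid); // explicit barrier, omitted for nowait
}
```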
+*/ +void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) { + + kmp_info_t *th = __kmp_threads[gtid]; + int active = !th->th.th_team->t.t_serialized; + + KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid)); + + if (!active) { + // In active case call finalization is done in __kmpc_next_section +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_sections, ompt_scope_end, &(team_info->parallel_data), + &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + KMP_POP_PARTITIONED_TIMER(); + } + + KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid)); +} + template <typename T> static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, kmp_int32 *plastiter, T *plower, T *pupper, diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h index bf9ebf9b2e..6b332244c6 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h +++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_entry.h @@ -238,6 +238,10 @@ int FTN_STDCALL FTN_GET_AFFINITY(void **mask) { __kmp_middle_initialize(); } __kmp_assign_root_init_mask(); + int gtid = __kmp_get_gtid(); + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) { + __kmp_reset_root_init_mask(gtid); + } return __kmp_aux_get_affinity(mask); #endif } @@ -358,9 +362,13 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_THREADS)(void) { if (!TCR_4(__kmp_init_middle)) { __kmp_middle_initialize(); } - __kmp_assign_root_init_mask(); gtid = __kmp_entry_gtid(); thread = __kmp_threads[gtid]; +#if KMP_AFFINITY_SUPPORTED + if (thread->th.th_team->t.t_level == 0 && !__kmp_affin_reset) { + __kmp_assign_root_init_mask(); + } +#endif // return thread -> th.th_team -> t.t_current_task[ // thread->th.th_info.ds.ds_tid ] -> icvs.nproc; return thread->th.th_current_task->td_icvs.nproc; @@ -509,6 +517,11 @@ void FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_DISPLAY_AFFINITY)( } __kmp_assign_root_init_mask(); gtid = __kmp_get_gtid(); +#if KMP_AFFINITY_SUPPORTED + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif ConvertedString cformat(format, size); __kmp_aux_display_affinity(gtid, cformat.get()); #endif @@ -537,6 +550,11 @@ size_t FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_CAPTURE_AFFINITY)( } __kmp_assign_root_init_mask(); gtid = __kmp_get_gtid(); +#if KMP_AFFINITY_SUPPORTED + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif __kmp_str_buf_init(&capture_buf); ConvertedString cformat(format, for_size); num_required = __kmp_aux_capture_affinity(gtid, cformat.get(), &capture_buf); @@ -612,7 +630,16 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PROCS)(void) { if (!TCR_4(__kmp_init_middle)) { __kmp_middle_initialize(); } - __kmp_assign_root_init_mask(); +#if KMP_AFFINITY_SUPPORTED + if (!__kmp_affin_reset) { + // only bind root here if its affinity reset is not requested + int gtid = __kmp_entry_gtid(); + kmp_info_t *thread = __kmp_threads[gtid]; + if (thread->th.th_team->t.t_level == 0) { + __kmp_assign_root_init_mask(); + } + } +#endif return __kmp_avail_proc; #endif } @@ -802,9 +829,16 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PLACES)(void) { if (!TCR_4(__kmp_init_middle)) { __kmp_middle_initialize(); } - __kmp_assign_root_init_mask(); if 
(!KMP_AFFINITY_CAPABLE()) return 0; + if (!__kmp_affin_reset) { + // only bind root here if its affinity reset is not requested + int gtid = __kmp_entry_gtid(); + kmp_info_t *thread = __kmp_threads[gtid]; + if (thread->th.th_team->t.t_level == 0) { + __kmp_assign_root_init_mask(); + } + } return __kmp_affinity_num_masks; #endif } @@ -818,9 +852,16 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM_PROCS)(int place_num) { if (!TCR_4(__kmp_init_middle)) { __kmp_middle_initialize(); } - __kmp_assign_root_init_mask(); if (!KMP_AFFINITY_CAPABLE()) return 0; + if (!__kmp_affin_reset) { + // only bind root here if its affinity reset is not requested + int gtid = __kmp_entry_gtid(); + kmp_info_t *thread = __kmp_threads[gtid]; + if (thread->th.th_team->t.t_level == 0) { + __kmp_assign_root_init_mask(); + } + } if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks) return 0; kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num); @@ -844,9 +885,16 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_PROC_IDS)(int place_num, if (!TCR_4(__kmp_init_middle)) { __kmp_middle_initialize(); } - __kmp_assign_root_init_mask(); if (!KMP_AFFINITY_CAPABLE()) return; + if (!__kmp_affin_reset) { + // only bind root here if its affinity reset is not requested + int gtid = __kmp_entry_gtid(); + kmp_info_t *thread = __kmp_threads[gtid]; + if (thread->th.th_team->t.t_level == 0) { + __kmp_assign_root_init_mask(); + } + } if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks) return; kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num); @@ -870,11 +918,13 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM)(void) { if (!TCR_4(__kmp_init_middle)) { __kmp_middle_initialize(); } - __kmp_assign_root_init_mask(); if (!KMP_AFFINITY_CAPABLE()) return -1; gtid = __kmp_entry_gtid(); thread = __kmp_thread_from_gtid(gtid); + if (thread->th.th_team->t.t_level == 0 && !__kmp_affin_reset) { + __kmp_assign_root_init_mask(); + } if (thread->th.th_current_place < 0) return -1; return thread->th.th_current_place; @@ -890,11 +940,13 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_NUM_PLACES)(void) { if (!TCR_4(__kmp_init_middle)) { __kmp_middle_initialize(); } - __kmp_assign_root_init_mask(); if (!KMP_AFFINITY_CAPABLE()) return 0; gtid = __kmp_entry_gtid(); thread = __kmp_thread_from_gtid(gtid); + if (thread->th.th_team->t.t_level == 0 && !__kmp_affin_reset) { + __kmp_assign_root_init_mask(); + } first_place = thread->th.th_first_place; last_place = thread->th.th_last_place; if (first_place < 0 || last_place < 0) @@ -917,11 +969,13 @@ KMP_EXPAND_NAME(FTN_GET_PARTITION_PLACE_NUMS)(int *place_nums) { if (!TCR_4(__kmp_init_middle)) { __kmp_middle_initialize(); } - __kmp_assign_root_init_mask(); if (!KMP_AFFINITY_CAPABLE()) return; gtid = __kmp_entry_gtid(); thread = __kmp_thread_from_gtid(gtid); + if (thread->th.th_team->t.t_level == 0 && !__kmp_affin_reset) { + __kmp_assign_root_init_mask(); + } first_place = thread->th.th_first_place; last_place = thread->th.th_last_place; if (first_place < 0 || last_place < 0) @@ -1567,6 +1621,15 @@ void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) { #endif } +int FTN_STDCALL FTN_IN_EXPLICIT_TASK(void) { +#ifdef KMP_STUB + return 0; +#else + int gtid = __kmp_entry_gtid(); + return __kmp_thread_from_gtid(gtid)->th.th_current_task->td_flags.tasktype; +#endif +} + // GCC compatibility (versioned symbols) #ifdef KMP_USE_VERSION_SYMBOLS diff --git a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h index 
66e1e1ecd2..d37c9c8602 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h +++ b/contrib/libs/cxxsupp/openmp/kmp_ftn_os.h @@ -134,6 +134,7 @@ #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels #define FTN_DISPLAY_ENV omp_display_env +#define FTN_IN_EXPLICIT_TASK omp_in_explicit_task #define FTN_FULFILL_EVENT omp_fulfill_event #define FTN_SET_NUM_TEAMS omp_set_num_teams #define FTN_GET_MAX_TEAMS omp_get_max_teams @@ -270,6 +271,7 @@ #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels_ #define FTN_DISPLAY_ENV omp_display_env_ +#define FTN_IN_EXPLICIT_TASK omp_in_explicit_task_ #define FTN_FULFILL_EVENT omp_fulfill_event_ #define FTN_SET_NUM_TEAMS omp_set_num_teams_ #define FTN_GET_MAX_TEAMS omp_get_max_teams_ @@ -404,6 +406,7 @@ #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL #define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS #define FTN_DISPLAY_ENV OMP_DISPLAY_ENV +#define FTN_IN_EXPLICIT_TASK OMP_IN_EXPLICIT_TASK #define FTN_FULFILL_EVENT OMP_FULFILL_EVENT #define FTN_SET_NUM_TEAMS OMP_SET_NUM_TEAMS #define FTN_GET_MAX_TEAMS OMP_GET_MAX_TEAMS @@ -540,6 +543,7 @@ #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS_ #define FTN_DISPLAY_ENV OMP_DISPLAY_ENV_ +#define FTN_IN_EXPLICIT_TASK OMP_IN_EXPLICIT_TASK_ #define FTN_FULFILL_EVENT OMP_FULFILL_EVENT_ #define FTN_SET_NUM_TEAMS OMP_SET_NUM_TEAMS_ #define FTN_GET_MAX_TEAMS OMP_GET_MAX_TEAMS_ diff --git a/contrib/libs/cxxsupp/openmp/kmp_global.cpp b/contrib/libs/cxxsupp/openmp/kmp_global.cpp index 62bdac3c4b..04b63c72d6 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_global.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_global.cpp @@ -44,6 +44,7 @@ tsc_tick_count __kmp_stats_start_time; volatile int __kmp_init_serial = FALSE; volatile int __kmp_init_gtid = FALSE; volatile int __kmp_init_common = FALSE; +volatile int __kmp_need_register_serial = TRUE; volatile int __kmp_init_middle = FALSE; volatile int __kmp_init_parallel = FALSE; volatile int __kmp_init_hidden_helper = FALSE; @@ -154,6 +155,7 @@ int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1]; kmp_hier_sched_env_t __kmp_hier_scheds = {0, 0, NULL, NULL, NULL}; #endif int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; +bool __kmp_wpolicy_passive = false; #if KMP_USE_MONITOR int __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS; int __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(KMP_DEFAULT_BLOCKTIME, @@ -283,6 +285,7 @@ kmp_affin_mask_t *__kmp_affinity_masks = NULL; unsigned __kmp_affinity_num_masks = 0; char *__kmp_cpuinfo_file = NULL; +bool __kmp_affin_reset = 0; #endif /* KMP_AFFINITY_SUPPORTED */ @@ -316,7 +319,6 @@ omp_allocator_handle_t const omp_pteam_mem_alloc = (omp_allocator_handle_t const)7; omp_allocator_handle_t const omp_thread_mem_alloc = (omp_allocator_handle_t const)8; -// Preview of target memory support omp_allocator_handle_t const llvm_omp_target_host_mem_alloc = (omp_allocator_handle_t const)100; omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc = @@ -337,7 +339,6 @@ omp_memspace_handle_t const omp_high_bw_mem_space = (omp_memspace_handle_t const)3; omp_memspace_handle_t const omp_low_lat_mem_space = (omp_memspace_handle_t const)4; -// Preview of target memory support omp_memspace_handle_t const llvm_omp_target_host_mem_space = (omp_memspace_handle_t const)100; omp_memspace_handle_t const 
llvm_omp_target_shared_mem_space = @@ -426,7 +427,13 @@ int __kmp_env_consistency_check = FALSE; /* KMP_CONSISTENCY_CHECK specified? */ // 0 = never yield; // 1 = always yield (default); // 2 = yield only if oversubscribed +#if KMP_OS_DARWIN && KMP_ARCH_AARCH64 +// Set to 0 for environments where yield is slower +kmp_int32 __kmp_use_yield = 0; +#else kmp_int32 __kmp_use_yield = 1; +#endif + // This will be 1 if KMP_USE_YIELD environment variable was set explicitly kmp_int32 __kmp_use_yield_exp_set = 0; @@ -443,6 +450,7 @@ kmp_uint64 __kmp_pause_init = 1; // for tpause KMP_ALIGN_CACHE kmp_info_t **__kmp_threads = NULL; kmp_root_t **__kmp_root = NULL; +kmp_old_threads_list_t *__kmp_old_threads_list = NULL; /* data read/written to often by primary threads */ KMP_ALIGN_CACHE diff --git a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp index fff7305b57..8fcddc7108 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_lock.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_lock.cpp @@ -1954,7 +1954,7 @@ static inline bool __kmp_is_unlocked_queuing_lock(kmp_queuing_lock_t *lck) { // We need a fence here, since we must ensure that no memory operations // from later in this thread float above that read. -#if KMP_COMPILER_ICC +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX _mm_mfence(); #else __sync_synchronize(); diff --git a/contrib/libs/cxxsupp/openmp/kmp_os.h b/contrib/libs/cxxsupp/openmp/kmp_os.h index d71e9aecb3..02efaa1b26 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_os.h +++ b/contrib/libs/cxxsupp/openmp/kmp_os.h @@ -17,6 +17,7 @@ #include <atomic> #include <stdarg.h> #include <stdlib.h> +#include <string.h> #define KMP_FTN_PLAIN 1 #define KMP_FTN_APPEND 2 @@ -53,8 +54,12 @@ #define KMP_COMPILER_GCC 0 #define KMP_COMPILER_CLANG 0 #define KMP_COMPILER_MSVC 0 +#define KMP_COMPILER_ICX 0 -#if defined(__INTEL_COMPILER) +#if __INTEL_CLANG_COMPILER +#undef KMP_COMPILER_ICX +#define KMP_COMPILER_ICX 1 +#elif defined(__INTEL_COMPILER) #undef KMP_COMPILER_ICC #define KMP_COMPILER_ICC 1 #elif defined(__clang__) @@ -82,10 +87,16 @@ #define KMP_GROUP_AFFINITY 0 #endif +#if (KMP_OS_LINUX || (KMP_OS_FREEBSD && __FreeBSD_version >= 1301000)) +#define KMP_HAVE_SCHED_GETCPU 1 +#else +#define KMP_HAVE_SCHED_GETCPU 0 +#endif + /* Check for quad-precision extension. */ #define KMP_HAVE_QUAD 0 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 -#if KMP_COMPILER_ICC +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX /* _Quad is already defined for icc */ #undef KMP_HAVE_QUAD #define KMP_HAVE_QUAD 1 @@ -334,6 +345,9 @@ extern "C" { // Use a function like macro to imply that it must be followed by a semicolon #if __cplusplus > 201402L && __has_cpp_attribute(fallthrough) #define KMP_FALLTHROUGH() [[fallthrough]] +// icc cannot properly tell this attribute is absent so force off +#elif KMP_COMPILER_ICC +#define KMP_FALLTHROUGH() ((void)0) #elif __has_cpp_attribute(clang::fallthrough) #define KMP_FALLTHROUGH() [[clang::fallthrough]] #elif __has_attribute(fallthrough) || __GNUC__ >= 7 @@ -448,8 +462,10 @@ enum kmp_mem_fence_type { #pragma intrinsic(InterlockedExchangeAdd) #pragma intrinsic(InterlockedCompareExchange) #pragma intrinsic(InterlockedExchange) +#if !(KMP_COMPILER_ICX && KMP_32_BIT_ARCH) #pragma intrinsic(InterlockedExchange64) #endif +#endif // Using InterlockedIncrement / InterlockedDecrement causes a library loading // ordering problem, so we use InterlockedExchangeAdd instead. 
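Earlier in this diff, kmp_ftn_entry.h and kmp_ftn_os.h add the OpenMP 5.2 routine `omp_in_explicit_task()`, implemented as a read of the current task's `tasktype` flag. A minimal usage sketch (assuming an `omp.h` that already declares the 5.2 routine):

```cpp
#include <cstdio>
#include <omp.h>

int main() {
#pragma omp parallel num_threads(2)
  {
    // Implicit task of the parallel region: expected to report 0.
    std::printf("implicit task? %d\n", omp_in_explicit_task());

#pragma omp single
#pragma omp task
    {
      // Explicit task created by the task construct: expected to report 1.
      std::printf("explicit task? %d\n", omp_in_explicit_task());
    }
  }
  return 0;
}
```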
@@ -842,8 +858,14 @@ static inline bool mips_sync_val_compare_and_swap(volatile kmp_uint64 *p, (kmp_uint64)(sv)) #endif +#if KMP_OS_DARWIN && defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1800 +#define KMP_XCHG_FIXED8(p, v) \ + __atomic_exchange_1((volatile kmp_uint8 *)(p), (kmp_uint8)(v), \ + __ATOMIC_SEQ_CST) +#else #define KMP_XCHG_FIXED8(p, v) \ __sync_lock_test_and_set((volatile kmp_uint8 *)(p), (kmp_uint8)(v)) +#endif #define KMP_XCHG_FIXED16(p, v) \ __sync_lock_test_and_set((volatile kmp_uint16 *)(p), (kmp_uint16)(v)) #define KMP_XCHG_FIXED32(p, v) \ @@ -852,15 +874,25 @@ static inline bool mips_sync_val_compare_and_swap(volatile kmp_uint64 *p, __sync_lock_test_and_set((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) { - kmp_int32 tmp = - __sync_lock_test_and_set((volatile kmp_uint32 *)(p), *(kmp_uint32 *)&v); - return *(kmp_real32 *)&tmp; + volatile kmp_uint32 *up; + kmp_uint32 uv; + memcpy(&up, &p, sizeof(up)); + memcpy(&uv, &v, sizeof(uv)); + kmp_int32 tmp = __sync_lock_test_and_set(up, uv); + kmp_real32 ftmp; + memcpy(&ftmp, &tmp, sizeof(tmp)); + return ftmp; } inline kmp_real64 KMP_XCHG_REAL64(volatile kmp_real64 *p, kmp_real64 v) { - kmp_int64 tmp = - __sync_lock_test_and_set((volatile kmp_uint64 *)(p), *(kmp_uint64 *)&v); - return *(kmp_real64 *)&tmp; + volatile kmp_uint64 *up; + kmp_uint64 uv; + memcpy(&up, &p, sizeof(up)); + memcpy(&uv, &v, sizeof(uv)); + kmp_int64 tmp = __sync_lock_test_and_set(up, uv); + kmp_real64 dtmp; + memcpy(&dtmp, &tmp, sizeof(tmp)); + return dtmp; } #else @@ -1026,7 +1058,7 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v); #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 -#if KMP_COMPILER_ICC +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX #define KMP_MFENCE_() _mm_mfence() #define KMP_SFENCE_() _mm_sfence() #elif KMP_COMPILER_MSVC diff --git a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp index 34f8a01743..bfbff03bd6 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_runtime.cpp @@ -2222,11 +2222,11 @@ int __kmp_fork_call(ident_t *loc, int gtid, } else // only one notification scheme (either "submit" or "forking/joined", not both) #endif /* USE_ITT_NOTIFY */ - if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && - __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { - // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. - __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); - } + if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && + __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { + // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. + __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); + } } #endif /* USE_ITT_BUILD */ @@ -2641,6 +2641,11 @@ void __kmp_join_call(ident_t *loc, int gtid __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); +#if KMP_AFFINITY_SUPPORTED + if (master_th->th.th_team->t.t_level == 0 && __kmp_affin_reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif #if OMPT_SUPPORT int flags = OMPT_INVOKER(fork_context) | @@ -3276,7 +3281,7 @@ static void __kmp_initialize_root(kmp_root_t *root) { __kmp_nested_proc_bind.bind_types[0], &r_icvs, 0 // argc USE_NESTED_HOT_ARG(NULL) // primary thread is unknown - ); + ); #if USE_DEBUGGER // Non-NULL value should be assigned to make the debugger display the root // team. 
@@ -3313,7 +3318,7 @@ static void __kmp_initialize_root(kmp_root_t *root) { __kmp_nested_proc_bind.bind_types[0], &r_icvs, 0 // argc USE_NESTED_HOT_ARG(NULL) // primary thread is unknown - ); + ); KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); root->r.r_hot_team = hot_team; @@ -3669,11 +3674,16 @@ static int __kmp_expand_threads(int nNeed) { __kmp_threads_capacity * sizeof(kmp_info_t *)); KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t *)); + // Put old __kmp_threads array on a list. Any ongoing references to the old + // list will be valid. This list is cleaned up at library shutdown. + kmp_old_threads_list_t *node = + (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t)); + node->threads = __kmp_threads; + node->next = __kmp_old_threads_list; + __kmp_old_threads_list = node; - kmp_info_t **temp_threads = __kmp_threads; *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; *(kmp_root_t * *volatile *)&__kmp_root = newRoot; - __kmp_free(temp_threads); added += newCapacity - __kmp_threads_capacity; *(volatile int *)&__kmp_threads_capacity = newCapacity; @@ -6960,10 +6970,12 @@ static void __kmp_do_serial_initialize(void) { /* Initialize internal memory allocator */ __kmp_init_allocator(); - /* Register the library startup via an environment variable and check to see - whether another copy of the library is already registered. */ - - __kmp_register_library_startup(); + /* Register the library startup via an environment variable or via mapped + shared memory file and check to see whether another copy of the library is + already registered. Since forked child process is often terminated, we + postpone the registration till middle initialization in the child */ + if (__kmp_need_register_serial) + __kmp_register_library_startup(); /* TODO reinitialization of library */ if (TCR_4(__kmp_global.g.g_done)) { @@ -7250,6 +7262,12 @@ static void __kmp_do_middle_initialize(void) { KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); + if (UNLIKELY(!__kmp_need_register_serial)) { + // We are in a forked child process. The registration was skipped during + // serial initialization in __kmp_atfork_child handler. Do it here. + __kmp_register_library_startup(); + } + // Save the previous value for the __kmp_dflt_team_nth so that // we can avoid some reinitialization if it hasn't changed. prev_dflt_team_nth = __kmp_dflt_team_nth; @@ -8101,6 +8119,15 @@ void __kmp_cleanup(void) { __kmp_root = NULL; __kmp_threads_capacity = 0; + // Free old __kmp_threads arrays if they exist. 
+ kmp_old_threads_list_t *ptr = __kmp_old_threads_list; + while (ptr) { + kmp_old_threads_list_t *next = ptr->next; + __kmp_free(ptr->threads); + __kmp_free(ptr); + ptr = next; + } + #if KMP_USE_DYNAMIC_LOCK __kmp_cleanup_indirect_user_locks(); #else @@ -8286,7 +8313,7 @@ void __kmp_aux_set_library(enum library_type arg) { break; case library_throughput: if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) - __kmp_dflt_blocktime = 200; + __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; break; default: KMP_FATAL(UnknownLibraryType, arg); @@ -8707,7 +8734,8 @@ __kmp_determine_reduction_method( KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ - ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) + (loc && \ + ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))) #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) retval = critical_reduce_block; @@ -8953,19 +8981,16 @@ void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); } // Release all the workers - kmp_uint64 new_value; // new value for go - new_value = team->t.b->go_release(); + team->t.b->go_release(); KMP_MFENCE(); // Workers should see transition status 2 and move to 0; but may need to be // woken up first - size_t my_go_index; int count = old_nthreads - 1; while (count > 0) { count = old_nthreads - 1; for (int f = 1; f < old_nthreads; ++f) { - my_go_index = f / team->t.b->threads_per_go; if (other_threads[f]->th.th_used_in_team.load() != 0) { if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( diff --git a/contrib/libs/cxxsupp/openmp/kmp_sched.cpp b/contrib/libs/cxxsupp/openmp/kmp_sched.cpp index 09e497e029..acd75448d2 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_sched.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_sched.cpp @@ -101,7 +101,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid, static kmp_int8 warn = 0; - if (ompt_enabled.ompt_callback_work) { + if (ompt_enabled.ompt_callback_work || ompt_enabled.ompt_callback_dispatch) { // Only fully initialize variables needed by OMPT if OMPT is enabled. team_info = __ompt_get_teaminfo(0, NULL); task_info = __ompt_get_task_info_object(0); @@ -194,8 +194,13 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid, // we are in DISTRIBUTE construct schedtype += kmp_sch_static - kmp_distribute_static; // AC: convert to usual schedule type - tid = th->th.th_team->t.t_master_tid; - team = th->th.th_team->t.t_parent; + if (th->th.th_team->t.t_serialized > 1) { + tid = 0; + team = th->th.th_team; + } else { + tid = th->th.th_team->t.t_master_tid; + team = th->th.th_team->t.t_parent; + } } else { tid = __kmp_tid_from_gtid(global_tid); team = th->th.th_team; @@ -433,6 +438,24 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid, ompt_work_type, ompt_scope_begin, &(team_info->parallel_data), &(task_info->task_data), trip_count, codeptr); } + if (ompt_enabled.ompt_callback_dispatch) { + ompt_dispatch_t dispatch_type; + ompt_data_t instance = ompt_data_none; + ompt_dispatch_chunk_t dispatch_chunk; + if (ompt_work_type == ompt_work_sections) { + dispatch_type = ompt_dispatch_section; + instance.ptr = codeptr; + } else { + OMPT_GET_DISPATCH_CHUNK(dispatch_chunk, *plower, *pupper, incr); + dispatch_type = (ompt_work_type == ompt_work_distribute) + ? 
ompt_dispatch_distribute_chunk + : ompt_dispatch_ws_loop_chunk; + instance.ptr = &dispatch_chunk; + } + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( + &(team_info->parallel_data), &(task_info->task_data), dispatch_type, + instance); + } #endif KMP_STATS_LOOP_END(OMP_loop_static_iterations); @@ -445,7 +468,12 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid, T *plower, T *pupper, T *pupperDist, typename traits_t<T>::signed_t *pstride, typename traits_t<T>::signed_t incr, - typename traits_t<T>::signed_t chunk) { + typename traits_t<T>::signed_t chunk +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + void *codeptr +#endif +) { KMP_COUNT_BLOCK(OMP_DISTRIBUTE); KMP_PUSH_PARTITIONED_TIMER(OMP_distribute); KMP_PUSH_PARTITIONED_TIMER(OMP_distribute_scheduling); @@ -677,6 +705,26 @@ end:; } #endif KE_TRACE(10, ("__kmpc_dist_for_static_init: T#%d return\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work || ompt_enabled.ompt_callback_dispatch) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_distribute, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), 0, codeptr); + } + if (ompt_enabled.ompt_callback_dispatch) { + ompt_data_t instance = ompt_data_none; + ompt_dispatch_chunk_t dispatch_chunk; + OMPT_GET_DISPATCH_CHUNK(dispatch_chunk, *plower, *pupperDist, incr); + instance.ptr = &dispatch_chunk; + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( + &(team_info->parallel_data), &(task_info->task_data), + ompt_dispatch_distribute_chunk, instance); + } + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL KMP_STATS_LOOP_END(OMP_distribute_iterations); return; } @@ -882,6 +930,12 @@ void __kmpc_for_static_init_8u(ident_t *loc, kmp_int32 gtid, @} */ +#if OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_CODEPTR_ARG , OMPT_GET_RETURN_ADDRESS(0) +#else +#define OMPT_CODEPTR_ARG +#endif + /*! @ingroup WORK_SHARING @param loc Source code location @@ -910,7 +964,8 @@ void __kmpc_dist_for_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *pupperD, kmp_int32 *pstride, kmp_int32 incr, kmp_int32 chunk) { __kmp_dist_for_static_init<kmp_int32>(loc, gtid, schedule, plastiter, plower, - pupper, pupperD, pstride, incr, chunk); + pupper, pupperD, pstride, incr, + chunk OMPT_CODEPTR_ARG); } /*! @@ -922,7 +977,8 @@ void __kmpc_dist_for_static_init_4u(ident_t *loc, kmp_int32 gtid, kmp_uint32 *pupperD, kmp_int32 *pstride, kmp_int32 incr, kmp_int32 chunk) { __kmp_dist_for_static_init<kmp_uint32>(loc, gtid, schedule, plastiter, plower, - pupper, pupperD, pstride, incr, chunk); + pupper, pupperD, pstride, incr, + chunk OMPT_CODEPTR_ARG); } /*! @@ -934,7 +990,8 @@ void __kmpc_dist_for_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int64 *pupperD, kmp_int64 *pstride, kmp_int64 incr, kmp_int64 chunk) { __kmp_dist_for_static_init<kmp_int64>(loc, gtid, schedule, plastiter, plower, - pupper, pupperD, pstride, incr, chunk); + pupper, pupperD, pstride, incr, + chunk OMPT_CODEPTR_ARG); } /*! @@ -946,7 +1003,8 @@ void __kmpc_dist_for_static_init_8u(ident_t *loc, kmp_int32 gtid, kmp_uint64 *pupperD, kmp_int64 *pstride, kmp_int64 incr, kmp_int64 chunk) { __kmp_dist_for_static_init<kmp_uint64>(loc, gtid, schedule, plastiter, plower, - pupper, pupperD, pstride, incr, chunk); + pupper, pupperD, pstride, incr, + chunk OMPT_CODEPTR_ARG); } /*! 
@} diff --git a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp index 112502fdce..38ff15461b 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_settings.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_settings.cpp @@ -812,6 +812,7 @@ static void __kmp_stg_parse_wait_policy(char const *name, char const *value, } } else if (__kmp_str_match("PASSIVE", 1, value)) { __kmp_library = library_throughput; + __kmp_wpolicy_passive = true; /* allow sleep while active tasking */ if (blocktime_str == NULL) { // KMP_BLOCKTIME not specified, so set default to 0. __kmp_dflt_blocktime = 0; @@ -1245,13 +1246,25 @@ static void __kmp_stg_parse_num_hidden_helper_threads(char const *name, // task if (__kmp_hidden_helper_threads_num == 0) { __kmp_enable_hidden_helper = FALSE; + } else { + // Since the main thread of hidden helper team dooes not participate + // in tasks execution let's increment the number of threads by one + // so that requested number of threads do actual job. + __kmp_hidden_helper_threads_num++; } } // __kmp_stg_parse_num_hidden_helper_threads static void __kmp_stg_print_num_hidden_helper_threads(kmp_str_buf_t *buffer, char const *name, void *data) { - __kmp_stg_print_int(buffer, name, __kmp_hidden_helper_threads_num); + if (__kmp_hidden_helper_threads_num == 0) { + __kmp_stg_print_int(buffer, name, __kmp_hidden_helper_threads_num); + } else { + KMP_DEBUG_ASSERT(__kmp_hidden_helper_threads_num > 1); + // Let's exclude the main thread of hidden helper team and print + // number of worker threads those do actual job. + __kmp_stg_print_int(buffer, name, __kmp_hidden_helper_threads_num - 1); + } } // __kmp_stg_print_num_hidden_helper_threads static void __kmp_stg_parse_use_hidden_helper(char const *name, @@ -2156,6 +2169,7 @@ static void __kmp_parse_affinity_env(char const *name, char const *value, int respect = 0; int gran = 0; int dups = 0; + int reset = 0; bool set = false; KMP_ASSERT(value != NULL); @@ -2211,6 +2225,7 @@ static void __kmp_parse_affinity_env(char const *name, char const *value, #define set_respect(val) _set_param(respect, *out_respect, val) #define set_dups(val) _set_param(dups, *out_dups, val) #define set_proclist(val) _set_param(proclist, *out_proclist, val) +#define set_reset(val) _set_param(reset, __kmp_affin_reset, val) #define set_gran(val, levels) \ { \ @@ -2280,6 +2295,12 @@ static void __kmp_parse_affinity_env(char const *name, char const *value, } else if (__kmp_match_str("norespect", buf, CCAST(const char **, &next))) { set_respect(FALSE); buf = next; + } else if (__kmp_match_str("reset", buf, CCAST(const char **, &next))) { + set_reset(TRUE); + buf = next; + } else if (__kmp_match_str("noreset", buf, CCAST(const char **, &next))) { + set_reset(FALSE); + buf = next; } else if (__kmp_match_str("duplicates", buf, CCAST(const char **, &next)) || __kmp_match_str("dups", buf, CCAST(const char **, &next))) { @@ -2420,6 +2441,7 @@ static void __kmp_parse_affinity_env(char const *name, char const *value, #undef set_warnings #undef set_respect #undef set_granularity +#undef set_reset __kmp_str_free(&buffer); @@ -2551,6 +2573,11 @@ static void __kmp_stg_print_affinity(kmp_str_buf_t *buffer, char const *name, } else { __kmp_str_buf_print(buffer, "%s,", "norespect"); } + if (__kmp_affin_reset) { + __kmp_str_buf_print(buffer, "%s,", "reset"); + } else { + __kmp_str_buf_print(buffer, "%s,", "noreset"); + } __kmp_str_buf_print(buffer, "granularity=%s,", __kmp_hw_get_keyword(__kmp_affinity_gran, false)); } @@ -5009,7 +5036,7 @@ static void 
__kmp_stg_parse_hw_subset(char const *name, char const *value, attr.set_core_type(KMP_HW_CORE_TYPE_CORE); } else if (__kmp_str_match("intel_atom", -1, attr_ptr + 1)) { attr.set_core_type(KMP_HW_CORE_TYPE_ATOM); - } + } else #endif if (__kmp_str_match("eff", 3, attr_ptr + 1)) { const char *number = attr_ptr + 1; diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp index 501830eaa7..6c1d93a891 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.cpp @@ -45,6 +45,9 @@ static void __kmp_init_node(kmp_depnode_t *node) { #ifdef KMP_SUPPORT_GRAPH_OUTPUT node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed); #endif +#if USE_ITT_BUILD && USE_ITT_NOTIFY + __itt_sync_create(node, "OMP task dep node", NULL, 0); +#endif } static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) { diff --git a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h index 99f182bbd0..ac6174afd3 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h +++ b/contrib/libs/cxxsupp/openmp/kmp_taskdeps.h @@ -25,6 +25,9 @@ static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) { kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1; KMP_DEBUG_ASSERT(n >= 0); if (n == 0) { +#if USE_ITT_BUILD && USE_ITT_NOTIFY + __itt_sync_destroy(node); +#endif KMP_ASSERT(node->dn.nrefs == 0); #if USE_FAST_MEMORY __kmp_fast_free(thread, node); @@ -125,11 +128,17 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) { kmp_taskdata_t *next_taskdata; for (kmp_depnode_list_t *p = node->dn.successors; p; p = next) { kmp_depnode_t *successor = p->node; +#if USE_ITT_BUILD && USE_ITT_NOTIFY + __itt_sync_releasing(successor); +#endif kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->dn.npredecessors) - 1; // successor task can be NULL for wait_depends or because deps are still // being processed if (npredecessors == 0) { +#if USE_ITT_BUILD && USE_ITT_NOTIFY + __itt_sync_acquired(successor); +#endif KMP_MB(); if (successor->dn.task) { KA_TRACE(20, ("__kmp_release_deps: T#%d successor %p of %p scheduled " diff --git a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp index e445438524..1622c6aea1 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_tasking.cpp @@ -319,6 +319,144 @@ static void __kmp_realloc_task_deque(kmp_info_t *thread, thread_data->td.td_deque_size = new_size; } +static kmp_task_pri_t *__kmp_alloc_task_pri_list() { + kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t)); + kmp_thread_data_t *thread_data = &l->td; + __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock); + thread_data->td.td_deque_last_stolen = -1; + KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] " + "for thread_data %p\n", + __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data)); + thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate( + INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *)); + thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE; + return l; +} + +// The function finds the deque of priority tasks with given priority, or +// allocates a new deque and put it into sorted (high -> low) list of deques. +// Deques of non-default priority tasks are shared between all threads in team, +// as opposed to per-thread deques of tasks with default priority. +// The function is called under the lock task_team->tt.tt_task_pri_lock. 
+static kmp_thread_data_t * +__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) { + kmp_thread_data_t *thread_data; + kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list; + if (lst->priority == pri) { + // Found queue of tasks with given priority. + thread_data = &lst->td; + } else if (lst->priority < pri) { + // All current priority queues contain tasks with lower priority. + // Allocate new one for given priority tasks. + kmp_task_pri_t *list = __kmp_alloc_task_pri_list(); + thread_data = &list->td; + list->priority = pri; + list->next = lst; + task_team->tt.tt_task_pri_list = list; + } else { // task_team->tt.tt_task_pri_list->priority > pri + kmp_task_pri_t *next_queue = lst->next; + while (next_queue && next_queue->priority > pri) { + lst = next_queue; + next_queue = lst->next; + } + // lst->priority > pri && (next == NULL || pri >= next->priority) + if (next_queue == NULL) { + // No queue with pri priority, need to allocate new one. + kmp_task_pri_t *list = __kmp_alloc_task_pri_list(); + thread_data = &list->td; + list->priority = pri; + list->next = NULL; + lst->next = list; + } else if (next_queue->priority == pri) { + // Found queue of tasks with given priority. + thread_data = &next_queue->td; + } else { // lst->priority > pri > next->priority + // insert newly allocated between existed queues + kmp_task_pri_t *list = __kmp_alloc_task_pri_list(); + thread_data = &list->td; + list->priority = pri; + list->next = next_queue; + lst->next = list; + } + } + return thread_data; +} + +// __kmp_push_priority_task: Add a task to the team's priority task deque +static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread, + kmp_taskdata_t *taskdata, + kmp_task_team_t *task_team, + kmp_int32 pri) { + kmp_thread_data_t *thread_data = NULL; + KA_TRACE(20, + ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n", + gtid, taskdata, pri)); + + // Find task queue specific to priority value + kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list; + if (UNLIKELY(lst == NULL)) { + __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + if (task_team->tt.tt_task_pri_list == NULL) { + // List of queues is still empty, allocate one. + kmp_task_pri_t *list = __kmp_alloc_task_pri_list(); + thread_data = &list->td; + list->priority = pri; + list->next = NULL; + task_team->tt.tt_task_pri_list = list; + } else { + // Other thread initialized a queue. Check if it fits and get thread_data. + thread_data = __kmp_get_priority_deque_data(task_team, pri); + } + __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + } else { + if (lst->priority == pri) { + // Found queue of tasks with given priority. 
+ thread_data = &lst->td; + } else { + __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + thread_data = __kmp_get_priority_deque_data(task_team, pri); + __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + } + } + KMP_DEBUG_ASSERT(thread_data); + + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + // Check if deque is full + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + if (__kmp_enable_task_throttling && + __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, + thread->th.th_current_task)) { + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning " + "TASK_NOT_PUSHED for task %p\n", + gtid, taskdata)); + return TASK_NOT_PUSHED; + } else { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } + } + KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < + TASK_DEQUE_SIZE(thread_data->td)); + // Push taskdata. + thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; + // Wrap index. + thread_data->td.td_deque_tail = + (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); + TCW_4(thread_data->td.td_deque_ntasks, + TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count + KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self + KMP_FSYNC_RELEASING(taskdata); // releasing child + KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning " + "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n", + gtid, taskdata, thread_data->td.td_deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + task_team->tt.tt_num_task_pri++; // atomic inc + return TASK_SUCCESSFULLY_PUSHED; +} + // __kmp_push_task: Add a task to the thread's deque static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { kmp_info_t *thread = __kmp_threads[gtid]; @@ -371,6 +509,12 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); + if (taskdata->td_flags.priority_specified && task->data2.priority > 0 && + __kmp_max_task_priority > 0) { + int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority); + return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri); + } + // Find tasking deque specific to encountering thread thread_data = &task_team->tt.tt_threads_data[tid]; @@ -567,6 +711,8 @@ static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) { ompt_frame_runtime | ompt_frame_framepointer; task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer; + task->ompt_task_info.dispatch_chunk.start = 0; + task->ompt_task_info.dispatch_chunk.iterations = 0; } // __ompt_task_start: @@ -728,6 +874,10 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata, KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 || taskdata->td_flags.task_serial == 1); KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0); + kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata); + // Clear data to not be re-used later by mistake. 
+ task->data1.destructors = NULL; + task->data2.priority = 0; taskdata->td_flags.freed = 1; // deallocate the taskdata and shared variable blocks associated with this task @@ -1599,6 +1749,18 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, if (UNLIKELY(ompt_enabled.enabled)) __ompt_task_start(task, current_task, gtid); #endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (UNLIKELY(ompt_enabled.ompt_callback_dispatch && + taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) { + ompt_data_t instance = ompt_data_none; + instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk); + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( + &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data), + ompt_dispatch_taskloop_chunk, instance); + taskdata->ompt_task_info.dispatch_chunk = {0, 0}; + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL #if OMPD_SUPPORT if (ompd_state & OMPD_ENABLE_BP) @@ -1747,8 +1909,21 @@ kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, if (serialize_immediate) new_taskdata->td_flags.task_serial = 1; __kmp_invoke_task(gtid, new_task, current_task); + } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && + __kmp_wpolicy_passive) { + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; + kmp_int32 nthreads = this_thr->th.th_team_nproc; + for (int i = 0; i < nthreads; ++i) { + kmp_info_t *thread = team->t.t_threads[i]; + if (thread == this_thr) + continue; + if (thread->th.th_sleep_loc != NULL) { + __kmp_null_resume_wrapper(thread); + break; // awake one thread at a time + } + } } - return TASK_CURRENT_NOT_QUEUED; } @@ -2089,7 +2264,7 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { Flags for special info per task reduction item. */ typedef struct kmp_taskred_flags { - /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */ + /*! 1 - use lazy alloc/init (e.g. 
big objects, num tasks < num threads) */ unsigned lazy_priv : 1; unsigned reserved31 : 31; } kmp_taskred_flags_t; @@ -2667,6 +2842,105 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) { #endif } +static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid, + kmp_task_team_t *task_team, + kmp_int32 is_constrained) { + kmp_task_t *task = NULL; + kmp_taskdata_t *taskdata; + kmp_taskdata_t *current; + kmp_thread_data_t *thread_data; + int ntasks = task_team->tt.tt_num_task_pri; + if (ntasks == 0) { + KA_TRACE( + 20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid)); + return NULL; + } + do { + // decrement num_tasks to "reserve" one task to get for execution + if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks, + ntasks - 1)) + break; + } while (ntasks > 0); + if (ntasks == 0) { + KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n", + __kmp_get_gtid())); + return NULL; + } + // We got a "ticket" to get a "reserved" priority task + int deque_ntasks; + kmp_task_pri_t *list = task_team->tt.tt_task_pri_list; + do { + KMP_ASSERT(list != NULL); + thread_data = &list->td; + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + deque_ntasks = thread_data->td.td_deque_ntasks; + if (deque_ntasks == 0) { + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n", + __kmp_get_gtid(), thread_data)); + list = list->next; + } + } while (deque_ntasks == 0); + KMP_DEBUG_ASSERT(deque_ntasks); + int target = thread_data->td.td_deque_head; + current = __kmp_threads[gtid]->th.th_current_task; + taskdata = thread_data->td.td_deque[target]; + if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { + // Bump head pointer and Wrap. 
+ thread_data->td.td_deque_head = + (target + 1) & TASK_DEQUE_MASK(thread_data->td); + } else { + if (!task_team->tt.tt_untied_task_encountered) { + // The TSC does not allow to steal victim task + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task " + "from %p: task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, thread_data, task_team, deque_ntasks, target, + thread_data->td.td_deque_tail)); + task_team->tt.tt_num_task_pri++; // atomic inc, restore value + return NULL; + } + int i; + // walk through the deque trying to steal any task + taskdata = NULL; + for (i = 1; i < deque_ntasks; ++i) { + target = (target + 1) & TASK_DEQUE_MASK(thread_data->td); + taskdata = thread_data->td.td_deque[target]; + if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { + break; // found task to execute + } else { + taskdata = NULL; + } + } + if (taskdata == NULL) { + // No appropriate candidate found to execute + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE( + 10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from " + "%p: task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, thread_data, task_team, deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + task_team->tt.tt_num_task_pri++; // atomic inc, restore value + return NULL; + } + int prev = target; + for (i = i + 1; i < deque_ntasks; ++i) { + // shift remaining tasks in the deque left by 1 + target = (target + 1) & TASK_DEQUE_MASK(thread_data->td); + thread_data->td.td_deque[prev] = thread_data->td.td_deque[target]; + prev = target; + } + KMP_DEBUG_ASSERT( + thread_data->td.td_deque_tail == + (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td))); + thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)) + } + thread_data->td.td_deque_ntasks = deque_ntasks - 1; + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + task = KMP_TASKDATA_TO_TASK(taskdata); + return task; +} + // __kmp_remove_my_task: remove a task from my own deque static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, kmp_task_team_t *task_team, @@ -2916,10 +3190,13 @@ static inline int __kmp_execute_tasks_template( // getting tasks from target constructs while (1) { // Inner loop to find a task and execute it task = NULL; - if (use_own_tasks) { // check on own queue first + if (task_team->tt.tt_num_task_pri) { // get priority task first + task = __kmp_get_priority_task(gtid, task_team, is_constrained); + } + if (task == NULL && use_own_tasks) { // check own queue next task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); } - if ((task == NULL) && (nthreads > 1)) { // Steal a task + if ((task == NULL) && (nthreads > 1)) { // Steal a task finally int asleep = 1; use_own_tasks = 0; // Try to steal from the last place I stole from successfully. @@ -3440,6 +3717,24 @@ static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); } +// __kmp_free_task_pri_list: +// Deallocates tasking deques used for priority tasks. +// Only occurs at library shutdown. 
+static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) { + __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + if (task_team->tt.tt_task_pri_list != NULL) { + kmp_task_pri_t *list = task_team->tt.tt_task_pri_list; + while (list != NULL) { + kmp_task_pri_t *next = list->next; + __kmp_free_task_deque(&list->td); + __kmp_free(list); + list = next; + } + task_team->tt.tt_task_pri_list = NULL; + } + __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock); +} + // __kmp_allocate_task_team: // Allocates a task team associated with a specific team, taking it from // the global task team free list if possible. Also initializes data @@ -3471,6 +3766,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, // __kmp_thread_malloc because threads not around for kmp_reap_task_team. task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); + __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock); #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG // suppress race conditions detection on synchronization flags in debug mode // this helps to analyze library internals eliminating false positives @@ -3540,6 +3836,9 @@ void __kmp_reap_task_teams(void) { if (task_team->tt.tt_threads_data != NULL) { __kmp_free_task_threads_data(task_team); } + if (task_team->tt.tt_task_pri_list != NULL) { + __kmp_free_task_pri_list(task_team); + } __kmp_free(task_team); } __kmp_release_bootstrap_lock(&__kmp_task_team_lock); @@ -3996,6 +4295,17 @@ void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) { pass = pass << 1; } while (!__kmp_give_task(thread, k, ptask, pass)); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) { + // awake at least one thread to execute given task + for (int i = 0; i < nthreads; ++i) { + thread = team->t.t_threads[i]; + if (thread->th.th_sleep_loc != NULL) { + __kmp_null_resume_wrapper(thread); + break; + } + } + } } /*! @@ -4371,6 +4681,12 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, #if OMPT_SUPPORT __kmp_omp_taskloop_task(NULL, gtid, next_task, codeptr_ra); // schedule new task +#if OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dispatch) { + OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk, + lower, upper, st); + } +#endif // OMPT_OPTIONAL #else __kmp_omp_task(gtid, next_task, true); // schedule new task #endif @@ -4800,7 +5116,7 @@ void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, @param nogroup Flag, 1 if nogroup clause specified, 0 otherwise @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks @param grainsize Schedule value if specified -@param modifer Modifier 'strict' for sched, 1 if present, 0 otherwise +@param modifier Modifier 'strict' for sched, 1 if present, 0 otherwise @param task_dup Tasks duplication routine Execute the taskloop construct. diff --git a/contrib/libs/cxxsupp/openmp/kmp_version.cpp b/contrib/libs/cxxsupp/openmp/kmp_version.cpp index db2454c0f4..bb600c120d 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_version.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_version.cpp @@ -22,7 +22,9 @@ #define stringer(x) _stringer(x) // Detect compiler. 
-#if KMP_COMPILER_ICC +#if KMP_COMPILER_ICX +#define KMP_COMPILER __VERSION__ +#elif KMP_COMPILER_ICC #if __INTEL_COMPILER == 1010 #define KMP_COMPILER "Intel(R) C++ Compiler 10.1" #elif __INTEL_COMPILER == 1100 @@ -53,8 +55,10 @@ #define KMP_COMPILER "Intel(R) C++ Compiler 19.0" #elif __INTEL_COMPILER == 1910 #define KMP_COMPILER "Intel(R) C++ Compiler 19.1" -#elif __INTEL_COMPILER >= 9900 -#define KMP_COMPILER "Intel(R) C++ Compiler mainline" +#elif __INTEL_COMPILER > 1910 +#define KMP_COMPILER \ + "Intel(R) C++ Compiler Classic " stringer(__INTEL_COMPILER) "." stringer( \ + __INTEL_COMPILER_UPDATE) #endif #elif KMP_COMPILER_CLANG #define KMP_COMPILER \ diff --git a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h index b32cb15de1..3fcae5687d 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_wait_release.h +++ b/contrib/libs/cxxsupp/openmp/kmp_wait_release.h @@ -105,7 +105,7 @@ template <flag_type FlagType> class kmp_flag { protected: flag_properties t; /**< "Type" of the flag in loc */ kmp_info_t *waiting_threads[1]; /**< Threads sleeping on this thread. */ - kmp_uint32 num_waiting_threads; /**< #threads sleeping on this thread. */ + kmp_uint32 num_waiting_threads; /**< Num threads sleeping on this thread. */ std::atomic<bool> *sleepLoc; public: @@ -609,7 +609,8 @@ final_spin=FALSE) continue; // Don't suspend if there is a likelihood of new tasks being spawned. - if ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks)) + if (task_team != NULL && TCR_4(task_team->tt.tt_found_tasks) && + !__kmp_wpolicy_passive) continue; #if KMP_USE_MONITOR @@ -625,10 +626,6 @@ final_spin=FALSE) if (!Sleepable) continue; - if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && - __kmp_pause_status != kmp_soft_paused) - continue; - #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT if (__kmp_mwait_enabled || __kmp_umwait_enabled) { KF_TRACE(50, ("__kmp_wait_sleep: T#%d using monitor/mwait\n", th_gtid)); diff --git a/contrib/libs/cxxsupp/openmp/omp-tools.h b/contrib/libs/cxxsupp/openmp/omp-tools.h index 5092174d66..6bae305c71 100644 --- a/contrib/libs/cxxsupp/openmp/omp-tools.h +++ b/contrib/libs/cxxsupp/openmp/omp-tools.h @@ -266,7 +266,10 @@ typedef enum ompt_scope_endpoint_t { typedef enum ompt_dispatch_t { ompt_dispatch_iteration = 1, - ompt_dispatch_section = 2 + ompt_dispatch_section = 2, + ompt_dispatch_ws_loop_chunk = 3, + ompt_dispatch_taskloop_chunk = 4, + ompt_dispatch_distribute_chunk = 5 } ompt_dispatch_t; typedef enum ompt_sync_region_t { @@ -303,7 +306,11 @@ typedef enum ompt_work_t { ompt_work_workshare = 5, ompt_work_distribute = 6, ompt_work_taskloop = 7, - ompt_work_scope = 8 + ompt_work_scope = 8, + ompt_work_loop_static = 10, + ompt_work_loop_dynamic = 11, + ompt_work_loop_guided = 12, + ompt_work_loop_other = 13 } ompt_work_t; typedef enum ompt_mutex_t { @@ -554,6 +561,11 @@ typedef struct ompt_dependence_t { ompt_dependence_type_t dependence_type; } ompt_dependence_t; +typedef struct ompt_dispatch_chunk_t { + uint64_t start; + uint64_t iterations; +} ompt_dispatch_chunk_t; + typedef int (*ompt_enumerate_states_t) ( int current_state, int *next_state, @@ -745,7 +757,7 @@ typedef struct ompt_record_parallel_end_t { } ompt_record_parallel_end_t; typedef void (*ompt_callback_work_t) ( - ompt_work_t wstype, + ompt_work_t work_type, ompt_scope_endpoint_t endpoint, ompt_data_t *parallel_data, ompt_data_t *task_data, @@ -754,7 +766,7 @@ typedef void (*ompt_callback_work_t) ( ); typedef struct ompt_record_work_t { - ompt_work_t wstype; + ompt_work_t work_type; 
ompt_scope_endpoint_t endpoint; ompt_id_t parallel_id; ompt_id_t task_id; diff --git a/contrib/libs/cxxsupp/openmp/omp.h b/contrib/libs/cxxsupp/openmp/omp.h index 2ddf4f630b..959e87359d 100644 --- a/contrib/libs/cxxsupp/openmp/omp.h +++ b/contrib/libs/cxxsupp/openmp/omp.h @@ -374,7 +374,6 @@ extern __KMP_IMP omp_allocator_handle_t const omp_cgroup_mem_alloc; extern __KMP_IMP omp_allocator_handle_t const omp_pteam_mem_alloc; extern __KMP_IMP omp_allocator_handle_t const omp_thread_mem_alloc; - /* Preview of target memory support */ extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_host_mem_alloc; extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc; extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_device_mem_alloc; @@ -385,7 +384,6 @@ extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space; - /* Preview of target memory support */ extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space; @@ -405,7 +403,6 @@ omp_cgroup_mem_alloc = 6, omp_pteam_mem_alloc = 7, omp_thread_mem_alloc = 8, - /* Preview of target memory support */ llvm_omp_target_host_mem_alloc = 100, llvm_omp_target_shared_mem_alloc = 101, llvm_omp_target_device_mem_alloc = 102, @@ -422,7 +419,6 @@ omp_const_mem_space = 2, omp_high_bw_mem_space = 3, omp_low_lat_mem_space = 4, - /* Preview of target memory support */ llvm_omp_target_host_mem_space = 100, llvm_omp_target_shared_mem_space = 101, llvm_omp_target_device_mem_space = 102, @@ -503,6 +499,12 @@ #pragma omp end declare variant # endif + /* OpenMP 5.2 */ + extern int __KAI_KMPC_CONVENTION omp_in_explicit_task(void); + + /* LLVM Extensions */ + extern void *llvm_omp_target_dynamic_shared_alloc(); + # undef __KAI_KMPC_CONVENTION # undef __KMP_IMP diff --git a/contrib/libs/cxxsupp/openmp/ompt-event-specific.h b/contrib/libs/cxxsupp/openmp/ompt-event-specific.h index 875d6921b7..f6c7022c8f 100644 --- a/contrib/libs/cxxsupp/openmp/ompt-event-specific.h +++ b/contrib/libs/cxxsupp/openmp/ompt-event-specific.h @@ -104,7 +104,7 @@ #define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL -#define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_dispatch_implemented ompt_event_MAY_ALWAYS_OPTIONAL #define ompt_callback_error_implemented ompt_event_MAY_ALWAYS_OPTIONAL diff --git a/contrib/libs/cxxsupp/openmp/ompt-general.cpp b/contrib/libs/cxxsupp/openmp/ompt-general.cpp index c1468c0c32..0bee7e77c8 100644 --- a/contrib/libs/cxxsupp/openmp/ompt-general.cpp +++ b/contrib/libs/cxxsupp/openmp/ompt-general.cpp @@ -310,7 +310,8 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) { OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname); HMODULE h = LoadLibrary(fname); if (!h) { - OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: Error %u\n", GetLastError()); + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: Error %u\n", + (unsigned)GetLastError()); } else { OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success. \n"); OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... 
", @@ -318,7 +319,7 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) { start_tool = (ompt_start_tool_t)GetProcAddress(h, "ompt_start_tool"); if (!start_tool) { OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: Error %u\n", - GetLastError()); + (unsigned)GetLastError()); } else #else #error Activation of OMPT is not supported on this platform. @@ -789,7 +790,7 @@ OMPT_API_ROUTINE int ompt_get_partition_place_nums(int place_nums_size, OMPT_API_ROUTINE int ompt_get_proc_id(void) { if (!ompt_enabled.enabled || __kmp_get_gtid() < 0) return -1; -#if KMP_OS_LINUX +#if KMP_HAVE_SCHED_GETCPU return sched_getcpu(); #elif KMP_OS_WINDOWS PROCESSOR_NUMBER pn; diff --git a/contrib/libs/cxxsupp/openmp/ompt-internal.h b/contrib/libs/cxxsupp/openmp/ompt-internal.h index 6665bb5e83..a85fe3835c 100644 --- a/contrib/libs/cxxsupp/openmp/ompt-internal.h +++ b/contrib/libs/cxxsupp/openmp/ompt-internal.h @@ -57,6 +57,7 @@ typedef struct { ompt_data_t task_data; struct kmp_taskdata *scheduling_parent; int thread_num; + ompt_dispatch_chunk_t dispatch_chunk; } ompt_task_info_t; typedef struct { diff --git a/contrib/libs/cxxsupp/openmp/ompt-specific.h b/contrib/libs/cxxsupp/openmp/ompt-specific.h index 2fc7ee1c35..bd1e0d8991 100644 --- a/contrib/libs/cxxsupp/openmp/ompt-specific.h +++ b/contrib/libs/cxxsupp/openmp/ompt-specific.h @@ -89,6 +89,17 @@ inline void *__ompt_load_return_address(int gtid) { ? __ompt_load_return_address(gtid) \ : __builtin_return_address(0)) +#define OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, incr) \ + do { \ + if (incr > 0) { \ + chunk.start = static_cast<uint64_t>(lb); \ + chunk.iterations = static_cast<uint64_t>(((ub) - (lb)) / (incr) + 1); \ + } else { \ + chunk.start = static_cast<uint64_t>(ub); \ + chunk.iterations = static_cast<uint64_t>(((lb) - (ub)) / -(incr) + 1); \ + } \ + } while (0) + //****************************************************************************** // inline functions //****************************************************************************** diff --git a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp index 5cd6ad6a03..91edf0254a 100644 --- a/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp +++ b/contrib/libs/cxxsupp/openmp/z_Linux_util.cpp @@ -1297,7 +1297,13 @@ static void __kmp_atfork_child(void) { __kmp_itt_reset(); // reset ITT's global state #endif /* USE_ITT_BUILD */ - __kmp_serial_initialize(); + { + // Child process often get terminated without any use of OpenMP. That might + // cause mapped shared memory file to be left unattended. Thus we postpone + // library registration till middle initialization in the child process. 
+ __kmp_need_register_serial = FALSE; + __kmp_serial_initialize(); + } /* This is necessary to make sure no stale data is left around */ /* AC: customers complain that we use unsafe routines in the atfork diff --git a/library/cpp/json/converter/converter.h b/library/cpp/json/converter/converter.h index be86c31595..04f26b6946 100644 --- a/library/cpp/json/converter/converter.h +++ b/library/cpp/json/converter/converter.h @@ -1,3 +1,5 @@ +#pragma once + #include "library/cpp/json/writer/json_value.h" #include <limits> diff --git a/library/cpp/json/converter/ut/test_conversion.cpp b/library/cpp/json/converter/ut/test_conversion.cpp index c2c857835a..84027dda75 100644 --- a/library/cpp/json/converter/ut/test_conversion.cpp +++ b/library/cpp/json/converter/ut/test_conversion.cpp @@ -30,19 +30,15 @@ namespace NJson { static void TestEncoding(const TValues& values) { for (const auto& [serializedValue, value] : values) { - { - const auto encodedValue = TConverter<T>::Encode(value); - AssertJsonsEqual(encodedValue, serializedValue); - } + const auto encodedValue = TConverter<T>::Encode(value); + AssertJsonsEqual(encodedValue, serializedValue); } } static void TestDecoding(const TValues& values) { for (const auto& [serializedValue, value] : values) { - { - const auto decodedValue = TConverter<T>::Decode(ReadJsonFastTree(serializedValue)); - UNIT_ASSERT_EQUAL(decodedValue, value); - } + const auto decodedValue = TConverter<T>::Decode(ReadJsonFastTree(serializedValue)); + UNIT_ASSERT_EQUAL(decodedValue, value); } } @@ -59,11 +55,9 @@ namespace NJson { TestDecoding(values); for (const auto& [serializedValue, value] : values) { - { - const auto encodedValue = TConverter<T>::Encode(value); - const auto decodedValue = TConverter<T>::Decode(encodedValue); - UNIT_ASSERT_EQUAL(value, decodedValue); - } + const auto encodedValue = TConverter<T>::Encode(value); + const auto decodedValue = TConverter<T>::Decode(encodedValue); + UNIT_ASSERT_EQUAL(value, decodedValue); } } };
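
The ompt_dispatch_chunk_t type (omp-tools.h) and the OMPT_GET_DISPATCH_CHUNK macro (ompt-specific.h) added in the hunks above drive the new ompt_callback_dispatch reporting: for each worksharing-loop, distribute, or taskloop chunk the runtime records the first iteration value and the trip count, taking the lower bound for a positive increment and the upper bound for a negative one. The stand-alone C++ sketch below only mirrors that arithmetic for illustration; the names dispatch_chunk, get_dispatch_chunk, and main() are hypothetical stand-ins and are not part of the patch.

#include <cstdint>
#include <cstdio>

// Stand-in for ompt_dispatch_chunk_t: first iteration value and trip count.
struct dispatch_chunk {
  uint64_t start;
  uint64_t iterations;
};

// Mirrors the arithmetic of OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, incr).
template <typename T>
void get_dispatch_chunk(dispatch_chunk &chunk, T lb, T ub, T incr) {
  if (incr > 0) {
    chunk.start = static_cast<uint64_t>(lb);
    chunk.iterations = static_cast<uint64_t>((ub - lb) / incr + 1);
  } else {
    chunk.start = static_cast<uint64_t>(ub);
    chunk.iterations = static_cast<uint64_t>((lb - ub) / -incr + 1);
  }
}

int main() {
  dispatch_chunk c{};
  get_dispatch_chunk(c, 0, 99, 4);    // ascending chunk 0,4,...,96: start=0, iterations=25
  std::printf("start=%llu iterations=%llu\n",
              (unsigned long long)c.start, (unsigned long long)c.iterations);
  get_dispatch_chunk(c, 90, 10, -10); // descending chunk 90,80,...,10: start=10, iterations=9
  std::printf("start=%llu iterations=%llu\n",
              (unsigned long long)c.start, (unsigned long long)c.iterations);
  return 0;
}

A tool registering ompt_callback_dispatch (now reported as ompt_event_MAY_ALWAYS_OPTIONAL per the ompt-event-specific.h hunk) would receive a pointer to such a chunk descriptor in the instance argument for the ompt_dispatch_ws_loop_chunk, ompt_dispatch_taskloop_chunk, and ompt_dispatch_distribute_chunk cases added to ompt_dispatch_t.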