author    | thegeorg <thegeorg@yandex-team.ru> | 2022-06-03 10:53:07 +0300
committer | thegeorg <thegeorg@yandex-team.ru> | 2022-06-03 10:53:07 +0300
commit    | a1d4361e379e2c72a469ad1bd64569cbc2db131f (patch)
tree      | 0caddb240a10132376e4653a31578e117d33f9fd /contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
parent    | 41f55a521834080d9d703c099c0418cfff3a0546 (diff)
download  | ydb-a1d4361e379e2c72a469ad1bd64569cbc2db131f.tar.gz
Update contrib/libs/cxxsupp/openmp to 14.0.4
ref:77c6cdda99b217d50c4deadca11f5611fa0dc168
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_affinity.cpp')
-rw-r--r-- | contrib/libs/cxxsupp/openmp/kmp_affinity.cpp | 718
1 file changed, 667 insertions, 51 deletions
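Most of the patch below adds hybrid-CPU awareness: each hardware thread now carries a core type and core efficiency (detected via hwloc cpukinds or CPUID leaf 0x1A), and KMP_HW_SUBSET can filter on those attributes. As a rough, hedged sketch of the CPUID.1A probe that the new __kmp_get_hybrid_info() in the diff performs — using the compiler's <cpuid.h> helper rather than the runtime's internal __kmp_x86_cpuid / __kmp_extract_bits wrappers, with illustrative names throughout:

```cpp
// Hedged sketch (not the runtime's internal API): standalone version of the
// CPUID.1A core-type probe performed by __kmp_get_hybrid_info() in the diff.
// Assumes x86/x86_64 with GCC or Clang; like the runtime, the calling thread
// should already be pinned to the CPU it wants to classify.
#include <cpuid.h>
#include <cstdio>

enum class core_kind { unknown, atom, core };

static core_kind probe_core_kind() {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  // Leaf 0x1A is reported only on hybrid parts; unsupported leaves return 0.
  if (!__get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx))
    return core_kind::unknown;
  unsigned type = (eax >> 24) & 0xff; // EAX bits 31:24 = native core type
  if (type == 0x20)                   // Intel Atom (efficiency core)
    return core_kind::atom;
  if (type == 0x40)                   // Intel Core (performance core)
    return core_kind::core;
  return core_kind::unknown;
}

int main() {
  switch (probe_core_kind()) {
  case core_kind::atom: std::puts("Intel Atom(R) processor"); break;
  case core_kind::core: std::puts("Intel(R) Core(TM) processor"); break;
  default:              std::puts("unknown"); break;
  }
}
```

On this detection path the runtime simply maps Atom cores to efficiency 0 and Core cores to efficiency 1, as the switch in __kmp_get_hybrid_info() shows further down in the diff.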
diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp index 8b40bd7ecd..414a27fb05 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp @@ -26,6 +26,7 @@ #define HWLOC_GROUP_KIND_INTEL_DIE 104 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 #endif +#include <ctype.h> // The machine topology kmp_topology_t *__kmp_topology = nullptr; @@ -123,6 +124,20 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { return ((plural) ? "unknowns" : "unknown"); } +const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "Intel Atom(R) processor"; + case KMP_HW_CORE_TYPE_CORE: + return "Intel(R) Core(TM) processor"; +#endif + } + return "unknown"; +} + //////////////////////////////////////////////////////////////////////////////// // kmp_hw_thread_t methods int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { @@ -174,20 +189,94 @@ void kmp_hw_thread_t::print() const { for (int i = 0; i < depth; ++i) { printf("%4d ", ids[i]); } + if (attrs) { + if (attrs.is_core_type_valid()) + printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type())); + if (attrs.is_core_eff_valid()) + printf(" (eff=%d)", attrs.get_core_eff()); + } printf("\n"); } //////////////////////////////////////////////////////////////////////////////// // kmp_topology_t methods +// Add a layer to the topology based on the ids. Assume the topology +// is perfectly nested (i.e., so no object has more than one parent) +void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) { + // Figure out where the layer should go by comparing the ids of the current + // layers with the new ids + int target_layer; + int previous_id = kmp_hw_thread_t::UNKNOWN_ID; + int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID; + + // Start from the highest layer and work down to find target layer + // If new layer is equal to another layer then put the new layer above + for (target_layer = 0; target_layer < depth; ++target_layer) { + bool layers_equal = true; + bool strictly_above_target_layer = false; + for (int i = 0; i < num_hw_threads; ++i) { + int id = hw_threads[i].ids[target_layer]; + int new_id = ids[i]; + if (id != previous_id && new_id == previous_new_id) { + // Found the layer we are strictly above + strictly_above_target_layer = true; + layers_equal = false; + break; + } else if (id == previous_id && new_id != previous_new_id) { + // Found a layer we are below. Move to next layer and check. + layers_equal = false; + break; + } + previous_id = id; + previous_new_id = new_id; + } + if (strictly_above_target_layer || layers_equal) + break; + } + + // Found the layer we are above. Now move everything to accommodate the new + // layer. And put the new ids and type into the topology. 
+ for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + types[j] = types[i]; + types[target_layer] = type; + for (int k = 0; k < num_hw_threads; ++k) { + for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + hw_threads[k].ids[j] = hw_threads[k].ids[i]; + hw_threads[k].ids[target_layer] = ids[k]; + } + equivalent[type] = type; + depth++; +} + +#if KMP_GROUP_AFFINITY +// Insert the Windows Processor Group structure into the topology +void kmp_topology_t::_insert_windows_proc_groups() { + // Do not insert the processor group structure for a single group + if (__kmp_num_proc_groups == 1) + return; + kmp_affin_mask_t *mask; + int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads); + KMP_CPU_ALLOC(mask); + for (int i = 0; i < num_hw_threads; ++i) { + KMP_CPU_ZERO(mask); + KMP_CPU_SET(hw_threads[i].os_id, mask); + ids[i] = __kmp_get_proc_group(mask); + } + KMP_CPU_FREE(mask); + _insert_layer(KMP_HW_PROC_GROUP, ids); + __kmp_free(ids); +} +#endif + // Remove layers that don't add information to the topology. // This is done by having the layer take on the id = UNKNOWN_ID (-1) void kmp_topology_t::_remove_radix1_layers() { int preference[KMP_HW_LAST]; int top_index1, top_index2; // Set up preference associative array - preference[KMP_HW_PROC_GROUP] = 110; - preference[KMP_HW_SOCKET] = 100; + preference[KMP_HW_SOCKET] = 110; + preference[KMP_HW_PROC_GROUP] = 100; preference[KMP_HW_CORE] = 95; preference[KMP_HW_THREAD] = 90; preference[KMP_HW_NUMA] = 85; @@ -305,6 +394,7 @@ void kmp_topology_t::_gather_enumeration_information() { count[i] = 0; ratio[i] = 0; } + int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; for (int layer = 0; layer < depth; ++layer) { @@ -320,6 +410,29 @@ void kmp_topology_t::_gather_enumeration_information() { ratio[l] = max[l]; max[l] = 1; } + // Figure out the number of different core types + // and efficiencies for hybrid CPUs + if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) { + if (hw_thread.attrs.is_core_eff_valid() && + hw_thread.attrs.core_eff >= num_core_efficiencies) { + // Because efficiencies can range from 0 to max efficiency - 1, + // the number of efficiencies is max efficiency + 1 + num_core_efficiencies = hw_thread.attrs.core_eff + 1; + } + if (hw_thread.attrs.is_core_type_valid()) { + bool found = false; + for (int j = 0; j < num_core_types; ++j) { + if (hw_thread.attrs.get_core_type() == core_types[j]) { + found = true; + break; + } + } + if (!found) { + KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES); + core_types[num_core_types++] = hw_thread.attrs.get_core_type(); + } + } + } break; } } @@ -333,6 +446,42 @@ void kmp_topology_t::_gather_enumeration_information() { } } +int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr, + int above_level, + bool find_all) const { + int current, current_max; + int previous_id[KMP_HW_LAST]; + for (int i = 0; i < depth; ++i) + previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; + int core_level = get_level(KMP_HW_CORE); + if (find_all) + above_level = -1; + KMP_ASSERT(above_level < core_level); + current_max = 0; + current = 0; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) { + if (current > current_max) + current_max = current; + current = hw_thread.attrs.contains(attr); + } else { + for (int level = above_level + 1; level <= core_level; ++level) { + if 
(hw_thread.ids[level] != previous_id[level]) { + if (hw_thread.attrs.contains(attr)) + current++; + break; + } + } + } + for (int level = 0; level < depth; ++level) + previous_id[level] = hw_thread.ids[level]; + } + if (current > current_max) + current_max = current; + return current_max; +} + // Find out if the topology is uniform void kmp_topology_t::_discover_uniformity() { int num = 1; @@ -406,7 +555,7 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, kmp_topology_t *retval; // Allocate all data in one large allocation size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc + - sizeof(int) * ndepth * 3; + sizeof(int) * (size_t)KMP_HW_LAST * 3; char *bytes = (char *)__kmp_allocate(size); retval = (kmp_topology_t *)bytes; if (nproc > 0) { @@ -419,8 +568,12 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, int *arr = (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc); retval->types = (kmp_hw_t *)arr; - retval->ratio = arr + ndepth; - retval->count = arr + 2 * ndepth; + retval->ratio = arr + (size_t)KMP_HW_LAST; + retval->count = arr + 2 * (size_t)KMP_HW_LAST; + retval->num_core_efficiencies = 0; + retval->num_core_types = 0; + for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) + retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN; KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; } for (int i = 0; i < ndepth; ++i) { retval->types[i] = types[i]; @@ -478,6 +631,13 @@ void kmp_topology_t::dump() const { } printf("\n"); + printf("* num_core_eff: %d\n", num_core_efficiencies); + printf("* num_core_types: %d\n", num_core_types); + printf("* core_types: "); + for (int i = 0; i < num_core_types; ++i) + printf("%3d ", core_types[i]); + printf("\n"); + printf("* equivalent map:\n"); KMP_FOREACH_HW_TYPE(i) { const char *key = __kmp_hw_get_keyword(i); @@ -571,6 +731,29 @@ void kmp_topology_t::print(const char *env_var) const { } KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); + // Hybrid topology information + if (__kmp_is_hybrid_cpu()) { + for (int i = 0; i < num_core_types; ++i) { + kmp_hw_core_type_t core_type = core_types[i]; + kmp_hw_attr_t attr; + attr.clear(); + attr.set_core_type(core_type); + int ncores = get_ncores_with_attr(attr); + if (ncores > 0) { + KMP_INFORM(TopologyHybrid, env_var, ncores, + __kmp_hw_get_core_type_string(core_type)); + KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS) + for (int eff = 0; eff < num_core_efficiencies; ++eff) { + attr.set_core_eff(eff); + int ncores_with_eff = get_ncores_with_attr(attr); + if (ncores_with_eff > 0) { + KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff); + } + } + } + } + } + if (num_hw_threads <= 0) { __kmp_str_buf_free(&buf); return; @@ -585,6 +768,10 @@ void kmp_topology_t::print(const char *env_var) const { __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); } + if (__kmp_is_hybrid_cpu()) + __kmp_str_buf_print( + &buf, "(%s)", + __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type())); KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); } @@ -592,6 +779,9 @@ void kmp_topology_t::print(const char *env_var) const { } void kmp_topology_t::canonicalize() { +#if KMP_GROUP_AFFINITY + _insert_windows_proc_groups(); +#endif _remove_radix1_layers(); _gather_enumeration_information(); _discover_uniformity(); @@ -640,6 +830,25 @@ void kmp_topology_t::canonicalize() { __kmp_hw_get_catalog_string(gran_type)); 
__kmp_affinity_gran = gran_type; } +#if KMP_GROUP_AFFINITY + // If more than one processor group exists, and the level of + // granularity specified by the user is too coarse, then the + // granularity must be adjusted "down" to processor group affinity + // because threads can only exist within one processor group. + // For example, if a user sets granularity=socket and there are two + // processor groups that cover a socket, then the runtime must + // restrict the granularity down to the processor group level. + if (__kmp_num_proc_groups > 1) { + int gran_depth = __kmp_topology->get_level(gran_type); + int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP); + if (gran_depth >= 0 && proc_group_depth >= 0 && + gran_depth < proc_group_depth) { + KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY", + __kmp_hw_get_catalog_string(__kmp_affinity_gran)); + __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP; + } + } +#endif __kmp_affinity_gran_levels = 0; for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) __kmp_affinity_gran_levels++; @@ -673,6 +882,56 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, _discover_uniformity(); } +// Represents running sub IDs for a single core attribute where +// attribute values have SIZE possibilities. +template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t { + int last_level; // last level in topology to consider for sub_ids + int sub_id[SIZE]; // The sub ID for a given attribute value + int prev_sub_id[KMP_HW_LAST]; + IndexFunc indexer; + +public: + kmp_sub_ids_t(int last_level) : last_level(last_level) { + KMP_ASSERT(last_level < KMP_HW_LAST); + for (size_t i = 0; i < SIZE; ++i) + sub_id[i] = -1; + for (size_t i = 0; i < KMP_HW_LAST; ++i) + prev_sub_id[i] = -1; + } + void update(const kmp_hw_thread_t &hw_thread) { + int idx = indexer(hw_thread); + KMP_ASSERT(idx < (int)SIZE); + for (int level = 0; level <= last_level; ++level) { + if (hw_thread.sub_ids[level] != prev_sub_id[level]) { + if (level < last_level) + sub_id[idx] = -1; + sub_id[idx]++; + break; + } + } + for (int level = 0; level <= last_level; ++level) + prev_sub_id[level] = hw_thread.sub_ids[level]; + } + int get_sub_id(const kmp_hw_thread_t &hw_thread) const { + return sub_id[indexer(hw_thread)]; + } +}; + +static kmp_str_buf_t * +__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, + bool plural) { + __kmp_str_buf_init(buf); + if (attr.is_core_type_valid()) + __kmp_str_buf_print(buf, "%s %s", + __kmp_hw_get_core_type_string(attr.get_core_type()), + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural)); + else + __kmp_str_buf_print(buf, "%s eff=%d", + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural), + attr.get_core_eff()); + return buf; +} + // Apply the KMP_HW_SUBSET envirable to the topology // Returns true if KMP_HW_SUBSET filtered any processors // otherwise, returns false @@ -681,18 +940,27 @@ bool kmp_topology_t::filter_hw_subset() { if (!__kmp_hw_subset) return false; + // First, sort the KMP_HW_SUBSET items by the machine topology + __kmp_hw_subset->sort(); + // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology + bool using_core_types = false; + bool using_core_effs = false; int hw_subset_depth = __kmp_hw_subset->get_depth(); kmp_hw_t specified[KMP_HW_LAST]; + int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth); KMP_ASSERT(hw_subset_depth > 0); KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; } + int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < 
hw_subset_depth; ++i) { int max_count; - int num = __kmp_hw_subset->at(i).num; - int offset = __kmp_hw_subset->at(i).offset; - kmp_hw_t type = __kmp_hw_subset->at(i).type; + const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i); + int num = item.num[0]; + int offset = item.offset[0]; + kmp_hw_t type = item.type; kmp_hw_t equivalent_type = equivalent[type]; int level = get_level(type); + topology_levels[i] = level; // Check to see if current layer is in detected machine topology if (equivalent_type != KMP_HW_UNKNOWN) { @@ -703,8 +971,8 @@ bool kmp_topology_t::filter_hw_subset() { return false; } - // Check to see if current layer has already been specified - // either directly or through an equivalent type + // Check to see if current layer has already been + // specified either directly or through an equivalent type if (specified[equivalent_type] != KMP_HW_UNKNOWN) { KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), __kmp_hw_get_catalog_string(specified[equivalent_type])); @@ -712,66 +980,247 @@ bool kmp_topology_t::filter_hw_subset() { } specified[equivalent_type] = type; - // Check to see if layers are in order - if (i + 1 < hw_subset_depth) { - kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type); - if (next_type == KMP_HW_UNKNOWN) { - KMP_WARNING( - AffHWSubsetNotExistGeneric, - __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type)); - return false; - } - int next_topology_level = get_level(next_type); - if (level > next_topology_level) { - KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type), - __kmp_hw_get_catalog_string(next_type)); - return false; - } - } - // Check to see if each layer's num & offset parameters are valid max_count = get_ratio(level); - if (max_count < 0 || num + offset > max_count) { + if (max_count < 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { bool plural = (num > 1); KMP_WARNING(AffHWSubsetManyGeneric, __kmp_hw_get_catalog_string(type, plural)); return false; } + + // Check to see if core attributes are consistent + if (core_level == level) { + // Determine which core attributes are specified + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_type_valid()) + using_core_types = true; + if (item.attr[j].is_core_eff_valid()) + using_core_effs = true; + } + + // Check if using a single core attribute on non-hybrid arch. + // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute. + // + // Check if using multiple core attributes on non-hyrbid arch. + // Ignore all of KMP_HW_SUBSET if this is the case. 
+ if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) { + if (item.num_attrs == 1) { + if (using_core_effs) { + KMP_WARNING(AffHWSubsetIgnoringAttr, "efficiency"); + } else { + KMP_WARNING(AffHWSubsetIgnoringAttr, "core_type"); + } + using_core_effs = false; + using_core_types = false; + } else { + KMP_WARNING(AffHWSubsetAttrsNonHybrid); + return false; + } + } + + // Check if using both core types and core efficiencies together + if (using_core_types && using_core_effs) { + KMP_WARNING(AffHWSubsetIncompat, "core_type", "efficiency"); + return false; + } + + // Check that core efficiency values are valid + if (using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_eff_valid()) { + int core_eff = item.attr[j].get_core_eff(); + if (core_eff < 0 || core_eff >= num_core_efficiencies) { + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff()); + __kmp_msg(kmp_ms_warning, + KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str), + KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1), + __kmp_msg_null); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + // Check that the number of requested cores with attributes is valid + if (using_core_types || using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + int num = item.num[j]; + int offset = item.offset[j]; + int level_above = core_level - 1; + if (level_above >= 0) { + max_count = get_ncores_with_attr_per(item.attr[j], level_above); + if (max_count <= 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0); + KMP_WARNING(AffHWSubsetManyGeneric, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + if ((using_core_types || using_core_effs) && item.num_attrs > 1) { + for (int j = 0; j < item.num_attrs; ++j) { + // Ambiguous use of specific core attribute + generic core + // e.g., 4c & 3c:intel_core or 4c & 3c:eff1 + if (!item.attr[j]) { + kmp_hw_attr_t other_attr; + for (int k = 0; k < item.num_attrs; ++k) { + if (item.attr[k] != item.attr[j]) { + other_attr = item.attr[k]; + break; + } + } + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0); + KMP_WARNING(AffHWSubsetIncompat, + __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str); + __kmp_str_buf_free(&buf); + return false; + } + // Allow specifying a specific core type or core eff exactly once + for (int k = 0; k < j; ++k) { + if (!item.attr[j] || !item.attr[k]) + continue; + if (item.attr[k] == item.attr[j]) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, + item.num[j] > 0); + KMP_WARNING(AffHWSubsetAttrRepeat, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + } } - // Apply the filtered hardware subset - int new_index = 0; + struct core_type_indexer { + int operator()(const kmp_hw_thread_t &t) const { + switch (t.attrs.get_core_type()) { +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return 1; + case KMP_HW_CORE_TYPE_CORE: + return 2; +#endif + case KMP_HW_CORE_TYPE_UNKNOWN: + return 0; + } + KMP_ASSERT(0); + return 0; + } + }; + struct core_eff_indexer { + int operator()(const kmp_hw_thread_t &t) const { + return t.attrs.get_core_eff(); + } + }; + + kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids( + core_level); + kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids( + core_level); + + 
// Determine which hardware threads should be filtered. + int num_filtered = 0; + bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Update type_sub_id + if (using_core_types) + core_type_sub_ids.update(hw_thread); + if (using_core_effs) + core_eff_sub_ids.update(hw_thread); + // Check to see if this hardware thread should be filtered bool should_be_filtered = false; - for (int level = 0, hw_subset_index = 0; - level < depth && hw_subset_index < hw_subset_depth; ++level) { - kmp_hw_t topology_type = types[level]; - auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index); - kmp_hw_t hw_subset_type = hw_subset_item.type; - if (topology_type != hw_subset_type) + for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth; + ++hw_subset_index) { + const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index); + int level = topology_levels[hw_subset_index]; + if (level == -1) continue; - int num = hw_subset_item.num; - int offset = hw_subset_item.offset; - hw_subset_index++; - if (hw_thread.sub_ids[level] < offset || - hw_thread.sub_ids[level] >= offset + num) { - should_be_filtered = true; - break; + if ((using_core_effs || using_core_types) && level == core_level) { + // Look for the core attribute in KMP_HW_SUBSET which corresponds + // to this hardware thread's core attribute. Use this num,offset plus + // the running sub_id for the particular core attribute of this hardware + // thread to determine if the hardware thread should be filtered or not. + int attr_idx; + kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type(); + int core_eff = hw_thread.attrs.get_core_eff(); + for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) { + if (using_core_types && + hw_subset_item.attr[attr_idx].get_core_type() == core_type) + break; + if (using_core_effs && + hw_subset_item.attr[attr_idx].get_core_eff() == core_eff) + break; + } + // This core attribute isn't in the KMP_HW_SUBSET so always filter it. 
+ if (attr_idx == hw_subset_item.num_attrs) { + should_be_filtered = true; + break; + } + int sub_id; + int num = hw_subset_item.num[attr_idx]; + int offset = hw_subset_item.offset[attr_idx]; + if (using_core_types) + sub_id = core_type_sub_ids.get_sub_id(hw_thread); + else + sub_id = core_eff_sub_ids.get_sub_id(hw_thread); + if (sub_id < offset || + (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) { + should_be_filtered = true; + break; + } + } else { + int num = hw_subset_item.num[0]; + int offset = hw_subset_item.offset[0]; + if (hw_thread.sub_ids[level] < offset || + (num != kmp_hw_subset_t::USE_ALL && + hw_thread.sub_ids[level] >= offset + num)) { + should_be_filtered = true; + break; + } } } - if (!should_be_filtered) { + // Collect filtering information + filtered[i] = should_be_filtered; + if (should_be_filtered) + num_filtered++; + } + + // One last check that we shouldn't allow filtering entire machine + if (num_filtered == num_hw_threads) { + KMP_WARNING(AffHWSubsetAllFiltered); + __kmp_free(filtered); + return false; + } + + // Apply the filter + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + if (!filtered[i]) { if (i != new_index) - hw_threads[new_index] = hw_thread; + hw_threads[new_index] = hw_threads[i]; new_index++; } else { #if KMP_AFFINITY_SUPPORTED - KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask); + KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask); #endif __kmp_avail_proc--; } } + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); num_hw_threads = new_index; @@ -780,6 +1229,7 @@ bool kmp_topology_t::filter_hw_subset() { _discover_uniformity(); _set_globals(); _set_last_level_cache(); + __kmp_free(filtered); return true; } @@ -986,7 +1436,67 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, return buf; } -void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { +// Return (possibly empty) affinity mask representing the offline CPUs +// Caller must free the mask +kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { + kmp_affin_mask_t *offline; + KMP_CPU_ALLOC(offline); + KMP_CPU_ZERO(offline); +#if KMP_OS_LINUX + int n, begin_cpu, end_cpu; + kmp_safe_raii_file_t offline_file; + auto skip_ws = [](FILE *f) { + int c; + do { + c = fgetc(f); + } while (isspace(c)); + if (c != EOF) + ungetc(c, f); + }; + // File contains CSV of integer ranges representing the offline CPUs + // e.g., 1,2,4-7,9,11-15 + int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r"); + if (status != 0) + return offline; + while (!feof(offline_file)) { + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &begin_cpu); + if (n != 1) + break; + skip_ws(offline_file); + int c = fgetc(offline_file); + if (c == EOF || c == ',') { + // Just single CPU + end_cpu = begin_cpu; + } else if (c == '-') { + // Range of CPUs + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &end_cpu); + if (n != 1) + break; + skip_ws(offline_file); + c = fgetc(offline_file); // skip ',' + } else { + // Syntax problem + break; + } + // Ensure a valid range of CPUs + if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 || + end_cpu >= __kmp_xproc || begin_cpu > end_cpu) { + continue; + } + // Insert [begin_cpu, end_cpu] into offline mask + for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) { + KMP_CPU_SET(cpu, offline); + } + } +#endif + return offline; +} + +// Return the number of available procs +int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { + int avail_proc = 0; KMP_CPU_ZERO(mask); #if KMP_GROUP_AFFINITY @@ -999,6 
+1509,7 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { int num = __kmp_GetActiveProcessorCount(group); for (i = 0; i < num; i++) { KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); + avail_proc++; } } } else @@ -1007,10 +1518,18 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { { int proc; + kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus(); for (proc = 0; proc < __kmp_xproc; proc++) { + // Skip offline CPUs + if (KMP_CPU_ISSET(proc, offline_cpus)) + continue; KMP_CPU_SET(proc, mask); + avail_proc++; } + KMP_CPU_FREE(offline_cpus); } + + return avail_proc; } // All of the __kmp_affinity_create_*_map() routines should allocate the @@ -1156,6 +1675,45 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { return true; } + // Handle multiple types of cores if they exist on the system + int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0); + + typedef struct kmp_hwloc_cpukinds_info_t { + int efficiency; + kmp_hw_core_type_t core_type; + hwloc_bitmap_t mask; + } kmp_hwloc_cpukinds_info_t; + kmp_hwloc_cpukinds_info_t *cpukinds = nullptr; + + if (nr_cpu_kinds > 0) { + unsigned nr_infos; + struct hwloc_info_s *infos; + cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate( + sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds); + for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) { + cpukinds[idx].efficiency = -1; + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN; + cpukinds[idx].mask = hwloc_bitmap_alloc(); + if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask, + &cpukinds[idx].efficiency, &nr_infos, &infos, + 0) == 0) { + for (unsigned i = 0; i < nr_infos; ++i) { + if (__kmp_str_match("CoreType", 8, infos[i].name)) { +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_str_match("IntelAtom", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM; + break; + } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE; + break; + } +#endif + } + } + } + } + } + root = hwloc_get_root_obj(tp); // Figure out the depth and types in the topology @@ -1215,6 +1773,20 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hw_thread.clear(); hw_thread.ids[index] = pu->logical_index; hw_thread.os_id = pu->os_index; + // If multiple core types, then set that attribute for the hardware thread + if (cpukinds) { + int cpukind_index = -1; + for (int i = 0; i < nr_cpu_kinds; ++i) { + if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) { + cpukind_index = i; + break; + } + } + if (cpukind_index >= 0) { + hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type); + hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency); + } + } index--; } obj = pu; @@ -1258,6 +1830,13 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { if (included) hw_thread_index++; } + + // Free the core types information + if (cpukinds) { + for (int idx = 0; idx < nr_cpu_kinds; ++idx) + hwloc_bitmap_free(cpukinds[idx].mask); + __kmp_free(cpukinds); + } __kmp_topology->sort_ids(); return true; } @@ -1782,6 +2361,26 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { return true; } +// Hybrid cpu detection using CPUID.1A +// Thread should be pinned to processor already +static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency, + unsigned *native_model_id) { + kmp_cpuid buf; + __kmp_x86_cpuid(0x1a, 0, &buf); + *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 
31>(buf.eax); + switch (*type) { + case KMP_HW_CORE_TYPE_ATOM: + *efficiency = 0; + break; + case KMP_HW_CORE_TYPE_CORE: + *efficiency = 1; + break; + default: + *efficiency = 0; + } + *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); +} + // Intel(R) microarchitecture code name Nehalem, Dunnington and later // architectures support a newer interface for specifying the x2APIC Ids, // based on CPUID.B or CPUID.1F @@ -2051,6 +2650,15 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; } } + // Hybrid information + if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { + kmp_hw_core_type_t type; + unsigned native_model_id; + int efficiency; + __kmp_get_hybrid_info(&type, &efficiency, &native_model_id); + hw_thread.attrs.set_core_type(type); + hw_thread.attrs.set_core_eff(efficiency); + } hw_thread_index++; } KMP_ASSERT(hw_thread_index > 0); @@ -2386,7 +2994,10 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; - KMP_ASSERT(nodeIdIndex + level <= maxIndex); + // validate the input before using level: + if (level > (unsigned)__kmp_xproc) { // level is too big + level = __kmp_xproc; + } if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; threadInfo[num_avail][nodeIdIndex + level] = val; @@ -3497,8 +4108,8 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_affin_fullMask); KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); } - __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); - __kmp_avail_proc = __kmp_xproc; + __kmp_avail_proc = + __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); #if KMP_OS_WINDOWS // Set the process affinity mask since threads' affinity // masks must be subset of process mask in Windows* OS @@ -4145,14 +4756,19 @@ int __kmp_aux_set_affinity(void **mask) { int __kmp_aux_get_affinity(void **mask) { int gtid; int retval; +#if KMP_OS_WINDOWS || KMP_DEBUG kmp_info_t *th; - +#endif if (!KMP_AFFINITY_CAPABLE()) { return -1; } gtid = __kmp_entry_gtid(); +#if KMP_OS_WINDOWS || KMP_DEBUG th = __kmp_threads[gtid]; +#else + (void)gtid; // unused variable +#endif KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); KA_TRACE( |