author     thegeorg <thegeorg@yandex-team.ru>  2022-06-03 10:53:07 +0300
committer  thegeorg <thegeorg@yandex-team.ru>  2022-06-03 10:53:07 +0300
commit     a1d4361e379e2c72a469ad1bd64569cbc2db131f
tree       0caddb240a10132376e4653a31578e117d33f9fd  /contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
parent     41f55a521834080d9d703c099c0418cfff3a0546
Update contrib/libs/cxxsupp/openmp to 14.0.4
ref:77c6cdda99b217d50c4deadca11f5611fa0dc168
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_affinity.cpp')
-rw-r--r--  contrib/libs/cxxsupp/openmp/kmp_affinity.cpp  718
1 file changed, 667 insertions, 51 deletions
diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
index 8b40bd7ecd..414a27fb05 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
@@ -26,6 +26,7 @@
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
#endif
+#include <ctype.h>
// The machine topology
kmp_topology_t *__kmp_topology = nullptr;
@@ -123,6 +124,20 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
return ((plural) ? "unknowns" : "unknown");
}
+const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
+ switch (type) {
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ return "unknown";
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return "Intel Atom(R) processor";
+ case KMP_HW_CORE_TYPE_CORE:
+ return "Intel(R) Core(TM) processor";
+#endif
+ }
+ return "unknown";
+}
+
////////////////////////////////////////////////////////////////////////////////
// kmp_hw_thread_t methods
int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
@@ -174,20 +189,94 @@ void kmp_hw_thread_t::print() const {
for (int i = 0; i < depth; ++i) {
printf("%4d ", ids[i]);
}
+ if (attrs) {
+ if (attrs.is_core_type_valid())
+ printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type()));
+ if (attrs.is_core_eff_valid())
+ printf(" (eff=%d)", attrs.get_core_eff());
+ }
printf("\n");
}
////////////////////////////////////////////////////////////////////////////////
// kmp_topology_t methods
+// Add a layer to the topology based on the ids. Assume the topology
+// is perfectly nested (i.e., so no object has more than one parent)
+void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
+ // Figure out where the layer should go by comparing the ids of the current
+ // layers with the new ids
+ int target_layer;
+ int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
+ int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;
+
+ // Start from the highest layer and work down to find target layer
+ // If new layer is equal to another layer then put the new layer above
+ for (target_layer = 0; target_layer < depth; ++target_layer) {
+ bool layers_equal = true;
+ bool strictly_above_target_layer = false;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ int id = hw_threads[i].ids[target_layer];
+ int new_id = ids[i];
+ if (id != previous_id && new_id == previous_new_id) {
+ // Found the layer we are strictly above
+ strictly_above_target_layer = true;
+ layers_equal = false;
+ break;
+ } else if (id == previous_id && new_id != previous_new_id) {
+ // Found a layer we are below. Move to next layer and check.
+ layers_equal = false;
+ break;
+ }
+ previous_id = id;
+ previous_new_id = new_id;
+ }
+ if (strictly_above_target_layer || layers_equal)
+ break;
+ }
+
+ // Found the layer we are above. Now move everything to accommodate the new
+ // layer. And put the new ids and type into the topology.
+ for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
+ types[j] = types[i];
+ types[target_layer] = type;
+ for (int k = 0; k < num_hw_threads; ++k) {
+ for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
+ hw_threads[k].ids[j] = hw_threads[k].ids[i];
+ hw_threads[k].ids[target_layer] = ids[k];
+ }
+ equivalent[type] = type;
+ depth++;
+}
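The insertion pass above locates target_layer by comparing the existing ids against the new ids, then shifts every layer at or below it up by one slot. A minimal standalone sketch of just that shifting step, using hypothetical std::vector storage in place of the runtime's fixed-size arrays:

// Illustrative sketch only (simplified types, not the real kmp_topology_t
// members): open a slot at target_layer and fill it with the new layer.
#include <vector>

void insert_layer_sketch(std::vector<int> &types,
                         std::vector<std::vector<int>> &ids_per_thread,
                         int target_layer, int new_type,
                         const std::vector<int> &new_ids) {
  types.insert(types.begin() + target_layer, new_type);
  for (size_t t = 0; t < ids_per_thread.size(); ++t)
    ids_per_thread[t].insert(ids_per_thread[t].begin() + target_layer,
                             new_ids[t]);
}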
+
+#if KMP_GROUP_AFFINITY
+// Insert the Windows Processor Group structure into the topology
+void kmp_topology_t::_insert_windows_proc_groups() {
+ // Do not insert the processor group structure for a single group
+ if (__kmp_num_proc_groups == 1)
+ return;
+ kmp_affin_mask_t *mask;
+ int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
+ KMP_CPU_ALLOC(mask);
+ for (int i = 0; i < num_hw_threads; ++i) {
+ KMP_CPU_ZERO(mask);
+ KMP_CPU_SET(hw_threads[i].os_id, mask);
+ ids[i] = __kmp_get_proc_group(mask);
+ }
+ KMP_CPU_FREE(mask);
+ _insert_layer(KMP_HW_PROC_GROUP, ids);
+ __kmp_free(ids);
+}
+#endif
+
// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void kmp_topology_t::_remove_radix1_layers() {
int preference[KMP_HW_LAST];
int top_index1, top_index2;
// Set up preference associative array
- preference[KMP_HW_PROC_GROUP] = 110;
- preference[KMP_HW_SOCKET] = 100;
+ preference[KMP_HW_SOCKET] = 110;
+ preference[KMP_HW_PROC_GROUP] = 100;
preference[KMP_HW_CORE] = 95;
preference[KMP_HW_THREAD] = 90;
preference[KMP_HW_NUMA] = 85;
@@ -305,6 +394,7 @@ void kmp_topology_t::_gather_enumeration_information() {
count[i] = 0;
ratio[i] = 0;
}
+ int core_level = get_level(KMP_HW_CORE);
for (int i = 0; i < num_hw_threads; ++i) {
kmp_hw_thread_t &hw_thread = hw_threads[i];
for (int layer = 0; layer < depth; ++layer) {
@@ -320,6 +410,29 @@ void kmp_topology_t::_gather_enumeration_information() {
ratio[l] = max[l];
max[l] = 1;
}
+ // Figure out the number of different core types
+ // and efficiencies for hybrid CPUs
+ if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) {
+ if (hw_thread.attrs.is_core_eff_valid() &&
+ hw_thread.attrs.core_eff >= num_core_efficiencies) {
+ // Because efficiencies can range from 0 to max efficiency - 1,
+ // the number of efficiencies is max efficiency + 1
+ num_core_efficiencies = hw_thread.attrs.core_eff + 1;
+ }
+ if (hw_thread.attrs.is_core_type_valid()) {
+ bool found = false;
+ for (int j = 0; j < num_core_types; ++j) {
+ if (hw_thread.attrs.get_core_type() == core_types[j]) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES);
+ core_types[num_core_types++] = hw_thread.attrs.get_core_type();
+ }
+ }
+ }
break;
}
}
@@ -333,6 +446,42 @@ void kmp_topology_t::_gather_enumeration_information() {
}
}
+int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr,
+ int above_level,
+ bool find_all) const {
+ int current, current_max;
+ int previous_id[KMP_HW_LAST];
+ for (int i = 0; i < depth; ++i)
+ previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ int core_level = get_level(KMP_HW_CORE);
+ if (find_all)
+ above_level = -1;
+ KMP_ASSERT(above_level < core_level);
+ current_max = 0;
+ current = 0;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ kmp_hw_thread_t &hw_thread = hw_threads[i];
+ if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) {
+ if (current > current_max)
+ current_max = current;
+ current = hw_thread.attrs.contains(attr);
+ } else {
+ for (int level = above_level + 1; level <= core_level; ++level) {
+ if (hw_thread.ids[level] != previous_id[level]) {
+ if (hw_thread.attrs.contains(attr))
+ current++;
+ break;
+ }
+ }
+ }
+ for (int level = 0; level < depth; ++level)
+ previous_id[level] = hw_thread.ids[level];
+ }
+ if (current > current_max)
+ current_max = current;
+ return current_max;
+}
+
// Find out if the topology is uniform
void kmp_topology_t::_discover_uniformity() {
int num = 1;
@@ -406,7 +555,7 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
kmp_topology_t *retval;
// Allocate all data in one large allocation
size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
- sizeof(int) * ndepth * 3;
+ sizeof(int) * (size_t)KMP_HW_LAST * 3;
char *bytes = (char *)__kmp_allocate(size);
retval = (kmp_topology_t *)bytes;
if (nproc > 0) {
@@ -419,8 +568,12 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
int *arr =
(int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
retval->types = (kmp_hw_t *)arr;
- retval->ratio = arr + ndepth;
- retval->count = arr + 2 * ndepth;
+ retval->ratio = arr + (size_t)KMP_HW_LAST;
+ retval->count = arr + 2 * (size_t)KMP_HW_LAST;
+ retval->num_core_efficiencies = 0;
+ retval->num_core_types = 0;
+ for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
+ retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
for (int i = 0; i < ndepth; ++i) {
retval->types[i] = types[i];
@@ -478,6 +631,13 @@ void kmp_topology_t::dump() const {
}
printf("\n");
+ printf("* num_core_eff: %d\n", num_core_efficiencies);
+ printf("* num_core_types: %d\n", num_core_types);
+ printf("* core_types: ");
+ for (int i = 0; i < num_core_types; ++i)
+ printf("%3d ", core_types[i]);
+ printf("\n");
+
printf("* equivalent map:\n");
KMP_FOREACH_HW_TYPE(i) {
const char *key = __kmp_hw_get_keyword(i);
@@ -571,6 +731,29 @@ void kmp_topology_t::print(const char *env_var) const {
}
KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
+ // Hybrid topology information
+ if (__kmp_is_hybrid_cpu()) {
+ for (int i = 0; i < num_core_types; ++i) {
+ kmp_hw_core_type_t core_type = core_types[i];
+ kmp_hw_attr_t attr;
+ attr.clear();
+ attr.set_core_type(core_type);
+ int ncores = get_ncores_with_attr(attr);
+ if (ncores > 0) {
+ KMP_INFORM(TopologyHybrid, env_var, ncores,
+ __kmp_hw_get_core_type_string(core_type));
+ KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS)
+ for (int eff = 0; eff < num_core_efficiencies; ++eff) {
+ attr.set_core_eff(eff);
+ int ncores_with_eff = get_ncores_with_attr(attr);
+ if (ncores_with_eff > 0) {
+ KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff);
+ }
+ }
+ }
+ }
+ }
+
if (num_hw_threads <= 0) {
__kmp_str_buf_free(&buf);
return;
@@ -585,6 +768,10 @@ void kmp_topology_t::print(const char *env_var) const {
__kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
__kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
}
+ if (__kmp_is_hybrid_cpu())
+ __kmp_str_buf_print(
+ &buf, "(%s)",
+ __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type()));
KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
}
@@ -592,6 +779,9 @@ void kmp_topology_t::print(const char *env_var) const {
}
void kmp_topology_t::canonicalize() {
+#if KMP_GROUP_AFFINITY
+ _insert_windows_proc_groups();
+#endif
_remove_radix1_layers();
_gather_enumeration_information();
_discover_uniformity();
@@ -640,6 +830,25 @@ void kmp_topology_t::canonicalize() {
__kmp_hw_get_catalog_string(gran_type));
__kmp_affinity_gran = gran_type;
}
+#if KMP_GROUP_AFFINITY
+ // If more than one processor group exists, and the level of
+ // granularity specified by the user is too coarse, then the
+ // granularity must be adjusted "down" to processor group affinity
+ // because threads can only exist within one processor group.
+ // For example, if a user sets granularity=socket and there are two
+ // processor groups that cover a socket, then the runtime must
+ // restrict the granularity down to the processor group level.
+ if (__kmp_num_proc_groups > 1) {
+ int gran_depth = __kmp_topology->get_level(gran_type);
+ int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP);
+ if (gran_depth >= 0 && proc_group_depth >= 0 &&
+ gran_depth < proc_group_depth) {
+ KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY",
+ __kmp_hw_get_catalog_string(__kmp_affinity_gran));
+ __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP;
+ }
+ }
+#endif
__kmp_affinity_gran_levels = 0;
for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
__kmp_affinity_gran_levels++;
@@ -673,6 +882,56 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
_discover_uniformity();
}
+// Represents running sub IDs for a single core attribute where
+// attribute values have SIZE possibilities.
+template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
+ int last_level; // last level in topology to consider for sub_ids
+ int sub_id[SIZE]; // The sub ID for a given attribute value
+ int prev_sub_id[KMP_HW_LAST];
+ IndexFunc indexer;
+
+public:
+ kmp_sub_ids_t(int last_level) : last_level(last_level) {
+ KMP_ASSERT(last_level < KMP_HW_LAST);
+ for (size_t i = 0; i < SIZE; ++i)
+ sub_id[i] = -1;
+ for (size_t i = 0; i < KMP_HW_LAST; ++i)
+ prev_sub_id[i] = -1;
+ }
+ void update(const kmp_hw_thread_t &hw_thread) {
+ int idx = indexer(hw_thread);
+ KMP_ASSERT(idx < (int)SIZE);
+ for (int level = 0; level <= last_level; ++level) {
+ if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
+ if (level < last_level)
+ sub_id[idx] = -1;
+ sub_id[idx]++;
+ break;
+ }
+ }
+ for (int level = 0; level <= last_level; ++level)
+ prev_sub_id[level] = hw_thread.sub_ids[level];
+ }
+ int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
+ return sub_id[indexer(hw_thread)];
+ }
+};
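kmp_sub_ids_t above keeps one running ordinal per attribute value while the sorted hardware threads are walked, and restarts the count whenever a level above the core level changes. A simplified standalone sketch of that bookkeeping for a two-level (socket, core) walk, with hypothetical plain ints in place of kmp_hw_thread_t:

// Illustrative sketch only: per-attribute-value ordinal of the current core,
// restarting whenever the enclosing socket changes.
#include <array>
#include <cstddef>

template <std::size_t N> struct running_sub_ids_sketch {
  std::array<int, N> sub_id;
  int prev_socket = -1, prev_core = -1;
  running_sub_ids_sketch() { sub_id.fill(-1); }
  void update(int socket, int core, std::size_t attr_value) {
    if (socket != prev_socket)
      sub_id[attr_value] = -1; // new socket: restart numbering for this value
    if (socket != prev_socket || core != prev_core)
      sub_id[attr_value]++; // entered a new core with this attribute value
    prev_socket = socket;
    prev_core = core;
  }
  int get(std::size_t attr_value) const { return sub_id[attr_value]; }
};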
+
+static kmp_str_buf_t *
+__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
+ bool plural) {
+ __kmp_str_buf_init(buf);
+ if (attr.is_core_type_valid())
+ __kmp_str_buf_print(buf, "%s %s",
+ __kmp_hw_get_core_type_string(attr.get_core_type()),
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, plural));
+ else
+ __kmp_str_buf_print(buf, "%s eff=%d",
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, plural),
+ attr.get_core_eff());
+ return buf;
+}
+
// Apply the KMP_HW_SUBSET envirable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
@@ -681,18 +940,27 @@ bool kmp_topology_t::filter_hw_subset() {
if (!__kmp_hw_subset)
return false;
+ // First, sort the KMP_HW_SUBSET items by the machine topology
+ __kmp_hw_subset->sort();
+
// Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
+ bool using_core_types = false;
+ bool using_core_effs = false;
int hw_subset_depth = __kmp_hw_subset->get_depth();
kmp_hw_t specified[KMP_HW_LAST];
+ int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
KMP_ASSERT(hw_subset_depth > 0);
KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
+ int core_level = get_level(KMP_HW_CORE);
for (int i = 0; i < hw_subset_depth; ++i) {
int max_count;
- int num = __kmp_hw_subset->at(i).num;
- int offset = __kmp_hw_subset->at(i).offset;
- kmp_hw_t type = __kmp_hw_subset->at(i).type;
+ const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
+ int num = item.num[0];
+ int offset = item.offset[0];
+ kmp_hw_t type = item.type;
kmp_hw_t equivalent_type = equivalent[type];
int level = get_level(type);
+ topology_levels[i] = level;
// Check to see if current layer is in detected machine topology
if (equivalent_type != KMP_HW_UNKNOWN) {
@@ -703,8 +971,8 @@ bool kmp_topology_t::filter_hw_subset() {
return false;
}
- // Check to see if current layer has already been specified
- // either directly or through an equivalent type
+ // Check to see if current layer has already been
+ // specified either directly or through an equivalent type
if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
__kmp_hw_get_catalog_string(specified[equivalent_type]));
@@ -712,66 +980,247 @@ bool kmp_topology_t::filter_hw_subset() {
}
specified[equivalent_type] = type;
- // Check to see if layers are in order
- if (i + 1 < hw_subset_depth) {
- kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type);
- if (next_type == KMP_HW_UNKNOWN) {
- KMP_WARNING(
- AffHWSubsetNotExistGeneric,
- __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type));
- return false;
- }
- int next_topology_level = get_level(next_type);
- if (level > next_topology_level) {
- KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type),
- __kmp_hw_get_catalog_string(next_type));
- return false;
- }
- }
-
// Check to see if each layer's num & offset parameters are valid
max_count = get_ratio(level);
- if (max_count < 0 || num + offset > max_count) {
+ if (max_count < 0 ||
+ (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
bool plural = (num > 1);
KMP_WARNING(AffHWSubsetManyGeneric,
__kmp_hw_get_catalog_string(type, plural));
return false;
}
+
+ // Check to see if core attributes are consistent
+ if (core_level == level) {
+ // Determine which core attributes are specified
+ for (int j = 0; j < item.num_attrs; ++j) {
+ if (item.attr[j].is_core_type_valid())
+ using_core_types = true;
+ if (item.attr[j].is_core_eff_valid())
+ using_core_effs = true;
+ }
+
+ // Check if using a single core attribute on non-hybrid arch.
+ // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
+ //
+ // Check if using multiple core attributes on non-hybrid arch.
+ // Ignore all of KMP_HW_SUBSET if this is the case.
+ if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
+ if (item.num_attrs == 1) {
+ if (using_core_effs) {
+ KMP_WARNING(AffHWSubsetIgnoringAttr, "efficiency");
+ } else {
+ KMP_WARNING(AffHWSubsetIgnoringAttr, "core_type");
+ }
+ using_core_effs = false;
+ using_core_types = false;
+ } else {
+ KMP_WARNING(AffHWSubsetAttrsNonHybrid);
+ return false;
+ }
+ }
+
+ // Check if using both core types and core efficiencies together
+ if (using_core_types && using_core_effs) {
+ KMP_WARNING(AffHWSubsetIncompat, "core_type", "efficiency");
+ return false;
+ }
+
+ // Check that core efficiency values are valid
+ if (using_core_effs) {
+ for (int j = 0; j < item.num_attrs; ++j) {
+ if (item.attr[j].is_core_eff_valid()) {
+ int core_eff = item.attr[j].get_core_eff();
+ if (core_eff < 0 || core_eff >= num_core_efficiencies) {
+ kmp_str_buf_t buf;
+ __kmp_str_buf_init(&buf);
+ __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff());
+ __kmp_msg(kmp_ms_warning,
+ KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str),
+ KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1),
+ __kmp_msg_null);
+ __kmp_str_buf_free(&buf);
+ return false;
+ }
+ }
+ }
+ }
+
+ // Check that the number of requested cores with attributes is valid
+ if (using_core_types || using_core_effs) {
+ for (int j = 0; j < item.num_attrs; ++j) {
+ int num = item.num[j];
+ int offset = item.offset[j];
+ int level_above = core_level - 1;
+ if (level_above >= 0) {
+ max_count = get_ncores_with_attr_per(item.attr[j], level_above);
+ if (max_count <= 0 ||
+ (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
+ kmp_str_buf_t buf;
+ __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
+ KMP_WARNING(AffHWSubsetManyGeneric, buf.str);
+ __kmp_str_buf_free(&buf);
+ return false;
+ }
+ }
+ }
+ }
+
+ if ((using_core_types || using_core_effs) && item.num_attrs > 1) {
+ for (int j = 0; j < item.num_attrs; ++j) {
+ // Ambiguous use of specific core attribute + generic core
+ // e.g., 4c & 3c:intel_core or 4c & 3c:eff1
+ if (!item.attr[j]) {
+ kmp_hw_attr_t other_attr;
+ for (int k = 0; k < item.num_attrs; ++k) {
+ if (item.attr[k] != item.attr[j]) {
+ other_attr = item.attr[k];
+ break;
+ }
+ }
+ kmp_str_buf_t buf;
+ __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
+ KMP_WARNING(AffHWSubsetIncompat,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
+ __kmp_str_buf_free(&buf);
+ return false;
+ }
+ // Allow specifying a specific core type or core eff exactly once
+ for (int k = 0; k < j; ++k) {
+ if (!item.attr[j] || !item.attr[k])
+ continue;
+ if (item.attr[k] == item.attr[j]) {
+ kmp_str_buf_t buf;
+ __kmp_hw_get_catalog_core_string(item.attr[j], &buf,
+ item.num[j] > 0);
+ KMP_WARNING(AffHWSubsetAttrRepeat, buf.str);
+ __kmp_str_buf_free(&buf);
+ return false;
+ }
+ }
+ }
+ }
+ }
}
- // Apply the filtered hardware subset
- int new_index = 0;
+ struct core_type_indexer {
+ int operator()(const kmp_hw_thread_t &t) const {
+ switch (t.attrs.get_core_type()) {
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return 1;
+ case KMP_HW_CORE_TYPE_CORE:
+ return 2;
+#endif
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ return 0;
+ }
+ KMP_ASSERT(0);
+ return 0;
+ }
+ };
+ struct core_eff_indexer {
+ int operator()(const kmp_hw_thread_t &t) const {
+ return t.attrs.get_core_eff();
+ }
+ };
+
+ kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids(
+ core_level);
+ kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids(
+ core_level);
+
+ // Determine which hardware threads should be filtered.
+ int num_filtered = 0;
+ bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads);
for (int i = 0; i < num_hw_threads; ++i) {
kmp_hw_thread_t &hw_thread = hw_threads[i];
+ // Update type_sub_id
+ if (using_core_types)
+ core_type_sub_ids.update(hw_thread);
+ if (using_core_effs)
+ core_eff_sub_ids.update(hw_thread);
+
// Check to see if this hardware thread should be filtered
bool should_be_filtered = false;
- for (int level = 0, hw_subset_index = 0;
- level < depth && hw_subset_index < hw_subset_depth; ++level) {
- kmp_hw_t topology_type = types[level];
- auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
- kmp_hw_t hw_subset_type = hw_subset_item.type;
- if (topology_type != hw_subset_type)
+ for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth;
+ ++hw_subset_index) {
+ const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
+ int level = topology_levels[hw_subset_index];
+ if (level == -1)
continue;
- int num = hw_subset_item.num;
- int offset = hw_subset_item.offset;
- hw_subset_index++;
- if (hw_thread.sub_ids[level] < offset ||
- hw_thread.sub_ids[level] >= offset + num) {
- should_be_filtered = true;
- break;
+ if ((using_core_effs || using_core_types) && level == core_level) {
+ // Look for the core attribute in KMP_HW_SUBSET which corresponds
+ // to this hardware thread's core attribute. Use this num,offset plus
+ // the running sub_id for the particular core attribute of this hardware
+ // thread to determine if the hardware thread should be filtered or not.
+ int attr_idx;
+ kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type();
+ int core_eff = hw_thread.attrs.get_core_eff();
+ for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) {
+ if (using_core_types &&
+ hw_subset_item.attr[attr_idx].get_core_type() == core_type)
+ break;
+ if (using_core_effs &&
+ hw_subset_item.attr[attr_idx].get_core_eff() == core_eff)
+ break;
+ }
+ // This core attribute isn't in the KMP_HW_SUBSET so always filter it.
+ if (attr_idx == hw_subset_item.num_attrs) {
+ should_be_filtered = true;
+ break;
+ }
+ int sub_id;
+ int num = hw_subset_item.num[attr_idx];
+ int offset = hw_subset_item.offset[attr_idx];
+ if (using_core_types)
+ sub_id = core_type_sub_ids.get_sub_id(hw_thread);
+ else
+ sub_id = core_eff_sub_ids.get_sub_id(hw_thread);
+ if (sub_id < offset ||
+ (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
+ should_be_filtered = true;
+ break;
+ }
+ } else {
+ int num = hw_subset_item.num[0];
+ int offset = hw_subset_item.offset[0];
+ if (hw_thread.sub_ids[level] < offset ||
+ (num != kmp_hw_subset_t::USE_ALL &&
+ hw_thread.sub_ids[level] >= offset + num)) {
+ should_be_filtered = true;
+ break;
+ }
}
}
- if (!should_be_filtered) {
+ // Collect filtering information
+ filtered[i] = should_be_filtered;
+ if (should_be_filtered)
+ num_filtered++;
+ }
+
+ // One last check that we shouldn't allow filtering entire machine
+ if (num_filtered == num_hw_threads) {
+ KMP_WARNING(AffHWSubsetAllFiltered);
+ __kmp_free(filtered);
+ return false;
+ }
+
+ // Apply the filter
+ int new_index = 0;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ if (!filtered[i]) {
if (i != new_index)
- hw_threads[new_index] = hw_thread;
+ hw_threads[new_index] = hw_threads[i];
new_index++;
} else {
#if KMP_AFFINITY_SUPPORTED
- KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
+ KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask);
#endif
__kmp_avail_proc--;
}
}
+
KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
num_hw_threads = new_index;
@@ -780,6 +1229,7 @@ bool kmp_topology_t::filter_hw_subset() {
_discover_uniformity();
_set_globals();
_set_last_level_cache();
+ __kmp_free(filtered);
return true;
}
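The checks above are what give KMP_HW_SUBSET its core-attribute form on hybrid CPUs. As a hedged illustration using the tokens the code and comments reference (the string parsing itself lives outside this file): a value such as KMP_HW_SUBSET=1s,4c:intel_core,2t keeps four Core-type cores with two threads each, KMP_HW_SUBSET=2c:eff1,1t selects by efficiency instead, and a mix like 4c&3c:intel_core (a plain core count alongside an attributed one in the same item) is rejected with the AffHWSubsetIncompat warning, as is combining core_type with efficiency.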
@@ -986,7 +1436,67 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
return buf;
}
-void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
+// Return (possibly empty) affinity mask representing the offline CPUs
+// Caller must free the mask
+kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
+ kmp_affin_mask_t *offline;
+ KMP_CPU_ALLOC(offline);
+ KMP_CPU_ZERO(offline);
+#if KMP_OS_LINUX
+ int n, begin_cpu, end_cpu;
+ kmp_safe_raii_file_t offline_file;
+ auto skip_ws = [](FILE *f) {
+ int c;
+ do {
+ c = fgetc(f);
+ } while (isspace(c));
+ if (c != EOF)
+ ungetc(c, f);
+ };
+ // File contains CSV of integer ranges representing the offline CPUs
+ // e.g., 1,2,4-7,9,11-15
+ int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
+ if (status != 0)
+ return offline;
+ while (!feof(offline_file)) {
+ skip_ws(offline_file);
+ n = fscanf(offline_file, "%d", &begin_cpu);
+ if (n != 1)
+ break;
+ skip_ws(offline_file);
+ int c = fgetc(offline_file);
+ if (c == EOF || c == ',') {
+ // Just single CPU
+ end_cpu = begin_cpu;
+ } else if (c == '-') {
+ // Range of CPUs
+ skip_ws(offline_file);
+ n = fscanf(offline_file, "%d", &end_cpu);
+ if (n != 1)
+ break;
+ skip_ws(offline_file);
+ c = fgetc(offline_file); // skip ','
+ } else {
+ // Syntax problem
+ break;
+ }
+ // Ensure a valid range of CPUs
+ if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
+ end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
+ continue;
+ }
+ // Insert [begin_cpu, end_cpu] into offline mask
+ for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
+ KMP_CPU_SET(cpu, offline);
+ }
+ }
+#endif
+ return offline;
+}
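The parser above walks /sys/devices/system/cpu/offline token by token with fscanf. A standalone sketch of the same range grammar applied to an in-memory string (a hypothetical helper, not part of the runtime):

// Illustrative sketch only: parse a CPU list such as "1,2,4-7,9,11-15".
#include <cstdio>
#include <vector>

static std::vector<int> parse_cpu_list_sketch(const char *s) {
  std::vector<int> cpus;
  int begin, end, consumed;
  while (std::sscanf(s, " %d%n", &begin, &consumed) == 1) {
    s += consumed;
    end = begin;
    if (*s == '-' && std::sscanf(s + 1, "%d%n", &end, &consumed) == 1)
      s += 1 + consumed; // range of CPUs
    for (int cpu = begin; cpu <= end; ++cpu)
      cpus.push_back(cpu); // insert [begin, end] into the list
    if (*s == ',')
      ++s;
  }
  return cpus;
}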
+
+// Return the number of available procs
+int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
+ int avail_proc = 0;
KMP_CPU_ZERO(mask);
#if KMP_GROUP_AFFINITY
@@ -999,6 +1509,7 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
int num = __kmp_GetActiveProcessorCount(group);
for (i = 0; i < num; i++) {
KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
+ avail_proc++;
}
}
} else
@@ -1007,10 +1518,18 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
{
int proc;
+ kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
for (proc = 0; proc < __kmp_xproc; proc++) {
+ // Skip offline CPUs
+ if (KMP_CPU_ISSET(proc, offline_cpus))
+ continue;
KMP_CPU_SET(proc, mask);
+ avail_proc++;
}
+ KMP_CPU_FREE(offline_cpus);
}
+
+ return avail_proc;
}
// All of the __kmp_affinity_create_*_map() routines should allocate the
@@ -1156,6 +1675,45 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
return true;
}
+ // Handle multiple types of cores if they exist on the system
+ int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
+
+ typedef struct kmp_hwloc_cpukinds_info_t {
+ int efficiency;
+ kmp_hw_core_type_t core_type;
+ hwloc_bitmap_t mask;
+ } kmp_hwloc_cpukinds_info_t;
+ kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;
+
+ if (nr_cpu_kinds > 0) {
+ unsigned nr_infos;
+ struct hwloc_info_s *infos;
+ cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
+ sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
+ for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
+ cpukinds[idx].efficiency = -1;
+ cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
+ cpukinds[idx].mask = hwloc_bitmap_alloc();
+ if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
+ &cpukinds[idx].efficiency, &nr_infos, &infos,
+ 0) == 0) {
+ for (unsigned i = 0; i < nr_infos; ++i) {
+ if (__kmp_str_match("CoreType", 8, infos[i].name)) {
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
+ cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
+ break;
+ } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
+ cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
+ break;
+ }
+#endif
+ }
+ }
+ }
+ }
+ }
+
root = hwloc_get_root_obj(tp);
// Figure out the depth and types in the topology
@@ -1215,6 +1773,20 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread.clear();
hw_thread.ids[index] = pu->logical_index;
hw_thread.os_id = pu->os_index;
+ // If multiple core types, then set that attribute for the hardware thread
+ if (cpukinds) {
+ int cpukind_index = -1;
+ for (int i = 0; i < nr_cpu_kinds; ++i) {
+ if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
+ cpukind_index = i;
+ break;
+ }
+ }
+ if (cpukind_index >= 0) {
+ hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type);
+ hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
+ }
+ }
index--;
}
obj = pu;
@@ -1258,6 +1830,13 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
if (included)
hw_thread_index++;
}
+
+ // Free the core types information
+ if (cpukinds) {
+ for (int idx = 0; idx < nr_cpu_kinds; ++idx)
+ hwloc_bitmap_free(cpukinds[idx].mask);
+ __kmp_free(cpukinds);
+ }
__kmp_topology->sort_ids();
return true;
}
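The cpukinds block above relies on hwloc's CPU-kind API, which first appeared in hwloc 2.4. A minimal standalone sketch of the same enumeration outside the runtime, assuming an hwloc build with cpukinds support:

// Illustrative sketch only: list each CPU kind's efficiency and info pairs.
#include <hwloc.h>
#include <cstdio>

int main() {
  hwloc_topology_t tp;
  hwloc_topology_init(&tp);
  hwloc_topology_load(tp);
  int nr_kinds = hwloc_cpukinds_get_nr(tp, 0);
  for (int idx = 0; idx < nr_kinds; ++idx) {
    hwloc_bitmap_t mask = hwloc_bitmap_alloc();
    int efficiency = -1;
    unsigned nr_infos = 0;
    struct hwloc_info_s *infos = nullptr;
    if (hwloc_cpukinds_get_info(tp, idx, mask, &efficiency, &nr_infos, &infos,
                                0) == 0) {
      std::printf("kind %d: efficiency %d\n", idx, efficiency);
      for (unsigned i = 0; i < nr_infos; ++i) // e.g. CoreType = IntelAtom
        std::printf("  %s = %s\n", infos[i].name, infos[i].value);
    }
    hwloc_bitmap_free(mask);
  }
  hwloc_topology_destroy(tp);
  return 0;
}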
@@ -1782,6 +2361,26 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
return true;
}
+// Hybrid cpu detection using CPUID.1A
+// Thread should be pinned to processor already
+static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
+ unsigned *native_model_id) {
+ kmp_cpuid buf;
+ __kmp_x86_cpuid(0x1a, 0, &buf);
+ *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
+ switch (*type) {
+ case KMP_HW_CORE_TYPE_ATOM:
+ *efficiency = 0;
+ break;
+ case KMP_HW_CORE_TYPE_CORE:
+ *efficiency = 1;
+ break;
+ default:
+ *efficiency = 0;
+ }
+ *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
+}
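The helper above decodes CPUID leaf 0x1A: EAX bits 31:24 carry the core type and bits 23:0 the native model id. A hedged sketch of the same extraction using plain shifts and masks (0x20 and 0x40 are the documented Atom and Core type values):

// Illustrative sketch only: decode CPUID.1A EAX without __kmp_extract_bits.
#include <cstdint>

static inline std::uint32_t cpuid_1a_core_type(std::uint32_t eax) {
  return (eax >> 24) & 0xffu; // bits 31:24 (0x20 = Atom, 0x40 = Core)
}
static inline std::uint32_t cpuid_1a_native_model_id(std::uint32_t eax) {
  return eax & 0x00ffffffu; // bits 23:0
}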
+
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on CPUID.B or CPUID.1F
@@ -2051,6 +2650,15 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
}
}
+ // Hybrid information
+ if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
+ kmp_hw_core_type_t type;
+ unsigned native_model_id;
+ int efficiency;
+ __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
+ hw_thread.attrs.set_core_type(type);
+ hw_thread.attrs.set_core_eff(efficiency);
+ }
hw_thread_index++;
}
KMP_ASSERT(hw_thread_index > 0);
@@ -2386,7 +2994,10 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
unsigned val;
if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
goto no_val;
- KMP_ASSERT(nodeIdIndex + level <= maxIndex);
+ // validate the input before using level:
+ if (level > (unsigned)__kmp_xproc) { // level is too big
+ level = __kmp_xproc;
+ }
if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
goto dup_field;
threadInfo[num_avail][nodeIdIndex + level] = val;
@@ -3497,8 +4108,8 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affin_fullMask);
KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
}
- __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
- __kmp_avail_proc = __kmp_xproc;
+ __kmp_avail_proc =
+ __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
#if KMP_OS_WINDOWS
// Set the process affinity mask since threads' affinity
// masks must be subset of process mask in Windows* OS
@@ -4145,14 +4756,19 @@ int __kmp_aux_set_affinity(void **mask) {
int __kmp_aux_get_affinity(void **mask) {
int gtid;
int retval;
+#if KMP_OS_WINDOWS || KMP_DEBUG
kmp_info_t *th;
-
+#endif
if (!KMP_AFFINITY_CAPABLE()) {
return -1;
}
gtid = __kmp_entry_gtid();
+#if KMP_OS_WINDOWS || KMP_DEBUG
th = __kmp_threads[gtid];
+#else
+ (void)gtid; // unused variable
+#endif
KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
KA_TRACE(