author    mikhnenko <[email protected]>    2025-07-15 20:05:43 +0300
committer mikhnenko <[email protected]>    2025-07-15 20:52:16 +0300
commita40bd4f45bbc18fd95b1596e655b8942ceb2cf4b (patch)
treebce599ca02c778c277198de6d131d37db71997d0 /contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
parent728e0eaef4dc1f1152d2c3a4cc1bbdf597f3ef3d (diff)
Update contrib/libs/cxxsupp/openmp to 20.1.7
commit_hash:722dd5fe79203d22ad4a0be288ac0caeb6b3dd68
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_affinity.cpp')
-rw-r--r--    contrib/libs/cxxsupp/openmp/kmp_affinity.cpp    2109
1 file changed, 1489 insertions, 620 deletions
diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
index b9a8d49d8da..624fb3b0761 100644
--- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
+++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
@@ -38,6 +38,43 @@ static hierarchy_info machine_hierarchy;
void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
+#if KMP_AFFINITY_SUPPORTED
+// Helper class to see if place lists further restrict the fullMask
+class kmp_full_mask_modifier_t {
+ kmp_affin_mask_t *mask;
+
+public:
+ kmp_full_mask_modifier_t() {
+ KMP_CPU_ALLOC(mask);
+ KMP_CPU_ZERO(mask);
+ }
+ ~kmp_full_mask_modifier_t() {
+ KMP_CPU_FREE(mask);
+ mask = nullptr;
+ }
+ void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); }
+ // If the new full mask is different from the current full mask,
+ // then switch them. Returns true if full mask was affected, false otherwise.
+ bool restrict_to_mask() {
+ // See if the new mask further restricts or changes the full mask
+ if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask))
+ return false;
+ return __kmp_topology->restrict_to_mask(mask);
+ }
+};
+
+static inline const char *
+__kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
+ bool for_binding = false) {
+ if (affinity.flags.omp_places) {
+ if (for_binding)
+ return "OMP_PROC_BIND";
+ return "OMP_PLACES";
+ }
+ return affinity.env_var;
+}
+#endif // KMP_AFFINITY_SUPPORTED
+
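// Editor's sketch, not part of the patch: how a place-list pass might drive
// the helper above. The names `place_masks` and `num_places` are hypothetical.
static void __sketch_apply_places(kmp_affin_mask_t **place_masks,
                                  int num_places) {
  kmp_full_mask_modifier_t full_mask;
  for (int i = 0; i < num_places; ++i)
    full_mask.include(place_masks[i]); // union of every requested place
  // Shrinks the global full mask only when the union is non-empty and
  // differs from __kmp_affin_fullMask; the topology is then re-filtered.
  if (full_mask.restrict_to_mask()) {
    // counts, uniformity, and last-level-cache info were recomputed
  }
}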
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
kmp_uint32 depth;
// The test below is true if affinity is available, but set to "none". Need to
@@ -90,8 +127,12 @@ const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
case KMP_HW_PROC_GROUP:
return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
+ case KMP_HW_UNKNOWN:
+ case KMP_HW_LAST:
+ return KMP_I18N_STR(Unknown);
}
- return KMP_I18N_STR(Unknown);
+ KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
}
const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
@@ -120,13 +161,18 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
return ((plural) ? "threads" : "thread");
case KMP_HW_PROC_GROUP:
return ((plural) ? "proc_groups" : "proc_group");
+ case KMP_HW_UNKNOWN:
+ case KMP_HW_LAST:
+ return ((plural) ? "unknowns" : "unknown");
}
- return ((plural) ? "unknowns" : "unknown");
+ KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
}
const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
switch (type) {
case KMP_HW_CORE_TYPE_UNKNOWN:
+ case KMP_HW_MAX_NUM_CORE_TYPES:
return "unknown";
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
case KMP_HW_CORE_TYPE_ATOM:
@@ -135,19 +181,19 @@ const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
return "Intel(R) Core(TM) processor";
#endif
}
- return "unknown";
+ KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
}
#if KMP_AFFINITY_SUPPORTED
// If affinity is supported, check the affinity
// verbose and warning flags before printing warning
-#define KMP_AFF_WARNING(...) \
- if (__kmp_affinity_verbose || \
- (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { \
+#define KMP_AFF_WARNING(s, ...) \
+ if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \
KMP_WARNING(__VA_ARGS__); \
}
#else
-#define KMP_AFF_WARNING KMP_WARNING
+#define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
#endif
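// Editor's sketch of the new call shape, reusing the AffGranularityBad
// message id that appears later in this patch: the settings object is now
// passed explicitly instead of being read from __kmp_affinity_* globals.
//
//   KMP_AFF_WARNING(affinity, AffGranularityBad, env_var, "core_type", "core");
//   // expands (when KMP_AFFINITY_SUPPORTED) to:
//   // if (affinity.flags.verbose ||
//   //     (affinity.flags.warnings && (affinity.type != affinity_none))) {
//   //   KMP_WARNING(AffGranularityBad, env_var, "core_type", "core");
//   // }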
////////////////////////////////////////////////////////////////////////////////
@@ -157,7 +203,26 @@ int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
int depth = __kmp_topology->get_depth();
for (int level = 0; level < depth; ++level) {
- if (ahwthread->ids[level] < bhwthread->ids[level])
+ // Reverse sort (higher efficiencies earlier in list) cores by core
+ // efficiency if available.
+ if (__kmp_is_hybrid_cpu() &&
+ __kmp_topology->get_type(level) == KMP_HW_CORE &&
+ ahwthread->attrs.is_core_eff_valid() &&
+ bhwthread->attrs.is_core_eff_valid()) {
+ if (ahwthread->attrs.get_core_eff() < bhwthread->attrs.get_core_eff())
+ return 1;
+ if (ahwthread->attrs.get_core_eff() > bhwthread->attrs.get_core_eff())
+ return -1;
+ }
+ if (ahwthread->ids[level] == bhwthread->ids[level])
+ continue;
+ // If the hardware id is unknown for this level, then place hardware thread
+ // further down in the sorted list as it should take last priority
+ if (ahwthread->ids[level] == UNKNOWN_ID)
+ return 1;
+ else if (bhwthread->ids[level] == UNKNOWN_ID)
+ return -1;
+ else if (ahwthread->ids[level] < bhwthread->ids[level])
return -1;
else if (ahwthread->ids[level] > bhwthread->ids[level])
return 1;
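// For context: this comparator is consumed by qsort() when the topology sorts
// its hardware threads. A minimal sketch, assuming sort_ids() keeps its
// upstream shape:
//
//   void kmp_topology_t::sort_ids() {
//     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
//           kmp_hw_thread_t::compare_ids);
//   }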
@@ -175,9 +240,10 @@ int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
int depth = __kmp_topology->get_depth();
- KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
- KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
- for (i = 0; i < __kmp_affinity_compact; i++) {
+ int compact = __kmp_topology->compact;
+ KMP_DEBUG_ASSERT(compact >= 0);
+ KMP_DEBUG_ASSERT(compact <= depth);
+ for (i = 0; i < compact; i++) {
int j = depth - i - 1;
if (aa->sub_ids[j] < bb->sub_ids[j])
return -1;
@@ -185,7 +251,7 @@ int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
return 1;
}
for (; i < depth; i++) {
- int j = i - __kmp_affinity_compact;
+ int j = i - compact;
if (aa->sub_ids[j] < bb->sub_ids[j])
return -1;
if (aa->sub_ids[j] > bb->sub_ids[j])
@@ -199,7 +265,7 @@ void kmp_hw_thread_t::print() const {
int depth = __kmp_topology->get_depth();
printf("%4d ", os_id);
for (int i = 0; i < depth; ++i) {
- printf("%4d ", ids[i]);
+ printf("%4d (%d) ", ids[i], sub_ids[i]);
}
if (attrs) {
if (attrs.is_core_type_valid())
@@ -207,6 +273,8 @@ void kmp_hw_thread_t::print() const {
if (attrs.is_core_eff_valid())
printf(" (eff=%d)", attrs.get_core_eff());
}
+ if (leader)
+ printf(" (leader)");
printf("\n");
}
@@ -215,7 +283,7 @@ void kmp_hw_thread_t::print() const {
// Add a layer to the topology based on the ids. Assume the topology
// is perfectly nested (i.e., so no object has more than one parent)
-void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
+void kmp_topology_t::insert_layer(kmp_hw_t type, const int *ids) {
// Figure out where the layer should go by comparing the ids of the current
// layers with the new ids
int target_layer;
@@ -276,8 +344,11 @@ void kmp_topology_t::_insert_windows_proc_groups() {
ids[i] = __kmp_get_proc_group(mask);
}
KMP_CPU_FREE(mask);
- _insert_layer(KMP_HW_PROC_GROUP, ids);
+ insert_layer(KMP_HW_PROC_GROUP, ids);
__kmp_free(ids);
+
+ // sort topology after adding proc groups
+ __kmp_topology->sort_ids();
}
#endif
@@ -413,10 +484,13 @@ void kmp_topology_t::_gather_enumeration_information() {
int id = hw_thread.ids[layer];
if (id != previous_id[layer]) {
// Add an additional increment to each count
- for (int l = layer; l < depth; ++l)
- count[l]++;
+ for (int l = layer; l < depth; ++l) {
+ if (hw_thread.ids[l] != kmp_hw_thread_t::UNKNOWN_ID)
+ count[l]++;
+ }
// Keep track of topology layer ratio statistics
- max[layer]++;
+ if (hw_thread.ids[layer] != kmp_hw_thread_t::UNKNOWN_ID)
+ max[layer]++;
for (int l = layer + 1; l < depth; ++l) {
if (max[l] > ratio[l])
ratio[l] = max[l];
@@ -584,6 +658,7 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
retval->count = arr + 2 * (size_t)KMP_HW_LAST;
retval->num_core_efficiencies = 0;
retval->num_core_types = 0;
+ retval->compact = 0;
for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
@@ -674,7 +749,11 @@ void kmp_topology_t::print(const char *env_var) const {
kmp_hw_t print_types[KMP_HW_LAST + 2];
// Num Available Threads
- KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
+ if (num_hw_threads) {
+ KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
+ } else {
+ KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc);
+ }
// Uniform or not
if (is_uniform()) {
@@ -776,6 +855,8 @@ void kmp_topology_t::print(const char *env_var) const {
for (int i = 0; i < num_hw_threads; i++) {
__kmp_str_buf_clear(&buf);
for (int level = 0; level < depth; ++level) {
+ if (hw_threads[i].ids[level] == kmp_hw_thread_t::UNKNOWN_ID)
+ continue;
kmp_hw_t type = types[level];
__kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
__kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
@@ -790,41 +871,45 @@ void kmp_topology_t::print(const char *env_var) const {
__kmp_str_buf_free(&buf);
}
-void kmp_topology_t::canonicalize() {
-#if KMP_GROUP_AFFINITY
- _insert_windows_proc_groups();
-#endif
- _remove_radix1_layers();
- _gather_enumeration_information();
- _discover_uniformity();
- _set_sub_ids();
- _set_globals();
- _set_last_level_cache();
-
-#if KMP_MIC_SUPPORTED
- // Manually Add L2 = Tile equivalence
- if (__kmp_mic_type == mic3) {
- if (get_level(KMP_HW_L2) != -1)
- set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
- else if (get_level(KMP_HW_TILE) != -1)
- set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
- }
-#endif
-
- // Perform post canonicalization checking
- KMP_ASSERT(depth > 0);
- for (int level = 0; level < depth; ++level) {
- // All counts, ratios, and types must be valid
- KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
- KMP_ASSERT_VALID_HW_TYPE(types[level]);
- // Detected types must point to themselves
- KMP_ASSERT(equivalent[types[level]] == types[level]);
- }
-
#if KMP_AFFINITY_SUPPORTED
+void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {
+ const char *env_var = __kmp_get_affinity_env_var(affinity);
+ // If hybrid CPU attributes were requested for granularity (through either
+ // OMP_PLACES or KMP_AFFINITY) but none exist, then reset the granularity
+ // and let the method below select one and warn the user.
+ if (!__kmp_is_hybrid_cpu()) {
+ if (affinity.core_attr_gran.valid) {
+ // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
+ // instead
+ KMP_AFF_WARNING(
+ affinity, AffIgnoringNonHybrid, env_var,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
+ affinity.gran = KMP_HW_CORE;
+ affinity.gran_levels = -1;
+ affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
+ affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
+ } else if (affinity.flags.core_types_gran ||
+ affinity.flags.core_effs_gran) {
+ // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
+ if (affinity.flags.omp_places) {
+ KMP_AFF_WARNING(
+ affinity, AffIgnoringNonHybrid, env_var,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
+ } else {
+ // KMP_AFFINITY=granularity=core_type|core_eff,...
+ KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
+ "Intel(R) Hybrid Technology core attribute",
+ __kmp_hw_get_catalog_string(KMP_HW_CORE));
+ }
+ affinity.gran = KMP_HW_CORE;
+ affinity.gran_levels = -1;
+ affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
+ affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
+ }
+ }
// Set the number of affinity granularity levels
- if (__kmp_affinity_gran_levels < 0) {
- kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
+ if (affinity.gran_levels < 0) {
+ kmp_hw_t gran_type = get_equivalent_type(affinity.gran);
// Check if user's granularity request is valid
if (gran_type == KMP_HW_UNKNOWN) {
// First try core, then thread, then package
@@ -837,10 +922,10 @@ void kmp_topology_t::canonicalize() {
}
KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
// Warn user what granularity setting will be used instead
- KMP_AFF_WARNING(AffGranularityBad, "KMP_AFFINITY",
- __kmp_hw_get_catalog_string(__kmp_affinity_gran),
+ KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
+ __kmp_hw_get_catalog_string(affinity.gran),
__kmp_hw_get_catalog_string(gran_type));
- __kmp_affinity_gran = gran_type;
+ affinity.gran = gran_type;
}
#if KMP_GROUP_AFFINITY
// If more than one processor group exists, and the level of
@@ -855,17 +940,49 @@ void kmp_topology_t::canonicalize() {
int proc_group_depth = get_level(KMP_HW_PROC_GROUP);
if (gran_depth >= 0 && proc_group_depth >= 0 &&
gran_depth < proc_group_depth) {
- KMP_AFF_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY",
- __kmp_hw_get_catalog_string(__kmp_affinity_gran));
- __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP;
+ KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var,
+ __kmp_hw_get_catalog_string(affinity.gran));
+ affinity.gran = gran_type = KMP_HW_PROC_GROUP;
}
}
#endif
- __kmp_affinity_gran_levels = 0;
+ affinity.gran_levels = 0;
for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
- __kmp_affinity_gran_levels++;
+ affinity.gran_levels++;
+ }
+}
+#endif
+
+void kmp_topology_t::canonicalize() {
+#if KMP_GROUP_AFFINITY
+ _insert_windows_proc_groups();
+#endif
+ _remove_radix1_layers();
+ _gather_enumeration_information();
+ _discover_uniformity();
+ _set_sub_ids();
+ _set_globals();
+ _set_last_level_cache();
+
+#if KMP_MIC_SUPPORTED
+ // Manually Add L2 = Tile equivalence
+ if (__kmp_mic_type == mic3) {
+ if (get_level(KMP_HW_L2) != -1)
+ set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
+ else if (get_level(KMP_HW_TILE) != -1)
+ set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
+ }
+#endif
+
+ // Perform post canonicalization checking
+ KMP_ASSERT(depth > 0);
+ for (int level = 0; level < depth; ++level) {
+ // All counts, ratios, and types must be valid
+ KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
+ KMP_ASSERT_VALID_HW_TYPE(types[level]);
+ // Detected types must point to themselves
+ KMP_ASSERT(equivalent[types[level]] == types[level]);
}
-#endif // KMP_AFFINITY_SUPPORTED
}
// Canonicalize an explicit packages X cores/pkg X threads/core topology
@@ -894,41 +1011,7 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
_discover_uniformity();
}
-// Represents running sub IDs for a single core attribute where
-// attribute values have SIZE possibilities.
-template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
- int last_level; // last level in topology to consider for sub_ids
- int sub_id[SIZE]; // The sub ID for a given attribute value
- int prev_sub_id[KMP_HW_LAST];
- IndexFunc indexer;
-
-public:
- kmp_sub_ids_t(int last_level) : last_level(last_level) {
- KMP_ASSERT(last_level < KMP_HW_LAST);
- for (size_t i = 0; i < SIZE; ++i)
- sub_id[i] = -1;
- for (size_t i = 0; i < KMP_HW_LAST; ++i)
- prev_sub_id[i] = -1;
- }
- void update(const kmp_hw_thread_t &hw_thread) {
- int idx = indexer(hw_thread);
- KMP_ASSERT(idx < (int)SIZE);
- for (int level = 0; level <= last_level; ++level) {
- if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
- if (level < last_level)
- sub_id[idx] = -1;
- sub_id[idx]++;
- break;
- }
- }
- for (int level = 0; level <= last_level; ++level)
- prev_sub_id[level] = hw_thread.sub_ids[level];
- }
- int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
- return sub_id[indexer(hw_thread)];
- }
-};
-
+#if KMP_AFFINITY_SUPPORTED
static kmp_str_buf_t *
__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
bool plural) {
@@ -944,6 +1027,41 @@ __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
return buf;
}
+bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {
+ // Apply the filter
+ bool affected;
+ int new_index = 0;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ int os_id = hw_threads[i].os_id;
+ if (KMP_CPU_ISSET(os_id, mask)) {
+ if (i != new_index)
+ hw_threads[new_index] = hw_threads[i];
+ new_index++;
+ } else {
+ KMP_CPU_CLR(os_id, __kmp_affin_fullMask);
+ __kmp_avail_proc--;
+ }
+ }
+
+ KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
+ affected = (num_hw_threads != new_index);
+ num_hw_threads = new_index;
+
+ // Post hardware subset canonicalization
+ if (affected) {
+ _gather_enumeration_information();
+ _discover_uniformity();
+ _set_globals();
+ _set_last_level_cache();
+#if KMP_OS_WINDOWS
+ // Copy filtered full mask if topology has single processor group
+ if (__kmp_num_proc_groups <= 1)
+#endif
+ __kmp_affin_origMask->copy(__kmp_affin_fullMask);
+ }
+ return affected;
+}
+
// Apply the KMP_HW_SUBSET envirable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
@@ -955,9 +1073,12 @@ bool kmp_topology_t::filter_hw_subset() {
// First, sort the KMP_HW_SUBSET items by the machine topology
__kmp_hw_subset->sort();
+ __kmp_hw_subset->canonicalize(__kmp_topology);
+
// Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
bool using_core_types = false;
bool using_core_effs = false;
+ bool is_absolute = __kmp_hw_subset->is_absolute();
int hw_subset_depth = __kmp_hw_subset->get_depth();
kmp_hw_t specified[KMP_HW_LAST];
int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
@@ -978,7 +1099,7 @@ bool kmp_topology_t::filter_hw_subset() {
if (equivalent_type != KMP_HW_UNKNOWN) {
__kmp_hw_subset->at(i).type = equivalent_type;
} else {
- KMP_AFF_WARNING(AffHWSubsetNotExistGeneric,
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric,
__kmp_hw_get_catalog_string(type));
return false;
}
@@ -986,7 +1107,8 @@ bool kmp_topology_t::filter_hw_subset() {
// Check to see if current layer has already been
// specified either directly or through an equivalent type
if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
- KMP_AFF_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers,
+ __kmp_hw_get_catalog_string(type),
__kmp_hw_get_catalog_string(specified[equivalent_type]));
return false;
}
@@ -994,12 +1116,14 @@ bool kmp_topology_t::filter_hw_subset() {
// Check to see if each layer's num & offset parameters are valid
max_count = get_ratio(level);
- if (max_count < 0 ||
- (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
- bool plural = (num > 1);
- KMP_AFF_WARNING(AffHWSubsetManyGeneric,
- __kmp_hw_get_catalog_string(type, plural));
- return false;
+ if (!is_absolute) {
+ if (max_count < 0 ||
+ (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
+ bool plural = (num > 1);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
+ __kmp_hw_get_catalog_string(type, plural));
+ return false;
+ }
}
// Check to see if core attributes are consistent
@@ -1020,21 +1144,24 @@ bool kmp_topology_t::filter_hw_subset() {
if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
if (item.num_attrs == 1) {
if (using_core_effs) {
- KMP_AFF_WARNING(AffHWSubsetIgnoringAttr, "efficiency");
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
+ "efficiency");
} else {
- KMP_AFF_WARNING(AffHWSubsetIgnoringAttr, "core_type");
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
+ "core_type");
}
using_core_effs = false;
using_core_types = false;
} else {
- KMP_AFF_WARNING(AffHWSubsetAttrsNonHybrid);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid);
return false;
}
}
// Check if using both core types and core efficiencies together
if (using_core_types && using_core_effs) {
- KMP_AFF_WARNING(AffHWSubsetIncompat, "core_type", "efficiency");
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type",
+ "efficiency");
return false;
}
@@ -1059,7 +1186,7 @@ bool kmp_topology_t::filter_hw_subset() {
}
// Check that the number of requested cores with attributes is valid
- if (using_core_types || using_core_effs) {
+ if ((using_core_types || using_core_effs) && !is_absolute) {
for (int j = 0; j < item.num_attrs; ++j) {
int num = item.num[j];
int offset = item.offset[j];
@@ -1070,7 +1197,7 @@ bool kmp_topology_t::filter_hw_subset() {
(num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
kmp_str_buf_t buf;
__kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
- KMP_AFF_WARNING(AffHWSubsetManyGeneric, buf.str);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str);
__kmp_str_buf_free(&buf);
return false;
}
@@ -1092,7 +1219,7 @@ bool kmp_topology_t::filter_hw_subset() {
}
kmp_str_buf_t buf;
__kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
- KMP_AFF_WARNING(AffHWSubsetIncompat,
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat,
__kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
__kmp_str_buf_free(&buf);
return false;
@@ -1105,7 +1232,7 @@ bool kmp_topology_t::filter_hw_subset() {
kmp_str_buf_t buf;
__kmp_hw_get_catalog_core_string(item.attr[j], &buf,
item.num[j] > 0);
- KMP_AFF_WARNING(AffHWSubsetAttrRepeat, buf.str);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str);
__kmp_str_buf_free(&buf);
return false;
}
@@ -1115,43 +1242,92 @@ bool kmp_topology_t::filter_hw_subset() {
}
}
- struct core_type_indexer {
- int operator()(const kmp_hw_thread_t &t) const {
- switch (t.attrs.get_core_type()) {
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
- case KMP_HW_CORE_TYPE_ATOM:
- return 1;
- case KMP_HW_CORE_TYPE_CORE:
- return 2;
-#endif
- case KMP_HW_CORE_TYPE_UNKNOWN:
- return 0;
- }
- KMP_ASSERT(0);
- return 0;
+ // For keeping track of sub_ids for an absolute KMP_HW_SUBSET
+ // or core attributes (core type or efficiency)
+ int prev_sub_ids[KMP_HW_LAST];
+ int abs_sub_ids[KMP_HW_LAST];
+ int core_eff_sub_ids[KMP_HW_MAX_NUM_CORE_EFFS];
+ int core_type_sub_ids[KMP_HW_MAX_NUM_CORE_TYPES];
+ for (size_t i = 0; i < KMP_HW_LAST; ++i) {
+ abs_sub_ids[i] = -1;
+ prev_sub_ids[i] = -1;
+ }
+ for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_EFFS; ++i)
+ core_eff_sub_ids[i] = -1;
+ for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
+ core_type_sub_ids[i] = -1;
+
+ // Determine which hardware threads should be filtered.
+
+ // Helpful to determine if a topology layer is targeted by an absolute subset
+ auto is_targeted = [&](int level) {
+ if (is_absolute) {
+ for (int i = 0; i < hw_subset_depth; ++i)
+ if (topology_levels[i] == level)
+ return true;
+ return false;
}
+ // If not absolute KMP_HW_SUBSET, then every layer is seen as targeted
+ return true;
};
- struct core_eff_indexer {
- int operator()(const kmp_hw_thread_t &t) const {
- return t.attrs.get_core_eff();
+
+ // Helpful to index into core type sub Ids array
+ auto get_core_type_index = [](const kmp_hw_thread_t &t) {
+ switch (t.attrs.get_core_type()) {
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ case KMP_HW_MAX_NUM_CORE_TYPES:
+ return 0;
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return 1;
+ case KMP_HW_CORE_TYPE_CORE:
+ return 2;
+#endif
}
+ KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
};
- kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids(
- core_level);
- kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids(
- core_level);
+ // Helpful to index into core efficiencies sub Ids array
+ auto get_core_eff_index = [](const kmp_hw_thread_t &t) {
+ return t.attrs.get_core_eff();
+ };
- // Determine which hardware threads should be filtered.
int num_filtered = 0;
- bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads);
+ kmp_affin_mask_t *filtered_mask;
+ KMP_CPU_ALLOC(filtered_mask);
+ KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
for (int i = 0; i < num_hw_threads; ++i) {
kmp_hw_thread_t &hw_thread = hw_threads[i];
- // Update type_sub_id
- if (using_core_types)
- core_type_sub_ids.update(hw_thread);
- if (using_core_effs)
- core_eff_sub_ids.update(hw_thread);
+
+ // Figure out the absolute sub ids and core eff/type sub ids
+ if (is_absolute || using_core_effs || using_core_types) {
+ for (int level = 0; level < get_depth(); ++level) {
+ if (hw_thread.sub_ids[level] != prev_sub_ids[level]) {
+ bool found_targeted = false;
+ for (int j = level; j < get_depth(); ++j) {
+ bool targeted = is_targeted(j);
+ if (!found_targeted && targeted) {
+ found_targeted = true;
+ abs_sub_ids[j]++;
+ if (j == core_level && using_core_effs)
+ core_eff_sub_ids[get_core_eff_index(hw_thread)]++;
+ if (j == core_level && using_core_types)
+ core_type_sub_ids[get_core_type_index(hw_thread)]++;
+ } else if (targeted) {
+ abs_sub_ids[j] = 0;
+ if (j == core_level && using_core_effs)
+ core_eff_sub_ids[get_core_eff_index(hw_thread)] = 0;
+ if (j == core_level && using_core_types)
+ core_type_sub_ids[get_core_type_index(hw_thread)] = 0;
+ }
+ }
+ break;
+ }
+ }
+ for (int level = 0; level < get_depth(); ++level)
+ prev_sub_ids[level] = hw_thread.sub_ids[level];
+ }
// Check to see if this hardware thread should be filtered
bool should_be_filtered = false;
@@ -1186,71 +1362,60 @@ bool kmp_topology_t::filter_hw_subset() {
int num = hw_subset_item.num[attr_idx];
int offset = hw_subset_item.offset[attr_idx];
if (using_core_types)
- sub_id = core_type_sub_ids.get_sub_id(hw_thread);
+ sub_id = core_type_sub_ids[get_core_type_index(hw_thread)];
else
- sub_id = core_eff_sub_ids.get_sub_id(hw_thread);
+ sub_id = core_eff_sub_ids[get_core_eff_index(hw_thread)];
if (sub_id < offset ||
(num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
should_be_filtered = true;
break;
}
} else {
+ int sub_id;
int num = hw_subset_item.num[0];
int offset = hw_subset_item.offset[0];
- if (hw_thread.sub_ids[level] < offset ||
- (num != kmp_hw_subset_t::USE_ALL &&
- hw_thread.sub_ids[level] >= offset + num)) {
+ if (is_absolute)
+ sub_id = abs_sub_ids[level];
+ else
+ sub_id = hw_thread.sub_ids[level];
+ if (hw_thread.ids[level] == kmp_hw_thread_t::UNKNOWN_ID ||
+ sub_id < offset ||
+ (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
should_be_filtered = true;
break;
}
}
}
// Collect filtering information
- filtered[i] = should_be_filtered;
- if (should_be_filtered)
+ if (should_be_filtered) {
+ KMP_CPU_CLR(hw_thread.os_id, filtered_mask);
num_filtered++;
+ }
}
// One last check that we shouldn't allow filtering entire machine
if (num_filtered == num_hw_threads) {
- KMP_AFF_WARNING(AffHWSubsetAllFiltered);
- __kmp_free(filtered);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
return false;
}
// Apply the filter
- int new_index = 0;
- for (int i = 0; i < num_hw_threads; ++i) {
- if (!filtered[i]) {
- if (i != new_index)
- hw_threads[new_index] = hw_threads[i];
- new_index++;
- } else {
-#if KMP_AFFINITY_SUPPORTED
- KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask);
-#endif
- __kmp_avail_proc--;
- }
- }
-
- KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
- num_hw_threads = new_index;
-
- // Post hardware subset canonicalization
- _gather_enumeration_information();
- _discover_uniformity();
- _set_globals();
- _set_last_level_cache();
- __kmp_free(filtered);
+ restrict_to_mask(filtered_mask);
return true;
}
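// Editor's worked example of the relative-subset check above, against a
// hypothetical 2-socket, 8-cores-per-socket, 2-threads-per-core machine:
//
//   KMP_HW_SUBSET=2s,4c@2,1t   ('@' introduces the offset)
//     sockets: num=2, offset=0 -> 2 + 0 <= ratio(SOCKET) = 2   ok
//     cores:   num=4, offset=2 -> 4 + 2 <= ratio(CORE)   = 8   ok
//     threads: num=1, offset=0 -> 1 + 0 <= ratio(THREAD) = 2   ok
//
// Had num + offset exceeded a detected ratio, AffHWSubsetManyGeneric would be
// warned and the whole subset ignored; absolute subsets skip this per-layer
// bound and are validated through the sub-id walk instead.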
-bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
+bool kmp_topology_t::is_close(int hwt1, int hwt2,
+ const kmp_affinity_t &stgs) const {
+ int hw_level = stgs.gran_levels;
if (hw_level >= depth)
return true;
bool retval = true;
const kmp_hw_thread_t &t1 = hw_threads[hwt1];
const kmp_hw_thread_t &t2 = hw_threads[hwt2];
+ if (stgs.flags.core_types_gran)
+ return t1.attrs.get_core_type() == t2.attrs.get_core_type();
+ if (stgs.flags.core_effs_gran)
+ return t1.attrs.get_core_eff() == t2.attrs.get_core_eff();
for (int i = 0; i < (depth - hw_level); ++i) {
if (t1.ids[i] != t2.ids[i])
return false;
@@ -1260,30 +1425,6 @@ bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
////////////////////////////////////////////////////////////////////////////////
-#if KMP_AFFINITY_SUPPORTED
-class kmp_affinity_raii_t {
- kmp_affin_mask_t *mask;
- bool restored;
-
-public:
- kmp_affinity_raii_t() : restored(false) {
- KMP_CPU_ALLOC(mask);
- KMP_ASSERT(mask != NULL);
- __kmp_get_system_affinity(mask, TRUE);
- }
- void restore() {
- __kmp_set_system_affinity(mask, TRUE);
- KMP_CPU_FREE(mask);
- restored = true;
- }
- ~kmp_affinity_raii_t() {
- if (!restored) {
- __kmp_set_system_affinity(mask, TRUE);
- KMP_CPU_FREE(mask);
- }
- }
-};
-
bool KMPAffinity::picked_api = false;
void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
@@ -1301,7 +1442,7 @@ void KMPAffinity::pick_api() {
// Only use Hwloc if affinity isn't explicitly disabled and
// user requests Hwloc topology method
if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
- __kmp_affinity_type != affinity_disabled) {
+ __kmp_affinity.type != affinity_disabled) {
affinity_dispatch = new KMPHwlocAffinity();
} else
#endif
@@ -1448,15 +1589,13 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
return buf;
}
-// Return (possibly empty) affinity mask representing the offline CPUs
-// Caller must free the mask
-kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
- kmp_affin_mask_t *offline;
- KMP_CPU_ALLOC(offline);
- KMP_CPU_ZERO(offline);
+static kmp_affin_mask_t *__kmp_parse_cpu_list(const char *path) {
+ kmp_affin_mask_t *mask;
+ KMP_CPU_ALLOC(mask);
+ KMP_CPU_ZERO(mask);
#if KMP_OS_LINUX
int n, begin_cpu, end_cpu;
- kmp_safe_raii_file_t offline_file;
+ kmp_safe_raii_file_t file;
auto skip_ws = [](FILE *f) {
int c;
do {
@@ -1465,29 +1604,29 @@ kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
if (c != EOF)
ungetc(c, f);
};
- // File contains CSV of integer ranges representing the offline CPUs
+ // File contains CSV of integer ranges representing the CPUs
// e.g., 1,2,4-7,9,11-15
- int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
+ int status = file.try_open(path, "r");
if (status != 0)
- return offline;
- while (!feof(offline_file)) {
- skip_ws(offline_file);
- n = fscanf(offline_file, "%d", &begin_cpu);
+ return mask;
+ while (!feof(file)) {
+ skip_ws(file);
+ n = fscanf(file, "%d", &begin_cpu);
if (n != 1)
break;
- skip_ws(offline_file);
- int c = fgetc(offline_file);
+ skip_ws(file);
+ int c = fgetc(file);
if (c == EOF || c == ',') {
// Just single CPU
end_cpu = begin_cpu;
} else if (c == '-') {
// Range of CPUs
- skip_ws(offline_file);
- n = fscanf(offline_file, "%d", &end_cpu);
+ skip_ws(file);
+ n = fscanf(file, "%d", &end_cpu);
if (n != 1)
break;
- skip_ws(offline_file);
- c = fgetc(offline_file); // skip ','
+ skip_ws(file);
+ c = fgetc(file); // skip ','
} else {
// Syntax problem
break;
@@ -1497,13 +1636,19 @@ kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
continue;
}
- // Insert [begin_cpu, end_cpu] into offline mask
+ // Insert [begin_cpu, end_cpu] into mask
for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
- KMP_CPU_SET(cpu, offline);
+ KMP_CPU_SET(cpu, mask);
}
}
#endif
- return offline;
+ return mask;
+}
+
+// Return (possibly empty) affinity mask representing the offline CPUs
+// Caller must free the mask
+kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
+ return __kmp_parse_cpu_list("/sys/devices/system/cpu/offline");
}
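// Hypothetical caller sketch (editor's illustration): the parser returns a
// freshly allocated mask (empty on non-Linux or on open failure) that the
// caller must free.
static void __sketch_remove_offline_cpus() {
  kmp_affin_mask_t *offline = __kmp_affinity_get_offline_cpus();
  for (int cpu = 0; cpu < __kmp_xproc; ++cpu)
    if (KMP_CPU_ISSET(cpu, offline))
      KMP_CPU_CLR(cpu, __kmp_affin_fullMask); // drop offline CPUs
  KMP_CPU_FREE(offline);
}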
// Return the number of available procs
@@ -1592,6 +1737,7 @@ static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
case HWLOC_OBJ_PU:
return KMP_HW_THREAD;
case HWLOC_OBJ_GROUP:
+#if HWLOC_API_VERSION >= 0x00020000
if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
return KMP_HW_DIE;
else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
@@ -1600,6 +1746,7 @@ static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
return KMP_HW_MODULE;
else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
return KMP_HW_PROC_GROUP;
+#endif
return KMP_HW_UNKNOWN;
#if HWLOC_API_VERSION >= 0x00020100
case HWLOC_OBJ_DIE:
@@ -1663,14 +1810,14 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hwloc_topology_t tp = __kmp_hwloc_topology;
*msg_id = kmp_i18n_null;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
}
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
// available from hwloc on the current thread, and __kmp_xproc.
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
// hwloc only guarantees existence of PU object, so check PACKAGE and CORE
hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
if (o != NULL)
@@ -1682,6 +1829,8 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
__kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
else
__kmp_nThreadsPerCore = 1; // no CORE found
+ if (__kmp_nThreadsPerCore == 0)
+ __kmp_nThreadsPerCore = 1;
__kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
if (nCoresPerPkg == 0)
nCoresPerPkg = 1; // to prevent possible division by 0
@@ -1689,6 +1838,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
return true;
}
+#if HWLOC_API_VERSION >= 0x00020400
// Handle multiple types of cores if they exist on the system
int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
@@ -1727,19 +1877,14 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
}
}
}
+#endif
root = hwloc_get_root_obj(tp);
// Figure out the depth and types in the topology
depth = 0;
- pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
- KMP_ASSERT(pu);
- obj = pu;
- types[depth] = KMP_HW_THREAD;
- hwloc_types[depth] = obj->type;
- depth++;
- while (obj != root && obj != NULL) {
- obj = obj->parent;
+ obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
+ while (obj && obj != root) {
#if HWLOC_API_VERSION >= 0x00020000
if (obj->memory_arity) {
hwloc_obj_t memory;
@@ -1761,6 +1906,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hwloc_types[depth] = obj->type;
depth++;
}
+ obj = obj->parent;
}
KMP_ASSERT(depth > 0);
@@ -1787,7 +1933,9 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread.clear();
hw_thread.ids[index] = pu->logical_index;
hw_thread.os_id = pu->os_index;
+ hw_thread.original_idx = hw_thread_index;
// If multiple core types, then set that attribute for the hardware thread
+#if HWLOC_API_VERSION >= 0x00020400
if (cpukinds) {
int cpukind_index = -1;
for (int i = 0; i < nr_cpu_kinds; ++i) {
@@ -1801,6 +1949,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
}
}
+#endif
index--;
}
obj = pu;
@@ -1825,7 +1974,6 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread.ids[index + 1] = sub_id;
index--;
}
- prev = memory;
}
prev = obj;
}
@@ -1845,12 +1993,14 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread_index++;
}
+#if HWLOC_API_VERSION >= 0x00020400
// Free the core types information
if (cpukinds) {
for (int idx = 0; idx < nr_cpu_kinds; ++idx)
hwloc_bitmap_free(cpukinds[idx].mask);
__kmp_free(cpukinds);
}
+#endif
__kmp_topology->sort_ids();
return true;
}
@@ -1864,15 +2014,15 @@ static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
int depth = 3;
kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
}
- // Even if __kmp_affinity_type == affinity_none, this routine might still
- // called to set __kmp_ncores, as well as
+ // Even if __kmp_affinity.type == affinity_none, this routine might still
+ // be called to set __kmp_ncores, as well as
// __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
if (!KMP_AFFINITY_CAPABLE()) {
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
__kmp_ncores = nPackages = __kmp_xproc;
__kmp_nThreadsPerCore = nCoresPerPkg = 1;
return true;
@@ -1897,12 +2047,13 @@ static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
hw_thread.clear();
hw_thread.os_id = i;
+ hw_thread.original_idx = avail_ct;
hw_thread.ids[0] = i;
hw_thread.ids[1] = 0;
hw_thread.ids[2] = 0;
avail_ct++;
}
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
}
return true;
@@ -1919,13 +2070,13 @@ static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
}
// If we aren't affinity capable, then use flat topology
if (!KMP_AFFINITY_CAPABLE()) {
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
nPackages = __kmp_num_proc_groups;
__kmp_nThreadsPerCore = 1;
__kmp_ncores = __kmp_xproc;
@@ -1942,11 +2093,13 @@ static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
continue;
}
- kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
hw_thread.clear();
hw_thread.os_id = i;
+ hw_thread.original_idx = avail_ct;
hw_thread.ids[0] = i / BITS_PER_GROUP;
hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
+ avail_ct++;
}
return true;
}
@@ -2002,15 +2155,43 @@ static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
return 0;
}
-class kmp_cache_info_t {
+class cpuid_cache_info_t {
public:
struct info_t {
- unsigned level, mask;
+ unsigned level = 0;
+ unsigned mask = 0;
+ bool operator==(const info_t &rhs) const {
+ return level == rhs.level && mask == rhs.mask;
+ }
+ bool operator!=(const info_t &rhs) const { return !operator==(rhs); }
};
- kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
+ cpuid_cache_info_t() : depth(0) {
+ table[MAX_CACHE_LEVEL].level = 0;
+ table[MAX_CACHE_LEVEL].mask = 0;
+ }
size_t get_depth() const { return depth; }
info_t &operator[](size_t index) { return table[index]; }
const info_t &operator[](size_t index) const { return table[index]; }
+ bool operator==(const cpuid_cache_info_t &rhs) const {
+ if (rhs.depth != depth)
+ return false;
+ for (size_t i = 0; i < depth; ++i)
+ if (table[i] != rhs.table[i])
+ return false;
+ return true;
+ }
+ bool operator!=(const cpuid_cache_info_t &rhs) const {
+ return !operator==(rhs);
+ }
+ // Get cache information associated with L1, L2, L3 cache, etc.
+ // If level does not exist, then return the "NULL" level (level 0)
+ const info_t &get_level(unsigned level) const {
+ for (size_t i = 0; i < depth; ++i) {
+ if (table[i].level == level)
+ return table[i];
+ }
+ return table[MAX_CACHE_LEVEL];
+ }
static kmp_hw_t get_topology_type(unsigned level) {
KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
@@ -2024,13 +2205,6 @@ public:
}
return KMP_HW_UNKNOWN;
}
-
-private:
- static const int MAX_CACHE_LEVEL = 3;
-
- size_t depth;
- info_t table[MAX_CACHE_LEVEL];
-
void get_leaf4_levels() {
unsigned level = 0;
while (depth < MAX_CACHE_LEVEL) {
@@ -2055,6 +2229,11 @@ private:
level++;
}
}
+ static const int MAX_CACHE_LEVEL = 3;
+
+private:
+ size_t depth;
+ info_t table[MAX_CACHE_LEVEL + 1];
};
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
@@ -2065,7 +2244,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
kmp_cpuid buf;
*msg_id = kmp_i18n_null;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
}
@@ -2084,7 +2263,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
// available from cpuid on the current thread, and __kmp_xproc.
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
// Get an upper bound on the number of threads per package using cpuid(1).
// On some OS/chip combinations where HT is supported by the chip but is
@@ -2136,7 +2315,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
// From here on, we can assume that it is safe to call
// __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
- // __kmp_affinity_type = affinity_none.
+ // __kmp_affinity.type = affinity_none.
// Save the affinity mask for the current thread.
kmp_affinity_raii_t previous_affinity;
@@ -2362,6 +2541,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
hw_thread.ids[idx++] = threadInfo[i].threadId;
}
hw_thread.os_id = os;
+ hw_thread.original_idx = i;
}
__kmp_free(threadInfo);
@@ -2417,15 +2597,13 @@ enum {
INTEL_LEVEL_TYPE_INVALID = 0, // Package level
INTEL_LEVEL_TYPE_SMT = 1,
INTEL_LEVEL_TYPE_CORE = 2,
- INTEL_LEVEL_TYPE_TILE = 3,
- INTEL_LEVEL_TYPE_MODULE = 4,
+ INTEL_LEVEL_TYPE_MODULE = 3,
+ INTEL_LEVEL_TYPE_TILE = 4,
INTEL_LEVEL_TYPE_DIE = 5,
INTEL_LEVEL_TYPE_LAST = 6,
};
-
-struct cpuid_level_info_t {
- unsigned level_type, mask, mask_width, nitems, cache_mask;
-};
+KMP_BUILD_ASSERT(INTEL_LEVEL_TYPE_LAST < sizeof(unsigned) * CHAR_BIT);
+#define KMP_LEAF_1F_KNOWN_LEVELS ((1u << INTEL_LEVEL_TYPE_LAST) - 1u)
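// Worked value: INTEL_LEVEL_TYPE_LAST == 6, so the mask is
// (1u << 6) - 1u == 0x3F, covering bits 0..5 (every enumerator above).
// A CPUID-reported level type is "known" iff its bit is set:
//   KMP_LEAF_1F_KNOWN_LEVELS & (1u << level_type)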
static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
switch (intel_type) {
@@ -2445,16 +2623,78 @@ static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
return KMP_HW_UNKNOWN;
}
-// This function takes the topology leaf, a levels array to store the levels
-// detected and a bitmap of the known levels.
-// Returns the number of levels in the topology
-static unsigned
-__kmp_x2apicid_get_levels(int leaf,
- cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
- kmp_uint64 known_levels) {
+static int __kmp_topology_type_2_intel_type(kmp_hw_t type) {
+ switch (type) {
+ case KMP_HW_SOCKET:
+ return INTEL_LEVEL_TYPE_INVALID;
+ case KMP_HW_THREAD:
+ return INTEL_LEVEL_TYPE_SMT;
+ case KMP_HW_CORE:
+ return INTEL_LEVEL_TYPE_CORE;
+ case KMP_HW_TILE:
+ return INTEL_LEVEL_TYPE_TILE;
+ case KMP_HW_MODULE:
+ return INTEL_LEVEL_TYPE_MODULE;
+ case KMP_HW_DIE:
+ return INTEL_LEVEL_TYPE_DIE;
+ default:
+ return INTEL_LEVEL_TYPE_INVALID;
+ }
+}
+
+struct cpuid_level_info_t {
+ unsigned level_type, mask, mask_width, nitems, cache_mask;
+};
+
+class cpuid_topo_desc_t {
+ unsigned desc = 0;
+
+public:
+ void clear() { desc = 0; }
+ bool contains(int intel_type) const {
+ KMP_DEBUG_ASSERT(intel_type >= 0 && intel_type < INTEL_LEVEL_TYPE_LAST);
+ if ((1u << intel_type) & desc)
+ return true;
+ return false;
+ }
+ bool contains_topology_type(kmp_hw_t type) const {
+ KMP_DEBUG_ASSERT(type >= 0 && type < KMP_HW_LAST);
+ int intel_type = __kmp_topology_type_2_intel_type(type);
+ return contains(intel_type);
+ }
+ bool contains(cpuid_topo_desc_t rhs) const {
+ return ((desc | rhs.desc) == desc);
+ }
+ void add(int intel_type) { desc |= (1u << intel_type); }
+ void add(cpuid_topo_desc_t rhs) { desc |= rhs.desc; }
+};
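// Editor's sketch of the descriptor's set algebra (values hypothetical):
//
//   cpuid_topo_desc_t machine, proc;
//   machine.add(INTEL_LEVEL_TYPE_SMT);
//   machine.add(INTEL_LEVEL_TYPE_CORE);
//   proc.add(INTEL_LEVEL_TYPE_CORE);
//   machine.contains(proc);   // true: {CORE} is a subset of {SMT, CORE}
//   proc.add(INTEL_LEVEL_TYPE_MODULE);
//   machine.contains(proc);   // false: MODULE missing from machine
//   machine.add(proc);        // fold the union back into the total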
+
+struct cpuid_proc_info_t {
+ // Topology info
+ int os_id;
+ unsigned apic_id;
+ unsigned depth;
+ // Hybrid info
+ unsigned native_model_id;
+ int efficiency;
+ kmp_hw_core_type_t type;
+ cpuid_topo_desc_t description;
+
+ cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
+};
+
+// This function takes the topology leaf, an info pointer to store the levels
+// detected, and writable descriptors for the total topology.
+// Returns whether total types, depth, or description were modified.
+static bool __kmp_x2apicid_get_levels(int leaf, cpuid_proc_info_t *info,
+ kmp_hw_t total_types[KMP_HW_LAST],
+ int *total_depth,
+ cpuid_topo_desc_t *total_description) {
unsigned level, levels_index;
unsigned level_type, mask_width, nitems;
kmp_cpuid buf;
+ cpuid_level_info_t(&levels)[INTEL_LEVEL_TYPE_LAST] = info->levels;
+ bool retval = false;
// New algorithm has known topology layers act as highest unknown topology
// layers when unknown topology layers exist.
@@ -2469,10 +2709,12 @@ __kmp_x2apicid_get_levels(int leaf,
level_type = __kmp_extract_bits<8, 15>(buf.ecx);
mask_width = __kmp_extract_bits<0, 4>(buf.eax);
nitems = __kmp_extract_bits<0, 15>(buf.ebx);
- if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
- return 0;
+ if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) {
+ info->depth = 0;
+ return retval;
+ }
- if (known_levels & (1ull << level_type)) {
+ if (KMP_LEAF_1F_KNOWN_LEVELS & (1u << level_type)) {
// Add a new level to the topology
KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
levels[levels_index].level_type = level_type;
@@ -2488,6 +2730,26 @@ __kmp_x2apicid_get_levels(int leaf,
}
level++;
} while (level_type != INTEL_LEVEL_TYPE_INVALID);
+ KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
+ info->description.clear();
+ info->depth = levels_index;
+
+ // If types, depth, and total_description are uninitialized,
+ // then initialize them now
+ if (*total_depth == 0) {
+ *total_depth = info->depth;
+ total_description->clear();
+ for (int i = *total_depth - 1, j = 0; i >= 0; --i, ++j) {
+ total_types[j] =
+ __kmp_intel_type_2_topology_type(info->levels[i].level_type);
+ total_description->add(info->levels[i].level_type);
+ }
+ retval = true;
+ }
+
+ // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
+ if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
+ return 0;
// Set the masks to & with apicid
for (unsigned i = 0; i < levels_index; ++i) {
@@ -2497,42 +2759,65 @@ __kmp_x2apicid_get_levels(int leaf,
for (unsigned j = 0; j < i; ++j)
levels[i].mask ^= levels[j].mask;
} else {
- KMP_DEBUG_ASSERT(levels_index > 0);
+ KMP_DEBUG_ASSERT(i > 0);
levels[i].mask = (-1) << levels[i - 1].mask_width;
levels[i].cache_mask = 0;
}
+ info->description.add(info->levels[i].level_type);
}
- return levels_index;
+
+ // If this processor has level type not on other processors, then make
+ // sure to include it in total types, depth, and description.
+ // One assumption here is that the first type, i.e. socket, is known.
+ // Another assumption is that types array is always large enough to fit any
+ // new layers since its length is KMP_HW_LAST.
+ if (!total_description->contains(info->description)) {
+ for (int i = info->depth - 1, j = 0; i >= 0; --i, ++j) {
+ // If this level is known already, then skip it.
+ if (total_description->contains(levels[i].level_type))
+ continue;
+ // Unknown level, insert before last known level
+ kmp_hw_t curr_type =
+ __kmp_intel_type_2_topology_type(levels[i].level_type);
+ KMP_ASSERT(j != 0 && "Bad APIC Id information");
+ // Move over all known levels to make room for new level
+ for (int k = info->depth - 1; k >= j; --k) {
+ KMP_DEBUG_ASSERT(k + 1 < KMP_HW_LAST);
+ total_types[k + 1] = total_types[k];
+ }
+ // Insert new level
+ total_types[j] = curr_type;
+ (*total_depth)++;
+ }
+ total_description->add(info->description);
+ retval = true;
+ }
+ return retval;
}
static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
- cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
- unsigned levels_index;
kmp_cpuid buf;
- kmp_uint64 known_levels;
- int topology_leaf, highest_leaf, apic_id;
+ int topology_leaf, highest_leaf;
int num_leaves;
+ int depth = 0;
+ cpuid_topo_desc_t total_description;
static int leaves[] = {0, 0};
- kmp_i18n_id_t leaf_message_id;
+ // If affinity is disabled, __kmp_avail_proc may be zero
+ int ninfos = (__kmp_avail_proc > 0 ? __kmp_avail_proc : 1);
+ cpuid_proc_info_t *proc_info = (cpuid_proc_info_t *)__kmp_allocate(
+ (sizeof(cpuid_proc_info_t) + sizeof(cpuid_cache_info_t)) * ninfos);
+ cpuid_cache_info_t *cache_info = (cpuid_cache_info_t *)(proc_info + ninfos);
- KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
+ kmp_i18n_id_t leaf_message_id;
*msg_id = kmp_i18n_null;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
}
- // Figure out the known topology levels
- known_levels = 0ull;
- for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
- if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
- known_levels |= (1ull << i);
- }
- }
-
// Get the highest cpuid leaf supported
__kmp_x86_cpuid(0, 0, &buf);
highest_leaf = buf.eax;
@@ -2566,16 +2851,18 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
if (buf.ebx == 0)
continue;
topology_leaf = leaf;
- levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
- if (levels_index == 0)
+ __kmp_x2apicid_get_levels(leaf, &proc_info[0], types, &depth,
+ &total_description);
+ if (depth == 0)
continue;
break;
}
- if (topology_leaf == -1 || levels_index == 0) {
+ if (topology_leaf == -1 || depth == 0) {
*msg_id = leaf_message_id;
+ __kmp_free(proc_info);
return false;
}
- KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
+ KMP_ASSERT(depth <= INTEL_LEVEL_TYPE_LAST);
// The algorithm used starts by setting the affinity to each available thread
// and retrieving info from the cpuid instruction, so if we are not capable of
@@ -2585,46 +2872,23 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
// available from cpuid on the current thread, and __kmp_xproc.
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
- for (unsigned i = 0; i < levels_index; ++i) {
- if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
- __kmp_nThreadsPerCore = levels[i].nitems;
- } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
- nCoresPerPkg = levels[i].nitems;
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
+ for (int i = 0; i < depth; ++i) {
+ if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
+ __kmp_nThreadsPerCore = proc_info[0].levels[i].nitems;
+ } else if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
+ nCoresPerPkg = proc_info[0].levels[i].nitems;
}
}
__kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+ __kmp_free(proc_info);
return true;
}
- // Allocate the data structure to be returned.
- int depth = levels_index;
- for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
- types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
- __kmp_topology =
- kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
-
- // Insert equivalent cache types if they exist
- kmp_cache_info_t cache_info;
- for (size_t i = 0; i < cache_info.get_depth(); ++i) {
- const kmp_cache_info_t::info_t &info = cache_info[i];
- unsigned cache_mask = info.mask;
- unsigned cache_level = info.level;
- for (unsigned j = 0; j < levels_index; ++j) {
- unsigned hw_cache_mask = levels[j].cache_mask;
- kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
- if (hw_cache_mask == cache_mask && j < levels_index - 1) {
- kmp_hw_t type =
- __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
- __kmp_topology->set_equivalent_type(cache_type, type);
- }
- }
- }
-
// From here on, we can assume that it is safe to call
// __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
- // __kmp_affinity_type = affinity_none.
+ // __kmp_affinity.type = affinity_none.
// Save the affinity mask for the current thread.
kmp_affinity_raii_t previous_affinity;
@@ -2633,56 +2897,167 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
// to it, and obtaining the pertinent information using the cpuid instr.
unsigned int proc;
int hw_thread_index = 0;
- KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
- cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
- unsigned my_levels_index;
+ bool uniform_caches = true;
+ KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
// Skip this proc if it is not included in the machine model.
if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
continue;
}
KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
+ // Gather topology information
__kmp_affinity_dispatch->bind_thread(proc);
-
- // New algorithm
__kmp_x86_cpuid(topology_leaf, 0, &buf);
- apic_id = buf.edx;
- kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
- my_levels_index =
- __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
- if (my_levels_index == 0 || my_levels_index != levels_index) {
+ proc_info[hw_thread_index].os_id = proc;
+ proc_info[hw_thread_index].apic_id = buf.edx;
+ __kmp_x2apicid_get_levels(topology_leaf, &proc_info[hw_thread_index], types,
+ &depth, &total_description);
+ if (proc_info[hw_thread_index].depth == 0) {
*msg_id = kmp_i18n_str_InvalidCpuidInfo;
+ __kmp_free(proc_info);
return false;
}
- hw_thread.clear();
- hw_thread.os_id = proc;
- // Put in topology information
- for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
- hw_thread.ids[idx] = apic_id & my_levels[j].mask;
- if (j > 0) {
- hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
- }
- }
+ // Gather cache information and insert afterwards
+ cache_info[hw_thread_index].get_leaf4_levels();
+ if (uniform_caches && hw_thread_index > 0)
+ if (cache_info[0] != cache_info[hw_thread_index])
+ uniform_caches = false;
// Hybrid information
if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
- kmp_hw_core_type_t type;
- unsigned native_model_id;
- int efficiency;
- __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
- hw_thread.attrs.set_core_type(type);
- hw_thread.attrs.set_core_eff(efficiency);
+ __kmp_get_hybrid_info(&proc_info[hw_thread_index].type,
+ &proc_info[hw_thread_index].efficiency,
+ &proc_info[hw_thread_index].native_model_id);
}
hw_thread_index++;
}
KMP_ASSERT(hw_thread_index > 0);
+ previous_affinity.restore();
+
+ // Allocate the data structure to be returned.
+ __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
+
+ // Create topology Ids and hybrid types in __kmp_topology
+ for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ hw_thread.clear();
+ hw_thread.os_id = proc_info[i].os_id;
+ hw_thread.original_idx = i;
+ unsigned apic_id = proc_info[i].apic_id;
+ // Put in topology information
+ for (int j = 0, idx = depth - 1; j < depth; ++j, --idx) {
+ if (!(proc_info[i].description.contains_topology_type(
+ __kmp_topology->get_type(j)))) {
+ hw_thread.ids[idx] = kmp_hw_thread_t::UNKNOWN_ID;
+ } else {
+ hw_thread.ids[idx] = apic_id & proc_info[i].levels[j].mask;
+ if (j > 0) {
+ hw_thread.ids[idx] >>= proc_info[i].levels[j - 1].mask_width;
+ }
+ }
+ }
+ hw_thread.attrs.set_core_type(proc_info[i].type);
+ hw_thread.attrs.set_core_eff(proc_info[i].efficiency);
+ }
+
__kmp_topology->sort_ids();
+
+ // Change Ids to logical Ids
+ for (int j = 0; j < depth - 1; ++j) {
+ int new_id = 0;
+ int prev_id = __kmp_topology->at(0).ids[j];
+ int curr_id = __kmp_topology->at(0).ids[j + 1];
+ __kmp_topology->at(0).ids[j + 1] = new_id;
+ for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ if (hw_thread.ids[j] == prev_id && hw_thread.ids[j + 1] == curr_id) {
+ hw_thread.ids[j + 1] = new_id;
+ } else if (hw_thread.ids[j] == prev_id &&
+ hw_thread.ids[j + 1] != curr_id) {
+ curr_id = hw_thread.ids[j + 1];
+ hw_thread.ids[j + 1] = ++new_id;
+ } else {
+ prev_id = hw_thread.ids[j];
+ curr_id = hw_thread.ids[j + 1];
+ hw_thread.ids[j + 1] = ++new_id;
+ }
+ }
+ }
+
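// Editor's worked example of the renumbering above: with level j holding
// package ids and level j+1 holding raw APIC-derived core ids,
//   (pkg, core): (0,4) (0,4) (0,12) (1,4) (1,12)
// the pass rewrites level j+1 to monotonically increasing logical ids:
//   (0,0) (0,0) (0,1) (1,2) (1,3)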
+ // First check for easy cache placement. This occurs when caches are
+ // equivalent to a layer in the CPUID leaf 0xb or 0x1f topology.
+ if (uniform_caches) {
+ for (size_t i = 0; i < cache_info[0].get_depth(); ++i) {
+ unsigned cache_mask = cache_info[0][i].mask;
+ unsigned cache_level = cache_info[0][i].level;
+ KMP_ASSERT(cache_level <= cpuid_cache_info_t::MAX_CACHE_LEVEL);
+ kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(cache_level);
+ __kmp_topology->set_equivalent_type(cache_type, cache_type);
+ for (int j = 0; j < depth; ++j) {
+ unsigned hw_cache_mask = proc_info[0].levels[j].cache_mask;
+ if (hw_cache_mask == cache_mask && j < depth - 1) {
+ kmp_hw_t type = __kmp_intel_type_2_topology_type(
+ proc_info[0].levels[j + 1].level_type);
+ __kmp_topology->set_equivalent_type(cache_type, type);
+ }
+ }
+ }
+ } else {
+ // If caches are non-uniform, then record which caches exist.
+ for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ for (size_t j = 0; j < cache_info[i].get_depth(); ++j) {
+ unsigned cache_level = cache_info[i][j].level;
+ kmp_hw_t cache_type =
+ cpuid_cache_info_t::get_topology_type(cache_level);
+ if (__kmp_topology->get_equivalent_type(cache_type) == KMP_HW_UNKNOWN)
+ __kmp_topology->set_equivalent_type(cache_type, cache_type);
+ }
+ }
+ }
+
+ // See if any cache level needs to be added manually through cache Ids
+ bool unresolved_cache_levels = false;
+ for (unsigned level = 1; level <= cpuid_cache_info_t::MAX_CACHE_LEVEL;
+ ++level) {
+ kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(level);
+ // This also filters out caches which may not be in the topology
+ // since the equivalent type might be KMP_HW_UNKNOWN.
+ if (__kmp_topology->get_equivalent_type(cache_type) == cache_type) {
+ unresolved_cache_levels = true;
+ break;
+ }
+ }
+
+ // Insert unresolved cache layers into machine topology using cache Ids
+ if (unresolved_cache_levels) {
+ int num_hw_threads = __kmp_topology->get_num_hw_threads();
+ int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
+ for (unsigned l = 1; l <= cpuid_cache_info_t::MAX_CACHE_LEVEL; ++l) {
+ kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(l);
+ if (__kmp_topology->get_equivalent_type(cache_type) != cache_type)
+ continue;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ int original_idx = __kmp_topology->at(i).original_idx;
+ ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ const cpuid_cache_info_t::info_t &info =
+ cache_info[original_idx].get_level(l);
+ // If this cache level is not in the topology for this processor, skip it.
+ if (info.level == 0)
+ continue;
+ ids[i] = info.mask & proc_info[original_idx].apic_id;
+ }
+ __kmp_topology->insert_layer(cache_type, ids);
+ }
+ }
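// Editor's sketch (invented values): the cache id computed above is just the
// APIC id with the cache's coverage bits cleared, so every hardware thread
// sharing that cache collapses to the same id:
unsigned apic_id_ex = 0x2D; // 0b101101
unsigned l2_mask_ex = ~0u << 2; // an L2 shared by 4 consecutive APIC ids
unsigned l2_id_ex = apic_id_ex & l2_mask_ex; // 0x2C for all four sharers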
+
if (!__kmp_topology->check_ids()) {
kmp_topology_t::deallocate(__kmp_topology);
__kmp_topology = nullptr;
*msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
+ __kmp_free(proc_info);
return false;
}
+ __kmp_free(proc_info);
return true;
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
@@ -2716,14 +3091,16 @@ static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
// Set the array sizes for the hierarchy layers
static void __kmp_dispatch_set_hierarchy_values() {
// Set the maximum number of L1's to number of cores
- // Set the maximum number of L2's to to either number of cores / 2 for
+ // Set the maximum number of L2's to either number of cores / 2 for
// Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing
// Or the number of cores for Intel(R) Xeon(R) processors
// Set the maximum number of NUMA nodes and L3's to number of packages
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
-#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
+#if KMP_ARCH_X86_64 && \
+ (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_WINDOWS) && \
KMP_MIC_SUPPORTED
if (__kmp_mic_type >= mic3)
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
@@ -2738,7 +3115,9 @@ static void __kmp_dispatch_set_hierarchy_values() {
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
__kmp_nThreadsPerCore;
-#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
+#if KMP_ARCH_X86_64 && \
+ (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_WINDOWS) && \
KMP_MIC_SUPPORTED
if (__kmp_mic_type >= mic3)
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
@@ -2800,15 +3179,51 @@ static inline const char *__kmp_cpuinfo_get_envvar() {
return envvar;
}
+static bool __kmp_package_id_from_core_siblings_list(unsigned **threadInfo,
+ unsigned num_avail,
+ unsigned idx) {
+ if (!KMP_AFFINITY_CAPABLE())
+ return false;
+
+ char path[256];
+ KMP_SNPRINTF(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%u/topology/core_siblings_list",
+ threadInfo[idx][osIdIndex]);
+ kmp_affin_mask_t *siblings = __kmp_parse_cpu_list(path);
+ for (unsigned i = 0; i < num_avail; ++i) {
+ unsigned cpu_id = threadInfo[i][osIdIndex];
+ KMP_ASSERT(cpu_id < __kmp_affin_mask_size * CHAR_BIT);
+ if (!KMP_CPU_ISSET(cpu_id, siblings))
+ continue;
+ if (threadInfo[i][pkgIdIndex] == UINT_MAX) {
+ // Arbitrarily pick the first index we encounter; it only matters that
+ // the value is the same for all siblings.
+ threadInfo[i][pkgIdIndex] = idx;
+ } else if (threadInfo[i][pkgIdIndex] != idx) {
+ // Contradictory sibling lists.
+ KMP_CPU_FREE(siblings);
+ return false;
+ }
+ }
+ KMP_ASSERT(threadInfo[idx][pkgIdIndex] != UINT_MAX);
+ KMP_CPU_FREE(siblings);
+ return true;
+}
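// Editor's note: core_siblings_list holds a human-readable cpulist such as
// "0-7,16-23"; __kmp_parse_cpu_list is assumed to turn it into an affinity
// mask. A hedged, standalone parser for the same format, for illustration
// only (not the runtime's real parser):
#include <cstdio>
#include <set>
static std::set<int> parse_cpu_list_ex(const char *s) {
  std::set<int> cpus;
  int lo, hi;
  while (*s) {
    if (std::sscanf(s, "%d-%d", &lo, &hi) == 2) {
      for (int c = lo; c <= hi; ++c)
        cpus.insert(c); // a "lo-hi" range token
    } else if (std::sscanf(s, "%d", &lo) == 1) {
      cpus.insert(lo); // a single-cpu token
    }
    while (*s && *s != ',')
      ++s; // advance to the next comma-separated token
    if (*s == ',')
      ++s;
  }
  return cpus;
}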
+
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
-// affinity map.
+// affinity map. On AIX, the map is obtained through system SRAD (Scheduler
+// Resource Allocation Domain).
static bool __kmp_affinity_create_cpuinfo_map(int *line,
kmp_i18n_id_t *const msg_id) {
+ *msg_id = kmp_i18n_null;
+
+#if KMP_OS_AIX
+ unsigned num_records = __kmp_xproc;
+#else
const char *filename = __kmp_cpuinfo_get_filename();
const char *envvar = __kmp_cpuinfo_get_envvar();
- *msg_id = kmp_i18n_null;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
}
@@ -2865,6 +3280,7 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
*msg_id = kmp_i18n_str_CantRewindCpuinfo;
return false;
}
+#endif // KMP_OS_AIX
// Allocate the array of records to store the proc info in. The dummy
// element at the end makes the logic in filling them out easier to code.
@@ -2894,8 +3310,96 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
INIT_PROC_INFO(threadInfo[i]);
}
+#if KMP_OS_AIX
+ int smt_threads;
+ lpar_info_format1_t cpuinfo;
+ unsigned num_avail = __kmp_xproc;
+
+ if (__kmp_affinity.flags.verbose)
+ KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology");
+
+ // Get the number of SMT threads per core.
+ smt_threads = syssmt(GET_NUMBER_SMT_SETS, 0, 0, NULL);
+
+ // Allocate a resource set containing available system resources.
+ rsethandle_t sys_rset = rs_alloc(RS_SYSTEM);
+ if (sys_rset == NULL) {
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+ // Allocate a resource set for the SRAD info.
+ rsethandle_t srad = rs_alloc(RS_EMPTY);
+ if (srad == NULL) {
+ rs_free(sys_rset);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+
+ // Get the SRAD system detail level.
+ int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0);
+ if (sradsdl < 0) {
+ rs_free(sys_rset);
+ rs_free(srad);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+ // Get the number of RADs at that SRAD SDL.
+ int num_rads = rs_numrads(sys_rset, sradsdl, 0);
+ if (num_rads < 0) {
+ rs_free(sys_rset);
+ rs_free(srad);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+
+ // Get the maximum number of procs that may be contained in a resource set.
+ int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0);
+ if (max_procs < 0) {
+ rs_free(sys_rset);
+ rs_free(srad);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+
+ int cur_rad = 0;
+ int num_set = 0;
+ for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS;
+ ++srad_idx) {
+ // Check if the SRAD is available in the RSET.
+ if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0)
+ continue;
+
+ for (int cpu = 0; cpu < max_procs; cpu++) {
+ // Set the info for the cpu if it is in the SRAD.
+ if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) {
+ threadInfo[cpu][osIdIndex] = cpu;
+ threadInfo[cpu][pkgIdIndex] = cur_rad;
+ threadInfo[cpu][coreIdIndex] = cpu / smt_threads;
+ ++num_set;
+ if (num_set >= num_avail) {
+ // Done if all available CPUs have been set.
+ break;
+ }
+ }
+ }
+ ++cur_rad;
+ }
+ rs_free(sys_rset);
+ rs_free(srad);
+
+ // The topology is already sorted.
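// Editor's worked example (invented numbers) for the SRAD mapping above: with
// smt_threads == 8, CPU 17 found while scanning the second SRAD (cur_rad == 1)
// is recorded as osId 17, pkgId 1, and coreId 17 / 8 == 2.
int smt_threads_ex = 8, cpu_ex = 17;
int core_id_aix_ex = cpu_ex / smt_threads_ex; // -> 2; pkgId would be cur_rad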
+
+#else // !KMP_OS_AIX
unsigned num_avail = 0;
*line = 0;
+#if KMP_ARCH_S390X
+ bool reading_s390x_sys_info = true;
+#endif
while (!feof(f)) {
// Create an inner scoping level, so that all the goto targets at the end of
// the loop appear in an outer scoping level. This avoids warnings about
@@ -2931,7 +3435,31 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
}
(*line)++;
+#if KMP_ARCH_LOONGARCH64
+ // The parsing logic of /proc/cpuinfo in this function highly depends on
+ // the blank lines between each processor info block. But on LoongArch a
+ // blank line exists before the first processor info block (i.e. after the
+ // "system type" line). This blank line was added because the "system
+ // type" line is unrelated to any of the CPUs. We must skip this line so
+ // that the original logic works on LoongArch.
+ if (*buf == '\n' && *line == 2)
+ continue;
+#endif
+#if KMP_ARCH_S390X
+ // s390x /proc/cpuinfo starts with a variable number of lines containing
+ // the overall system information. Skip them.
+ if (reading_s390x_sys_info) {
+ if (*buf == '\n')
+ reading_s390x_sys_info = false;
+ continue;
+ }
+#endif
+
+#if KMP_ARCH_S390X
+ char s1[] = "cpu number";
+#else
char s1[] = "processor";
+#endif
if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
CHECK_LINE;
char *p = strchr(buf + sizeof(s1) - 1, ':');
@@ -2957,6 +3485,23 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
threadInfo[num_avail][osIdIndex]);
__kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
+#if KMP_ARCH_S390X
+ // Disambiguate physical_package_id.
+ unsigned book_id;
+ KMP_SNPRINTF(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%u/topology/book_id",
+ threadInfo[num_avail][osIdIndex]);
+ __kmp_read_from_file(path, "%u", &book_id);
+ threadInfo[num_avail][pkgIdIndex] |= (book_id << 8);
+
+ unsigned drawer_id;
+ KMP_SNPRINTF(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%u/topology/drawer_id",
+ threadInfo[num_avail][osIdIndex]);
+ __kmp_read_from_file(path, "%u", &drawer_id);
+ threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16);
+#endif
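// Editor's sketch (invented ids): the packing above keeps the physical
// package id in bits 0-7, moves the book id to bits 8-15 and the drawer id to
// bits 16-23, assuming each raw id fits in 8 bits:
unsigned pkg_ex = 3, book_ex = 1, drawer_ex = 2;
unsigned packed_ex = pkg_ex | (book_ex << 8) | (drawer_ex << 16); // 0x020103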
+
KMP_SNPRINTF(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/topology/core_id",
threadInfo[num_avail][osIdIndex]);
@@ -3040,21 +3585,17 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
return false;
}
- // Check for missing fields. The osId field must be there, and we
- // currently require that the physical id field is specified, also.
+ // Check for missing fields. The osId field must be there. The physical
+ // id field will be checked later.
if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_MissingProcField;
return false;
}
- if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
- CLEANUP_THREAD_INFO;
- *msg_id = kmp_i18n_str_MissingPhysicalIDField;
- return false;
- }
// Skip this proc if it is not included in the machine model.
- if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
+ if (KMP_AFFINITY_CAPABLE() &&
+ !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
__kmp_affin_fullMask)) {
INIT_PROC_INFO(threadInfo[num_avail]);
continue;
@@ -3080,6 +3621,18 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
}
*line = 0;
+ // At least on powerpc, Linux may return -1 for physical_package_id. Try
+ // to reconstruct topology from core_siblings_list in that case.
+ for (i = 0; i < num_avail; ++i) {
+ if (threadInfo[i][pkgIdIndex] == UINT_MAX) {
+ if (!__kmp_package_id_from_core_siblings_list(threadInfo, num_avail, i)) {
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_MissingPhysicalIDField;
+ return false;
+ }
+ }
+ }
+
#if KMP_MIC && REDUCE_TEAM_SIZE
unsigned teamSize = 0;
#endif // KMP_MIC && REDUCE_TEAM_SIZE
@@ -3096,6 +3649,8 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
qsort(threadInfo, num_avail, sizeof(*threadInfo),
__kmp_affinity_cmp_ProcCpuInfo_phys_id);
+#endif // KMP_OS_AIX
+
// The table is now sorted by pkgId / coreId / threadId, but we really don't
// know the radix of any of the fields. pkgId's may be sparsely assigned among
// the chips on a system. Although coreId's are usually assigned
@@ -3210,7 +3765,7 @@ restart_radix_check:
return false;
}
- // If the thread ids were not specified and we see entries entries that
+ // If the thread ids were not specified and we see entries that
// are duplicates, start the loop over and assign the thread ids manually.
assign_thread_ids = true;
goto restart_radix_check;
@@ -3239,7 +3794,7 @@ restart_radix_check:
// not enabled.
__kmp_ncores = totals[coreIdIndex];
if (!KMP_AFFINITY_CAPABLE()) {
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(__kmp_affinity.type == affinity_none);
return true;
}
@@ -3301,10 +3856,10 @@ restart_radix_check:
for (i = 0; i < num_avail; ++i) {
unsigned os = threadInfo[i][osIdIndex];
int src_index;
- int dst_index = 0;
kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
hw_thread.clear();
hw_thread.os_id = os;
+ hw_thread.original_idx = i;
idx = 0;
for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
@@ -3318,7 +3873,6 @@ restart_radix_check:
} else if (src_index == threadIdIndex) {
hw_thread.ids[threadLevel] = threadInfo[i][src_index];
}
- dst_index++;
}
}
@@ -3329,6 +3883,32 @@ restart_radix_check:
__kmp_free(counts);
CLEANUP_THREAD_INFO;
__kmp_topology->sort_ids();
+
+ int tlevel = __kmp_topology->get_level(KMP_HW_THREAD);
+ if (tlevel > 0) {
+ // If the thread level does not have ids, then put them in.
+ if (__kmp_topology->at(0).ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID) {
+ __kmp_topology->at(0).ids[tlevel] = 0;
+ }
+ for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ if (hw_thread.ids[tlevel] != kmp_hw_thread_t::UNKNOWN_ID)
+ continue;
+ kmp_hw_thread_t &prev_hw_thread = __kmp_topology->at(i - 1);
+ // Check if the socket, core, or anything above the thread level changed.
+ // If those ids did change, then restart the thread id at 0.
+ // Otherwise, set the thread id to the previous thread's id + 1.
+ for (int j = 0; j < tlevel; ++j) {
+ if (hw_thread.ids[j] != prev_hw_thread.ids[j]) {
+ hw_thread.ids[tlevel] = 0;
+ break;
+ }
+ }
+ if (hw_thread.ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID)
+ hw_thread.ids[tlevel] = prev_hw_thread.ids[tlevel] + 1;
+ }
+ }
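// Editor's sketch of the fill rule above, reduced to a single level (a core
// id) over invented rows: UNKNOWN thread ids restart at 0 whenever any level
// above the thread level changes, and otherwise continue from the previous
// row, so (0,-1),(0,-1),(1,-1) becomes (0,0),(0,1),(1,0).
#include <cstddef>
#include <vector>
struct ThreadRow { int core, thr; }; // thr == -1 stands in for UNKNOWN_ID
static void fill_thread_ids_ex(std::vector<ThreadRow> &rows) {
  if (rows[0].thr == -1)
    rows[0].thr = 0;
  for (std::size_t i = 1; i < rows.size(); ++i) {
    if (rows[i].thr != -1)
      continue; // keep ids the OS actually reported
    if (rows[i].core != rows[i - 1].core)
      rows[i].thr = 0; // new core: restart the SMT numbering
    else
      rows[i].thr = rows[i - 1].thr + 1; // same core: next SMT id
  }
}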
+
if (!__kmp_topology->check_ids()) {
kmp_topology_t::deallocate(__kmp_topology);
__kmp_topology = nullptr;
@@ -3341,16 +3921,25 @@ restart_radix_check:
// Create and return a table of affinity masks, indexed by OS thread ID.
// This routine handles OR'ing together all the affinity masks of threads
// that are sufficiently close, if granularity > fine.
-static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
- unsigned *numUnique) {
+template <typename FindNextFunctionType>
+static void __kmp_create_os_id_masks(unsigned *numUnique,
+ kmp_affinity_t &affinity,
+ FindNextFunctionType find_next) {
// First form a table of affinity masks in order of OS thread id.
int maxOsId;
int i;
int numAddrs = __kmp_topology->get_num_hw_threads();
int depth = __kmp_topology->get_depth();
+ const char *env_var = __kmp_get_affinity_env_var(affinity);
KMP_ASSERT(numAddrs);
KMP_ASSERT(depth);
+ i = find_next(-1);
+ // If we could not find a HW thread location that satisfies the find_next
+ // conditions, then return and fall back to the increment find_next.
+ if (i >= numAddrs)
+ return;
+
maxOsId = 0;
for (i = numAddrs - 1;; --i) {
int osId = __kmp_topology->at(i).os_id;
@@ -3360,14 +3949,14 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
if (i == 0)
break;
}
- kmp_affin_mask_t *osId2Mask;
- KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
- KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
- if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
- KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
+ affinity.num_os_id_masks = maxOsId + 1;
+ KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks);
+ KMP_ASSERT(affinity.gran_levels >= 0);
+ if (affinity.flags.verbose && (affinity.gran_levels > 0)) {
+ KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels);
}
- if (__kmp_affinity_gran_levels >= (int)depth) {
- KMP_AFF_WARNING(AffThreadsMayMigrate);
+ if (affinity.gran_levels >= (int)depth) {
+ KMP_AFF_WARNING(affinity, AffThreadsMayMigrate);
}
// Run through the table, forming the masks for all threads on each core.
@@ -3380,22 +3969,25 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
kmp_affin_mask_t *sum;
KMP_CPU_ALLOC_ON_STACK(sum);
KMP_CPU_ZERO(sum);
- KMP_CPU_SET(__kmp_topology->at(0).os_id, sum);
- for (i = 1; i < numAddrs; i++) {
+
+ i = j = leader = find_next(-1);
+ KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
+ kmp_full_mask_modifier_t full_mask;
+ for (i = find_next(i); i < numAddrs; i = find_next(i)) {
// If this thread is sufficiently close to the leader (within the
// granularity setting), then set the bit for this os thread in the
// affinity mask for this group, and go on to the next thread.
- if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) {
+ if (__kmp_topology->is_close(leader, i, affinity)) {
KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
continue;
}
// For every thread in this group, copy the mask to the thread's entry in
- // the osId2Mask table. Mark the first address as a leader.
- for (; j < i; j++) {
+ // the OS Id mask table. Mark the first address as a leader.
+ for (; j < i; j = find_next(j)) {
int osId = __kmp_topology->at(j).os_id;
KMP_DEBUG_ASSERT(osId <= maxOsId);
- kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
+ kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
KMP_CPU_COPY(mask, sum);
__kmp_topology->at(j).leader = (j == leader);
}
@@ -3403,25 +3995,30 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
// Start a new mask.
leader = i;
+ full_mask.include(sum);
KMP_CPU_ZERO(sum);
KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
}
// For every thread in last group, copy the mask to the thread's
- // entry in the osId2Mask table.
- for (; j < i; j++) {
+ // entry in the OS Id mask table.
+ for (; j < i; j = find_next(j)) {
int osId = __kmp_topology->at(j).os_id;
KMP_DEBUG_ASSERT(osId <= maxOsId);
- kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
+ kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
KMP_CPU_COPY(mask, sum);
__kmp_topology->at(j).leader = (j == leader);
}
+ full_mask.include(sum);
unique++;
KMP_CPU_FREE_FROM_STACK(sum);
- *maxIndex = maxOsId;
+ // See if the OS Id mask table further restricts or changes the full mask
+ if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
+ __kmp_topology->print(env_var);
+ }
+
*numUnique = unique;
- return osId2Mask;
}
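// Editor's illustration (invented 8-bit masks) of the grouping above at core
// granularity: two SMT siblings with os ids 0 and 1 are "close", so both of
// their entries in the OS-id mask table receive the union of their bits:
unsigned char core_sum_ex = (1u << 0) | (1u << 1); // 0b11, for os ids 0 and 1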
// Stuff for the affinity proclist parsers. It's easier to declare these vars
@@ -3454,7 +4051,7 @@ static int nextNewMask;
{ \
if (((_osId) > _maxOsId) || \
(!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, _osId); \
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId); \
} else { \
ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
} \
@@ -3462,12 +4059,13 @@ static int nextNewMask;
// Re-parse the proclist (for the explicit affinity type), and form the list
// of affinity newMasks indexed by gtid.
-static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
- unsigned int *out_numMasks,
- const char *proclist,
- kmp_affin_mask_t *osId2Mask,
- int maxOsId) {
+static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {
int i;
+ kmp_affin_mask_t **out_masks = &affinity.masks;
+ unsigned *out_numMasks = &affinity.num_masks;
+ const char *proclist = affinity.proclist;
+ kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
+ int maxOsId = affinity.num_os_id_masks - 1;
const char *scan = proclist;
const char *next = proclist;
@@ -3505,7 +4103,7 @@ static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
// Copy the mask for that osId to the sum (union) mask.
if ((num > maxOsId) ||
(!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, num);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
KMP_CPU_ZERO(sumMask);
} else {
KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
@@ -3537,7 +4135,7 @@ static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
// Add the mask for that osId to the sum mask.
if ((num > maxOsId) ||
(!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, num);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
} else {
KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
setSize++;
@@ -3672,10 +4270,11 @@ signed := + signed
signed := - signed
-----------------------------------------------------------------------------*/
static void __kmp_process_subplace_list(const char **scan,
- kmp_affin_mask_t *osId2Mask,
- int maxOsId, kmp_affin_mask_t *tempMask,
+ kmp_affinity_t &affinity, int maxOsId,
+ kmp_affin_mask_t *tempMask,
int *setSize) {
const char *next;
+ kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
for (;;) {
int start, count, stride, i;
@@ -3694,7 +4293,7 @@ static void __kmp_process_subplace_list(const char **scan,
if (**scan == '}' || **scan == ',') {
if ((start > maxOsId) ||
(!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, start);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
} else {
KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
(*setSize)++;
@@ -3723,7 +4322,7 @@ static void __kmp_process_subplace_list(const char **scan,
for (i = 0; i < count; i++) {
if ((start > maxOsId) ||
(!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, start);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
break; // don't proliferate warnings for large count
} else {
KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
@@ -3770,7 +4369,7 @@ static void __kmp_process_subplace_list(const char **scan,
for (i = 0; i < count; i++) {
if ((start > maxOsId) ||
(!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, start);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
break; // don't proliferate warnings for large count
} else {
KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
@@ -3789,21 +4388,22 @@ static void __kmp_process_subplace_list(const char **scan,
}
}
-static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
+static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
int maxOsId, kmp_affin_mask_t *tempMask,
int *setSize) {
const char *next;
+ kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
// valid follow sets are '{' '!' and num
SKIP_WS(*scan);
if (**scan == '{') {
(*scan)++; // skip '{'
- __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
+ __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize);
KMP_ASSERT2(**scan == '}', "bad explicit places list");
(*scan)++; // skip '}'
} else if (**scan == '!') {
(*scan)++; // skip '!'
- __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
+ __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize);
KMP_CPU_COMPLEMENT(maxOsId, tempMask);
} else if ((**scan >= '0') && (**scan <= '9')) {
next = *scan;
@@ -3812,7 +4412,7 @@ static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
KMP_ASSERT(num >= 0);
if ((num > maxOsId) ||
(!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, num);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
} else {
KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
(*setSize)++;
@@ -3824,12 +4424,13 @@ static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
}
// static void
-void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
- unsigned int *out_numMasks,
- const char *placelist,
- kmp_affin_mask_t *osId2Mask,
- int maxOsId) {
+void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {
int i, j, count, stride, sign;
+ kmp_affin_mask_t **out_masks = &affinity.masks;
+ unsigned *out_numMasks = &affinity.num_masks;
+ const char *placelist = affinity.proclist;
+ kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
+ int maxOsId = affinity.num_os_id_masks - 1;
const char *scan = placelist;
const char *next = placelist;
@@ -3849,7 +4450,7 @@ void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
int setSize = 0;
for (;;) {
- __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
+ __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize);
// valid follow sets are ',' ':' and EOL
SKIP_WS(scan);
@@ -3930,7 +4531,7 @@ void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
(!KMP_CPU_ISSET(j + stride,
KMP_CPU_INDEX(osId2Mask, j + stride)))) {
if (i < count - 1) {
- KMP_AFF_WARNING(AffIgnoreInvalidProcID, j + stride);
+ KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride);
}
continue;
}
@@ -4028,28 +4629,149 @@ static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
static int *procarr = NULL;
static int __kmp_aff_depth = 0;
+static int *__kmp_osid_to_hwthread_map = NULL;
+
+static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
+ kmp_affinity_ids_t &ids,
+ kmp_affinity_attrs_t &attrs) {
+ if (!KMP_AFFINITY_CAPABLE())
+ return;
+
+ // Initialize ids and attrs thread data
+ for (int i = 0; i < KMP_HW_LAST; ++i)
+ ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
+
+ // Iterate through each os id within the mask and determine
+ // the topology id and attribute information
+ int cpu;
+ int depth = __kmp_topology->get_depth();
+ KMP_CPU_SET_ITERATE(cpu, mask) {
+ int osid_idx = __kmp_osid_to_hwthread_map[cpu];
+ ids.os_id = cpu;
+ const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
+ for (int level = 0; level < depth; ++level) {
+ kmp_hw_t type = __kmp_topology->get_type(level);
+ int id = hw_thread.sub_ids[level];
+ if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
+ ids.ids[type] = id;
+ } else {
+ // This mask spans multiple topology units; set it as such
+ // and mark every level below as such as well.
+ ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ for (; level < depth; ++level) {
+ kmp_hw_t type = __kmp_topology->get_type(level);
+ ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ }
+ }
+ }
+ if (!attrs.valid) {
+ attrs.core_type = hw_thread.attrs.get_core_type();
+ attrs.core_eff = hw_thread.attrs.get_core_eff();
+ attrs.valid = 1;
+ } else {
+ // This mask spans multiple attributes; set it as such
+ if (attrs.core_type != hw_thread.attrs.get_core_type())
+ attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
+ if (attrs.core_eff != hw_thread.attrs.get_core_eff())
+ attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
+ }
+ }
+}
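// Editor's sketch (invented sentinels) of the per-level collapse rule above:
// an id survives only while every cpu in the mask agrees on it. The real code
// additionally forces every deeper level to MULTIPLE once a disagreement is
// seen at some level.
enum { EX_UNKNOWN = -1, EX_MULTIPLE = -2 };
static int merge_id_ex(int acc, int id) {
  if (acc == EX_UNKNOWN || acc == id)
    return id; // first cpu seen, or still within the same topology unit
  return EX_MULTIPLE; // the mask spans more than one unit at this level
}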
+
+static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
+ if (!KMP_AFFINITY_CAPABLE())
+ return;
+ const kmp_affin_mask_t *mask = th->th.th_affin_mask;
+ kmp_affinity_ids_t &ids = th->th.th_topology_ids;
+ kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
+ __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
+}
+
+// Assign the topology information to each place in the place list.
+// A thread can then grab not only its affinity mask, but also the topology
+// information associated with that mask, e.g., which socket a thread is on.
+static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
+ if (!KMP_AFFINITY_CAPABLE())
+ return;
+ if (affinity.type != affinity_none) {
+ KMP_ASSERT(affinity.num_os_id_masks);
+ KMP_ASSERT(affinity.os_id_masks);
+ }
+ KMP_ASSERT(affinity.num_masks);
+ KMP_ASSERT(affinity.masks);
+ KMP_ASSERT(__kmp_affin_fullMask);
+
+ int max_cpu = __kmp_affin_fullMask->get_max_cpu();
+ int num_hw_threads = __kmp_topology->get_num_hw_threads();
+
+ // Allocate thread topology information
+ if (!affinity.ids) {
+ affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate(
+ sizeof(kmp_affinity_ids_t) * affinity.num_masks);
+ }
+ if (!affinity.attrs) {
+ affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate(
+ sizeof(kmp_affinity_attrs_t) * affinity.num_masks);
+ }
+ if (!__kmp_osid_to_hwthread_map) {
+ // Want the +1 because max_cpu should be a valid index into the map
+ __kmp_osid_to_hwthread_map =
+ (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1));
+ }
+
+ // Create the OS proc to hardware thread map
+ for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) {
+ int os_id = __kmp_topology->at(hw_thread).os_id;
+ if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask))
+ __kmp_osid_to_hwthread_map[os_id] = hw_thread;
+ }
+
+ for (unsigned i = 0; i < affinity.num_masks; ++i) {
+ kmp_affinity_ids_t &ids = affinity.ids[i];
+ kmp_affinity_attrs_t &attrs = affinity.attrs[i];
+ kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i);
+ __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
+ }
+}
+
+// Called when __kmp_topology is ready
+static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
+ // Initialize other data structures which depend on the topology
+ if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
+ machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
+ __kmp_affinity_get_topology_info(affinity);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
+#endif
+ }
+}
// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
-static void __kmp_create_affinity_none_places() {
+static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
KMP_ASSERT(__kmp_affin_fullMask != NULL);
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
- __kmp_affinity_num_masks = 1;
- KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
- kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
+ KMP_ASSERT(affinity.type == affinity_none);
+ KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
+ affinity.num_masks = 1;
+ KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
+ kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
KMP_CPU_COPY(dest, __kmp_affin_fullMask);
+ __kmp_aux_affinity_initialize_other_data(affinity);
}
-static void __kmp_aux_affinity_initialize(void) {
- if (__kmp_affinity_masks != NULL) {
- KMP_ASSERT(__kmp_affin_fullMask != NULL);
- return;
- }
-
+static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
// Create the "full" mask - this defines all of the processors that we
// consider to be in the machine model. If respect is set, then it is the
// initialization thread's affinity mask. Otherwise, it is all processors that
// we know about on the machine.
+ int verbose = affinity.flags.verbose;
+ const char *env_var = affinity.env_var;
+
+ // Already initialized
+ if (__kmp_affin_fullMask && __kmp_affin_origMask)
+ return;
+
if (__kmp_affin_fullMask == NULL) {
KMP_CPU_ALLOC(__kmp_affin_fullMask);
}
@@ -4060,7 +4782,7 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
// Make a copy before possible expanding to the entire machine mask
__kmp_affin_origMask->copy(__kmp_affin_fullMask);
- if (__kmp_affinity_respect_mask) {
+ if (affinity.flags.respect) {
// Count the number of available processors.
unsigned i;
__kmp_avail_proc = 0;
@@ -4071,24 +4793,24 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_avail_proc++;
}
if (__kmp_avail_proc > __kmp_xproc) {
- KMP_AFF_WARNING(ErrorInitializeAffinity);
- __kmp_affinity_type = affinity_none;
+ KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
+ affinity.type = affinity_none;
KMP_AFFINITY_DISABLE();
return;
}
- if (__kmp_affinity_verbose) {
+ if (verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
__kmp_affin_fullMask);
- KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+ KMP_INFORM(InitOSProcSetRespect, env_var, buf);
}
} else {
- if (__kmp_affinity_verbose) {
+ if (verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
__kmp_affin_fullMask);
- KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+ KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
}
__kmp_avail_proc =
__kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
@@ -4103,8 +4825,13 @@ static void __kmp_aux_affinity_initialize(void) {
#endif
}
}
+}
+static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
+ bool success = false;
+ const char *env_var = affinity.env_var;
kmp_i18n_id_t msg_id = kmp_i18n_null;
+ int verbose = affinity.flags.verbose;
// For backward compatibility, setting KMP_CPUINFO_FILE =>
// KMP_TOPOLOGY_METHOD=cpuinfo
@@ -4113,7 +4840,6 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affinity_top_method = affinity_top_method_cpuinfo;
}
- bool success = false;
if (__kmp_affinity_top_method == affinity_top_method_all) {
// In the default code path, errors are not fatal - we just try using
// another method. We only emit a warning message if affinity is on, or the
@@ -4123,11 +4849,11 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
if (!__kmp_hwloc_error) {
success = __kmp_affinity_create_hwloc_map(&msg_id);
- if (!success && __kmp_affinity_verbose) {
- KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+ if (!success && verbose) {
+ KMP_INFORM(AffIgnoringHwloc, env_var);
}
- } else if (__kmp_affinity_verbose) {
- KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+ } else if (verbose) {
+ KMP_INFORM(AffIgnoringHwloc, env_var);
}
}
#endif
@@ -4135,24 +4861,24 @@ static void __kmp_aux_affinity_initialize(void) {
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
if (!success) {
success = __kmp_affinity_create_x2apicid_map(&msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
}
if (!success) {
success = __kmp_affinity_create_apicid_map(&msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
-#if KMP_OS_LINUX
+#if KMP_OS_LINUX || KMP_OS_AIX
if (!success) {
int line = 0;
success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
}
#endif /* KMP_OS_LINUX */
@@ -4160,16 +4886,16 @@ static void __kmp_aux_affinity_initialize(void) {
#if KMP_GROUP_AFFINITY
if (!success && (__kmp_num_proc_groups > 1)) {
success = __kmp_affinity_create_proc_group_map(&msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
}
#endif /* KMP_GROUP_AFFINITY */
if (!success) {
success = __kmp_affinity_create_flat_map(&msg_id);
- if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
+ if (!success && verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
}
KMP_ASSERT(success);
}
@@ -4241,130 +4967,187 @@ static void __kmp_aux_affinity_initialize(void) {
// Early exit if topology could not be created
if (!__kmp_topology) {
if (KMP_AFFINITY_CAPABLE()) {
- KMP_AFF_WARNING(ErrorInitializeAffinity);
+ KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
}
if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
__kmp_ncores > 0) {
__kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
__kmp_topology->canonicalize(nPackages, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
- if (__kmp_affinity_verbose) {
- __kmp_topology->print("KMP_AFFINITY");
+ if (verbose) {
+ __kmp_topology->print(env_var);
}
}
- __kmp_affinity_type = affinity_none;
- __kmp_create_affinity_none_places();
-#if KMP_USE_HIER_SCHED
- __kmp_dispatch_set_hierarchy_values();
-#endif
- KMP_AFFINITY_DISABLE();
- return;
+ return false;
}
- // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and
- // initialize other data structures which depend on the topology
+ // Canonicalize, print (if requested), apply KMP_HW_SUBSET
__kmp_topology->canonicalize();
- if (__kmp_affinity_verbose)
- __kmp_topology->print("KMP_AFFINITY");
+ if (verbose)
+ __kmp_topology->print(env_var);
bool filtered = __kmp_topology->filter_hw_subset();
- if (filtered) {
-#if KMP_OS_WINDOWS
- // Copy filtered full mask if topology has single processor group
- if (__kmp_num_proc_groups <= 1)
-#endif
- __kmp_affin_origMask->copy(__kmp_affin_fullMask);
- }
- if (filtered && __kmp_affinity_verbose)
+ if (filtered && verbose)
__kmp_topology->print("KMP_HW_SUBSET");
- machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
- KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
+ return success;
+}
+
+static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
+ bool is_regular_affinity = (&affinity == &__kmp_affinity);
+ bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
+ const char *env_var = __kmp_get_affinity_env_var(affinity);
+
+ if (affinity.flags.initialized) {
+ KMP_ASSERT(__kmp_affin_fullMask != NULL);
+ return;
+ }
+
+ if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask))
+ __kmp_aux_affinity_initialize_masks(affinity);
+
+ if (is_regular_affinity && !__kmp_topology) {
+ bool success = __kmp_aux_affinity_initialize_topology(affinity);
+ if (success) {
+ KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
+ } else {
+ affinity.type = affinity_none;
+ KMP_AFFINITY_DISABLE();
+ }
+ }
+
// If KMP_AFFINITY=none, then only create the single "none" place
// which is the process's initial affinity mask or the number of
// hardware threads, depending on the respect,norespect setting
- if (__kmp_affinity_type == affinity_none) {
- __kmp_create_affinity_none_places();
+ if (affinity.type == affinity_none) {
+ __kmp_create_affinity_none_places(affinity);
#if KMP_USE_HIER_SCHED
__kmp_dispatch_set_hierarchy_values();
#endif
+ affinity.flags.initialized = TRUE;
return;
}
+
+ __kmp_topology->set_granularity(affinity);
int depth = __kmp_topology->get_depth();
// Create the table of masks, indexed by thread Id.
- unsigned maxIndex;
- unsigned numUnique;
- kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique);
- if (__kmp_affinity_gran_levels == 0) {
- KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
- }
-
- switch (__kmp_affinity_type) {
+ unsigned numUnique = 0;
+ int numAddrs = __kmp_topology->get_num_hw_threads();
+ // If OMP_PLACES=cores:<attribute> was specified, then attempt
+ // to build the OS Id mask table using those attributes
+ if (affinity.core_attr_gran.valid) {
+ __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) {
+ KMP_ASSERT(idx >= -1);
+ for (int i = idx + 1; i < numAddrs; ++i)
+ if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
+ return i;
+ return numAddrs;
+ });
+ if (!affinity.os_id_masks) {
+ const char *core_attribute;
+ if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF)
+ core_attribute = "core_efficiency";
+ else
+ core_attribute = "core_type";
+ KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var,
+ core_attribute,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true))
+ }
+ }
+ // If core attributes did not work, or none were specified, then build
+ // the OS Id mask table in the typical incremental way, checking each id
+ // for validity at the specified granularity level.
+ if (!affinity.os_id_masks) {
+ int gran = affinity.gran_levels;
+ int gran_level = depth - 1 - affinity.gran_levels;
+ if (gran >= 0 && gran_level >= 0 && gran_level < depth) {
+ __kmp_create_os_id_masks(
+ &numUnique, affinity, [depth, numAddrs, &affinity](int idx) {
+ KMP_ASSERT(idx >= -1);
+ int gran = affinity.gran_levels;
+ int gran_level = depth - 1 - affinity.gran_levels;
+ for (int i = idx + 1; i < numAddrs; ++i)
+ if ((gran >= depth) ||
+ (gran < depth && __kmp_topology->at(i).ids[gran_level] !=
+ kmp_hw_thread_t::UNKNOWN_ID))
+ return i;
+ return numAddrs;
+ });
+ }
+ }
+ // Final attempt: build the OS Id mask table in the typical incremental way.
+ if (!affinity.os_id_masks) {
+ __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
+ KMP_ASSERT(idx >= -1);
+ return idx + 1;
+ });
+ }
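// Editor's demo (invented predicate) of the find_next contract shared by all
// three attempts above: find_next(-1) yields the first eligible hw thread
// index, and returning numAddrs signals "nothing eligible", letting the
// caller fall through to the next, less restrictive attempt:
int num_addrs_ex = 8;
auto find_next_even_ex = [num_addrs_ex](int idx) {
  for (int i = idx + 1; i < num_addrs_ex; ++i)
    if (i % 2 == 0) // invented eligibility test
      return i;
  return num_addrs_ex; // sentinel: none found
};
int first_ex = find_next_even_ex(-1); // -> 0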
+
+ switch (affinity.type) {
case affinity_explicit:
- KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
- if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
- __kmp_affinity_process_proclist(
- &__kmp_affinity_masks, &__kmp_affinity_num_masks,
- __kmp_affinity_proclist, osId2Mask, maxIndex);
+ KMP_DEBUG_ASSERT(affinity.proclist != NULL);
+ if (is_hidden_helper_affinity ||
+ __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
+ __kmp_affinity_process_proclist(affinity);
} else {
- __kmp_affinity_process_placelist(
- &__kmp_affinity_masks, &__kmp_affinity_num_masks,
- __kmp_affinity_proclist, osId2Mask, maxIndex);
- }
- if (__kmp_affinity_num_masks == 0) {
- KMP_AFF_WARNING(AffNoValidProcID);
- __kmp_affinity_type = affinity_none;
- __kmp_create_affinity_none_places();
+ __kmp_affinity_process_placelist(affinity);
+ }
+ if (affinity.num_masks == 0) {
+ KMP_AFF_WARNING(affinity, AffNoValidProcID);
+ affinity.type = affinity_none;
+ __kmp_create_affinity_none_places(affinity);
+ affinity.flags.initialized = TRUE;
return;
}
break;
// The other affinity types rely on sorting the hardware threads according to
- // some permutation of the machine topology tree. Set __kmp_affinity_compact
- // and __kmp_affinity_offset appropriately, then jump to a common code
+ // some permutation of the machine topology tree. Set affinity.compact
+ // and affinity.offset appropriately, then jump to a common code
// fragment to do the sort and create the array of affinity masks.
case affinity_logical:
- __kmp_affinity_compact = 0;
- if (__kmp_affinity_offset) {
- __kmp_affinity_offset =
- __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
+ affinity.compact = 0;
+ if (affinity.offset) {
+ affinity.offset =
+ __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
}
goto sortTopology;
case affinity_physical:
if (__kmp_nThreadsPerCore > 1) {
- __kmp_affinity_compact = 1;
- if (__kmp_affinity_compact >= depth) {
- __kmp_affinity_compact = 0;
+ affinity.compact = 1;
+ if (affinity.compact >= depth) {
+ affinity.compact = 0;
}
} else {
- __kmp_affinity_compact = 0;
+ affinity.compact = 0;
}
- if (__kmp_affinity_offset) {
- __kmp_affinity_offset =
- __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
+ if (affinity.offset) {
+ affinity.offset =
+ __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
}
goto sortTopology;
case affinity_scatter:
- if (__kmp_affinity_compact >= depth) {
- __kmp_affinity_compact = 0;
+ if (affinity.compact >= depth) {
+ affinity.compact = 0;
} else {
- __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
+ affinity.compact = depth - 1 - affinity.compact;
}
goto sortTopology;
case affinity_compact:
- if (__kmp_affinity_compact >= depth) {
- __kmp_affinity_compact = depth - 1;
+ if (affinity.compact >= depth) {
+ affinity.compact = depth - 1;
}
goto sortTopology;
case affinity_balanced:
- if (depth <= 1) {
- KMP_AFF_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
- __kmp_affinity_type = affinity_none;
- __kmp_create_affinity_none_places();
+ if (depth <= 1 || is_hidden_helper_affinity) {
+ KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
+ affinity.type = affinity_none;
+ __kmp_create_affinity_none_places(affinity);
+ affinity.flags.initialized = TRUE;
return;
} else if (!__kmp_topology->is_uniform()) {
// Save the depth for further usage
@@ -4379,8 +5162,10 @@ static void __kmp_aux_affinity_initialize(void) {
int nproc = ncores * maxprocpercore;
if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
- KMP_AFF_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
- __kmp_affinity_type = affinity_none;
+ KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
+ affinity.type = affinity_none;
+ __kmp_create_affinity_none_places(affinity);
+ affinity.flags.initialized = TRUE;
return;
}
@@ -4405,48 +5190,57 @@ static void __kmp_aux_affinity_initialize(void) {
procarr[core * maxprocpercore + inlastcore] = proc;
}
}
- if (__kmp_affinity_compact >= depth) {
- __kmp_affinity_compact = depth - 1;
+ if (affinity.compact >= depth) {
+ affinity.compact = depth - 1;
}
sortTopology:
// Allocate the gtid->affinity mask table.
- if (__kmp_affinity_dups) {
- __kmp_affinity_num_masks = __kmp_avail_proc;
+ if (affinity.flags.dups) {
+ affinity.num_masks = __kmp_avail_proc;
} else {
- __kmp_affinity_num_masks = numUnique;
+ affinity.num_masks = numUnique;
}
if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
(__kmp_affinity_num_places > 0) &&
- ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
- __kmp_affinity_num_masks = __kmp_affinity_num_places;
+ ((unsigned)__kmp_affinity_num_places < affinity.num_masks) &&
+ !is_hidden_helper_affinity) {
+ affinity.num_masks = __kmp_affinity_num_places;
}
- KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
+ KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
// Sort the topology table according to the current setting of
- // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
- __kmp_topology->sort_compact();
+ // affinity.compact, then fill out affinity.masks.
+ __kmp_topology->sort_compact(affinity);
{
int i;
unsigned j;
int num_hw_threads = __kmp_topology->get_num_hw_threads();
+ kmp_full_mask_modifier_t full_mask;
for (i = 0, j = 0; i < num_hw_threads; i++) {
- if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) {
+ if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
continue;
}
int osId = __kmp_topology->at(i).os_id;
- kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
- kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
+ kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
+ if (KMP_CPU_ISEMPTY(src))
+ continue;
+ kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
KMP_ASSERT(KMP_CPU_ISSET(osId, src));
KMP_CPU_COPY(dest, src);
- if (++j >= __kmp_affinity_num_masks) {
+ full_mask.include(src);
+ if (++j >= affinity.num_masks) {
break;
}
}
- KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
+ KMP_DEBUG_ASSERT(j == affinity.num_masks);
+ // See if the places list further restricts or changes the full mask
+ if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
+ __kmp_topology->print(env_var);
+ }
}
// Sort the topology back using ids
__kmp_topology->sort_ids();
@@ -4455,56 +5249,64 @@ static void __kmp_aux_affinity_initialize(void) {
default:
KMP_ASSERT2(0, "Unexpected affinity setting");
}
-
- KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
+ __kmp_aux_affinity_initialize_other_data(affinity);
+ affinity.flags.initialized = TRUE;
}
-void __kmp_affinity_initialize(void) {
+void __kmp_affinity_initialize(kmp_affinity_t &affinity) {
// Much of the code above was written assuming that if a machine was not
- // affinity capable, then __kmp_affinity_type == affinity_none. We now
- // explicitly represent this as __kmp_affinity_type == affinity_disabled.
- // There are too many checks for __kmp_affinity_type == affinity_none
- // in this code. Instead of trying to change them all, check if
- // __kmp_affinity_type == affinity_disabled, and if so, slam it with
- // affinity_none, call the real initialization routine, then restore
- // __kmp_affinity_type to affinity_disabled.
- int disabled = (__kmp_affinity_type == affinity_disabled);
- if (!KMP_AFFINITY_CAPABLE()) {
+ // affinity capable, then affinity type == affinity_none.
+ // We now explicitly represent this as affinity type == affinity_disabled.
+ // There are too many checks for affinity type == affinity_none in this code.
+ // Instead of trying to change them all, check if
+ // affinity type == affinity_disabled, and if so, slam it with affinity_none,
+ // call the real initialization routine, then restore affinity type to
+ // affinity_disabled.
+ int disabled = (affinity.type == affinity_disabled);
+ if (!KMP_AFFINITY_CAPABLE())
KMP_ASSERT(disabled);
- }
- if (disabled) {
- __kmp_affinity_type = affinity_none;
- }
- __kmp_aux_affinity_initialize();
- if (disabled) {
- __kmp_affinity_type = affinity_disabled;
- }
+ if (disabled)
+ affinity.type = affinity_none;
+ __kmp_aux_affinity_initialize(affinity);
+ if (disabled)
+ affinity.type = affinity_disabled;
}
void __kmp_affinity_uninitialize(void) {
- if (__kmp_affinity_masks != NULL) {
- KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
- __kmp_affinity_masks = NULL;
- }
- if (__kmp_affin_fullMask != NULL) {
- KMP_CPU_FREE(__kmp_affin_fullMask);
- __kmp_affin_fullMask = NULL;
+ for (kmp_affinity_t *affinity : __kmp_affinities) {
+ if (affinity->masks != NULL)
+ KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
+ if (affinity->os_id_masks != NULL)
+ KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
+ if (affinity->proclist != NULL)
+ __kmp_free(affinity->proclist);
+ if (affinity->ids != NULL)
+ __kmp_free(affinity->ids);
+ if (affinity->attrs != NULL)
+ __kmp_free(affinity->attrs);
+ *affinity = KMP_AFFINITY_INIT(affinity->env_var);
}
if (__kmp_affin_origMask != NULL) {
+ if (KMP_AFFINITY_CAPABLE()) {
+#if KMP_OS_AIX
+ // Uninitialize by unbinding the thread.
+ bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
+#else
+ __kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
+#endif
+ }
KMP_CPU_FREE(__kmp_affin_origMask);
__kmp_affin_origMask = NULL;
}
- __kmp_affinity_num_masks = 0;
- __kmp_affinity_type = affinity_default;
__kmp_affinity_num_places = 0;
- if (__kmp_affinity_proclist != NULL) {
- __kmp_free(__kmp_affinity_proclist);
- __kmp_affinity_proclist = NULL;
- }
if (procarr != NULL) {
__kmp_free(procarr);
procarr = NULL;
}
+ if (__kmp_osid_to_hwthread_map) {
+ __kmp_free(__kmp_osid_to_hwthread_map);
+ __kmp_osid_to_hwthread_map = NULL;
+ }
#if KMP_USE_HWLOC
if (__kmp_hwloc_topology != NULL) {
hwloc_topology_destroy(__kmp_hwloc_topology);
@@ -4522,12 +5324,36 @@ void __kmp_affinity_uninitialize(void) {
KMPAffinity::destroy_api();
}
+static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
+ int *place, kmp_affin_mask_t **mask) {
+ int mask_idx;
+ bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
+ if (is_hidden_helper)
+ // The first gtid is the regular primary thread, the second gtid is the
+ // main thread of the hidden team, which does not participate in task
+ // execution.
+ mask_idx = gtid - 2;
+ else
+ mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
+ KMP_DEBUG_ASSERT(affinity->num_masks > 0);
+ *place = (mask_idx + affinity->offset) % affinity->num_masks;
+ *mask = KMP_CPU_INDEX(affinity->masks, *place);
+}
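// Editor's worked example (invented numbers) of the selection above: with 4
// places and offset 1, a regular worker whose adjusted gtid is 6 lands on
// place (6 + 1) % 4 == 3; a hidden helper with gtid 5 would instead use
// mask_idx 5 - 2 == 3 and land on place (3 + 1) % 4 == 0.
int num_masks_ex = 4, offset_ex = 1, mask_idx_ex = 6;
int place_ex = (mask_idx_ex + offset_ex) % num_masks_ex; // -> 3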
+
+// This function initializes the per-thread data concerning affinity including
+// the mask and topology information
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
+
+ kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+
+ // Set the thread topology information to default of unknown
+ for (int id = 0; id < KMP_HW_LAST; ++id)
+ th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
+ th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
+
if (!KMP_AFFINITY_CAPABLE()) {
return;
}
- kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
if (th->th.th_affin_mask == NULL) {
KMP_CPU_ALLOC(th->th.th_affin_mask);
} else {
@@ -4535,16 +5361,24 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
}
// Copy the thread mask to the kmp_info_t structure. If
- // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
- // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
- // then the full mask is the same as the mask of the initialization thread.
+ // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e.
+ // one that has all of the OS proc ids set, or if
+ // __kmp_affinity.flags.respect is set, then the full mask is the
+ // same as the mask of the initialization thread.
kmp_affin_mask_t *mask;
int i;
+ const kmp_affinity_t *affinity;
+ bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
+
+ if (is_hidden_helper)
+ affinity = &__kmp_hh_affinity;
+ else
+ affinity = &__kmp_affinity;
- if (KMP_AFFINITY_NON_PROC_BIND) {
- if ((__kmp_affinity_type == affinity_none) ||
- (__kmp_affinity_type == affinity_balanced) ||
- KMP_HIDDEN_HELPER_THREAD(gtid)) {
+ if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
+ if ((affinity->type == affinity_none) ||
+ (affinity->type == affinity_balanced) ||
+ KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
#if KMP_GROUP_AFFINITY
if (__kmp_num_proc_groups > 1) {
return;
@@ -4554,14 +5388,10 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
i = 0;
mask = __kmp_affin_fullMask;
} else {
- int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
- KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
- i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
- mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
+ __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
}
} else {
- if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) ||
- (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
+ if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) {
#if KMP_GROUP_AFFINITY
if (__kmp_num_proc_groups > 1) {
return;
@@ -4571,85 +5401,94 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
i = KMP_PLACE_ALL;
mask = __kmp_affin_fullMask;
} else {
- // int i = some hash function or just a counter that doesn't
- // always start at 0. Use adjusted gtid for now.
- int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
- KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
- i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
- mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
+ __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
}
}
th->th.th_current_place = i;
- if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) {
+ if (isa_root && !is_hidden_helper) {
th->th.th_new_place = i;
th->th.th_first_place = 0;
- th->th.th_last_place = __kmp_affinity_num_masks - 1;
+ th->th.th_last_place = affinity->num_masks - 1;
} else if (KMP_AFFINITY_NON_PROC_BIND) {
// When using a Non-OMP_PROC_BIND affinity method,
// set all threads' place-partition-var to the entire place list
th->th.th_first_place = 0;
- th->th.th_last_place = __kmp_affinity_num_masks - 1;
+ th->th.th_last_place = affinity->num_masks - 1;
+ }
+ // Copy topology information associated with the place
+ if (i >= 0) {
+ th->th.th_topology_ids = __kmp_affinity.ids[i];
+ th->th.th_topology_attrs = __kmp_affinity.attrs[i];
}
if (i == KMP_PLACE_ALL) {
- KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
+ KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n",
gtid));
} else {
- KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
+ KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n",
gtid, i));
}
KMP_CPU_COPY(th->th.th_affin_mask, mask);
+}
- if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid)
- /* to avoid duplicate printing (will be correctly printed on barrier) */
- && (__kmp_affinity_type == affinity_none ||
- (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
- char buf[KMP_AFFIN_MASK_PRINT_LEN];
- __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
- th->th.th_affin_mask);
- KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
- __kmp_gettid(), gtid, buf);
+void __kmp_affinity_bind_init_mask(int gtid) {
+ if (!KMP_AFFINITY_CAPABLE()) {
+ return;
}
+ kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+ const kmp_affinity_t *affinity;
+ const char *env_var;
+ bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
-#if KMP_DEBUG
- // Hidden helper thread affinity only printed for debug builds
- if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) {
+ if (is_hidden_helper)
+ affinity = &__kmp_hh_affinity;
+ else
+ affinity = &__kmp_affinity;
+ env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true);
+ /* to avoid duplicate printing (will be correctly printed on barrier) */
+ if (affinity->flags.verbose && (affinity->type == affinity_none ||
+ (th->th.th_current_place != KMP_PLACE_ALL &&
+ affinity->type != affinity_balanced)) &&
+ !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
th->th.th_affin_mask);
- KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)",
- (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
+ KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
+ gtid, buf);
}
-#endif
#if KMP_OS_WINDOWS
// On Windows* OS, the process affinity mask might have changed. If the user
// didn't request affinity and this call fails, just continue silently.
// See CQ171393.
- if (__kmp_affinity_type == affinity_none) {
+ if (affinity->type == affinity_none) {
__kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
} else
#endif
+#ifndef KMP_OS_AIX
+ // Do not set the full mask as the init mask on AIX.
__kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
+#endif
}
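
The tail of __kmp_affinity_bind_init_mask encodes two platform special cases: on Windows the bind may fail silently when the user requested no affinity, and on AIX the full mask is never pushed as an init mask. A standalone sketch of that dispatch (set_system_affinity and bind_init_mask are hypothetical stand-ins):

    #include <cstdio>

    // Hypothetical stand-in for __kmp_set_system_affinity(mask, abort_on_error).
    static int set_system_affinity(bool abort_on_error) {
      std::printf("bind thread (abort_on_error=%d)\n", abort_on_error);
      return 0;
    }

    // Mirrors the dispatch above: a soft, may-fail bind on Windows when no
    // affinity was requested; no init-mask bind at all on AIX; a hard bind
    // everywhere else.
    static void bind_init_mask(bool type_is_none) {
      (void)type_is_none; // only consulted on Windows
    #if defined(_WIN32)
      if (type_is_none) {
        set_system_affinity(/*abort_on_error=*/false); // may fail silently
      } else
    #endif
    #if !defined(_AIX)
        set_system_affinity(/*abort_on_error=*/true);
    #endif
    }

    int main() {
      bind_init_mask(/*type_is_none=*/true);
      bind_init_mask(/*type_is_none=*/false);
      return 0;
    }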
-void __kmp_affinity_set_place(int gtid) {
- if (!KMP_AFFINITY_CAPABLE()) {
+void __kmp_affinity_bind_place(int gtid) {
+ // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
+ if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
return;
}
kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
- KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
+ KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current "
"place = %d)\n",
gtid, th->th.th_new_place, th->th.th_current_place));
// Check that the new place is within this thread's partition.
KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
KMP_ASSERT(th->th.th_new_place >= 0);
- KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
+ KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
if (th->th.th_first_place <= th->th.th_last_place) {
KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
(th->th.th_new_place <= th->th.th_last_place));
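
The assertion above covers a contiguous place partition (first_place <= last_place); the branch elided by this hunk handles the wrapped case, where the partition spans the end of the place list. A standalone model of the membership test for both partition shapes (place_in_partition is a hypothetical name):

    #include <cassert>

    // Contiguous partitions (first <= last) are a plain range check; wrapped
    // partitions accept places on either side of the wrap point.
    static bool place_in_partition(int place, int first, int last) {
      if (first <= last)
        return place >= first && place <= last;
      return place >= first || place <= last; // wrapped partition
    }

    int main() {
      assert(place_in_partition(2, 1, 3));  // contiguous {1,2,3}
      assert(place_in_partition(0, 6, 1));  // wrapped {6,7,0,1} of 8 places
      assert(!place_in_partition(4, 6, 1));
      return 0;
    }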
@@ -4661,11 +5500,11 @@ void __kmp_affinity_set_place(int gtid) {
// Copy the thread mask to the kmp_info_t structure,
// and set this thread's affinity.
kmp_affin_mask_t *mask =
- KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
+ KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
KMP_CPU_COPY(th->th.th_affin_mask, mask);
th->th.th_current_place = th->th.th_new_place;
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
th->th.th_affin_mask);
@@ -4733,7 +5572,7 @@ int __kmp_aux_set_affinity(void **mask) {
th->th.th_current_place = KMP_PLACE_UNDEFINED;
th->th.th_new_place = KMP_PLACE_UNDEFINED;
th->th.th_first_place = 0;
- th->th.th_last_place = __kmp_affinity_num_masks - 1;
+ th->th.th_last_place = __kmp_affinity.num_masks - 1;
  // Turn off 4.0 affinity for the current thread at this parallel level.
th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
@@ -4744,7 +5583,7 @@ int __kmp_aux_set_affinity(void **mask) {
int __kmp_aux_get_affinity(void **mask) {
int gtid;
int retval;
-#if KMP_OS_WINDOWS || KMP_DEBUG
+#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
kmp_info_t *th;
#endif
if (!KMP_AFFINITY_CAPABLE()) {
@@ -4752,7 +5591,7 @@ int __kmp_aux_get_affinity(void **mask) {
}
gtid = __kmp_entry_gtid();
-#if KMP_OS_WINDOWS || KMP_DEBUG
+#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
th = __kmp_threads[gtid];
#else
(void)gtid; // unused variable
@@ -4775,7 +5614,7 @@ int __kmp_aux_get_affinity(void **mask) {
}
}
-#if !KMP_OS_WINDOWS
+#if !KMP_OS_WINDOWS && !KMP_OS_AIX
retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
KA_TRACE(
@@ -4795,7 +5634,7 @@ int __kmp_aux_get_affinity(void **mask) {
KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
return 0;
-#endif /* KMP_OS_WINDOWS */
+#endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */
}
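
The widened preprocessor guards mean that Windows and AIX now both report the runtime's cached per-thread mask, while other systems query the live OS affinity. A standalone model of the two report paths (get_affinity is a hypothetical stand-in):

    #include <bitset>
    #include <cstdio>

    using mask_t = std::bitset<8>; // stand-in for kmp_affin_mask_t

    // Platforms without a usable per-thread query (Windows, AIX) return the
    // runtime's cached mask; others return what the OS reports.
    static mask_t get_affinity(bool os_query_available, mask_t cached,
                               mask_t os_reported) {
      if (!os_query_available)
        return cached;
      return os_reported;
    }

    int main() {
      mask_t cached{0b0011}, os{0b0001};
      std::printf("no query: %s\n",
                  get_affinity(false, cached, os).to_string().c_str());
      std::printf("queried:  %s\n",
                  get_affinity(true, cached, os).to_string().c_str());
      return 0;
    }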
int __kmp_aux_get_affinity_max_proc() {
@@ -4908,17 +5747,40 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+// Returns the first os proc id with an ATOM core
+int __kmp_get_first_osid_with_ecore(void) {
+ int low = 0;
+ int high = __kmp_topology->get_num_hw_threads() - 1;
+ int mid = 0;
+ while (high - low > 1) {
+ mid = (high + low) / 2;
+ if (__kmp_topology->at(mid).attrs.get_core_type() ==
+ KMP_HW_CORE_TYPE_CORE) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
+ return mid;
+ }
+ return -1;
+}
+#endif
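
The new function assumes the topology's hardware threads are partitioned with all CORE-type entries ahead of any ATOM-type entries, so the CORE/ATOM boundary can be located by binary search. A standalone model of that boundary search (this sketch converges low onto the first non-CORE slot over half-open bounds, whereas the in-tree version probes mid with inclusive bounds):

    #include <cassert>

    enum core_type_t { CORE, ATOM };

    // Assumes the array is partitioned with all CORE entries before any ATOM
    // entries; returns the index of the first ATOM entry, or -1 if none.
    static int first_atom_index(const core_type_t *types, int n) {
      int low = 0, high = n; // half-open [low, high)
      while (low < high) {
        int mid = low + (high - low) / 2;
        if (types[mid] == CORE)
          low = mid + 1;
        else
          high = mid;
      }
      return (low < n && types[low] == ATOM) ? low : -1;
    }

    int main() {
      const core_type_t t[] = {CORE, CORE, CORE, ATOM, ATOM};
      assert(first_atom_index(t, 5) == 3);
      const core_type_t all_core[] = {CORE, CORE};
      assert(first_atom_index(all_core, 2) == -1);
      return 0;
    }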
+
// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_DEBUG_ASSERT(th);
bool fine_gran = true;
int tid = th->th.th_info.ds.ds_tid;
+ const char *env_var = "KMP_AFFINITY";
// Do not perform balanced affinity for the hidden helper threads
if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
return;
- switch (__kmp_affinity_gran) {
+ switch (__kmp_affinity.gran) {
case KMP_HW_THREAD:
break;
case KMP_HW_CORE:
@@ -4976,12 +5838,13 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_CPU_SET(osID, mask);
}
}
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
- KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
- __kmp_gettid(), tid, buf);
+ KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
+ tid, buf);
}
+ __kmp_affinity_get_thread_topology_info(th);
__kmp_set_system_affinity(mask, TRUE);
} else { // Non-uniform topology
@@ -5142,17 +6005,19 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
__kmp_free(newarr);
}
- if (__kmp_affinity_verbose) {
+ if (__kmp_affinity.flags.verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
- KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
- __kmp_gettid(), tid, buf);
+ KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
+ tid, buf);
}
+ __kmp_affinity_get_thread_topology_info(th);
__kmp_set_system_affinity(mask, TRUE);
}
}
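
For coarser granularities, the balanced scheme widens a thread's mask from a single hardware thread to every OS proc in the enclosing unit, as in the KMP_CPU_SET loop above. A standalone model using std::bitset (build_mask and the osID values are hypothetical):

    #include <bitset>
    #include <cstdio>
    #include <vector>

    using mask_t = std::bitset<64>; // stand-in for kmp_affin_mask_t

    // Fine granularity pins to a single OS proc; core granularity ORs in
    // every OS proc that shares the chosen core, mirroring the KMP_CPU_SET
    // loop in the uniform-topology branch.
    static mask_t build_mask(bool fine_gran, int chosen_osid,
                             const std::vector<int> &core_osids) {
      mask_t mask;
      if (fine_gran)
        mask.set(chosen_osid);
      else
        for (int osid : core_osids)
          mask.set(osid);
      return mask;
    }

    int main() {
      const std::vector<int> core = {4, 5}; // hypothetical SMT siblings
      std::printf("fine:   %s\n", build_mask(true, 4, core).to_string().c_str());
      std::printf("coarse: %s\n", build_mask(false, 4, core).to_string().c_str());
      return 0;
    }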
-#if KMP_OS_LINUX || KMP_OS_FREEBSD
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX
// We don't need this entry for Windows because
 // there is the GetProcessAffinityMask() API
//
@@ -5187,7 +6052,11 @@ extern "C"
"set full mask for thread %d\n",
gtid));
KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
+#if KMP_OS_AIX
+ return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
+#else
return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
+#endif
}
#endif
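
On AIX there is no full-mask equivalent for resetting a thread, so the new branch calls bindprocessor() with PROCESSOR_CLASS_ANY to clear any prior binding. A minimal AIX-only sketch of that call (unbind_self is a hypothetical wrapper; headers are the usual AIX locations for these declarations):

    // AIX-only sketch: unbind the calling kernel thread so it may run on
    // any processor, mirroring the BINDTHREAD call above.
    #include <sys/processor.h> // bindprocessor(), PROCESSOR_CLASS_ANY
    #include <sys/thread.h>    // thread_self()

    int unbind_self(void) {
      // BINDTHREAD + PROCESSOR_CLASS_ANY clears the thread's processor
      // binding; returns 0 on success, -1 with errno set on failure.
      return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
    }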