author    | thegeorg <thegeorg@yandex-team.ru> | 2022-06-03 10:53:07 +0300
committer | thegeorg <thegeorg@yandex-team.ru> | 2022-06-03 10:53:07 +0300
commit    | a1d4361e379e2c72a469ad1bd64569cbc2db131f (patch)
tree      | 0caddb240a10132376e4653a31578e117d33f9fd /contrib/libs/cxxsupp/openmp/kmp_affinity.cpp
parent    | 41f55a521834080d9d703c099c0418cfff3a0546 (diff)
download  | ydb-a1d4361e379e2c72a469ad1bd64569cbc2db131f.tar.gz
Update contrib/libs/cxxsupp/openmp to 14.0.4
ref:77c6cdda99b217d50c4deadca11f5611fa0dc168
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_affinity.cpp')
-rw-r--r-- | contrib/libs/cxxsupp/openmp/kmp_affinity.cpp | 718
1 file changed, 667 insertions, 51 deletions
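Most of the patch below adds hybrid-CPU awareness: each hardware thread now carries a core type and core efficiency (detected via hwloc cpukinds or CPUID leaf 0x1A), and KMP_HW_SUBSET can filter on those attributes. As a rough, hedged sketch of the CPUID.1A probe that the new __kmp_get_hybrid_info() in the diff performs — using the compiler's <cpuid.h> helper rather than the runtime's internal __kmp_x86_cpuid / __kmp_extract_bits wrappers, with illustrative names throughout:

```cpp
// Hedged sketch (not the runtime's internal API): standalone version of the
// CPUID.1A core-type probe performed by __kmp_get_hybrid_info() in the diff.
// Assumes x86/x86_64 with GCC or Clang; like the runtime, the calling thread
// should already be pinned to the CPU it wants to classify.
#include <cpuid.h>
#include <cstdio>

enum class core_kind { unknown, atom, core };

static core_kind probe_core_kind() {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  // Leaf 0x1A is reported only on hybrid parts; unsupported leaves return 0.
  if (!__get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx))
    return core_kind::unknown;
  unsigned type = (eax >> 24) & 0xff; // EAX bits 31:24 = native core type
  if (type == 0x20)                   // Intel Atom (efficiency core)
    return core_kind::atom;
  if (type == 0x40)                   // Intel Core (performance core)
    return core_kind::core;
  return core_kind::unknown;
}

int main() {
  switch (probe_core_kind()) {
  case core_kind::atom: std::puts("Intel Atom(R) processor"); break;
  case core_kind::core: std::puts("Intel(R) Core(TM) processor"); break;
  default:              std::puts("unknown"); break;
  }
}
```

On this detection path the runtime simply maps Atom cores to efficiency 0 and Core cores to efficiency 1, as the switch in __kmp_get_hybrid_info() shows further down in the diff.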
diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp index 8b40bd7ecd..414a27fb05 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp @@ -26,6 +26,7 @@ #define HWLOC_GROUP_KIND_INTEL_DIE 104 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 #endif +#include <ctype.h> // The machine topology kmp_topology_t *__kmp_topology = nullptr; @@ -123,6 +124,20 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { return ((plural) ? "unknowns" : "unknown"); } +const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "Intel Atom(R) processor"; + case KMP_HW_CORE_TYPE_CORE: + return "Intel(R) Core(TM) processor"; +#endif + } + return "unknown"; +} + //////////////////////////////////////////////////////////////////////////////// // kmp_hw_thread_t methods int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { @@ -174,20 +189,94 @@ void kmp_hw_thread_t::print() const { for (int i = 0; i < depth; ++i) { printf("%4d ", ids[i]); } + if (attrs) { + if (attrs.is_core_type_valid()) + printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type())); + if (attrs.is_core_eff_valid()) + printf(" (eff=%d)", attrs.get_core_eff()); + } printf("\n"); } //////////////////////////////////////////////////////////////////////////////// // kmp_topology_t methods +// Add a layer to the topology based on the ids. Assume the topology +// is perfectly nested (i.e., so no object has more than one parent) +void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) { + // Figure out where the layer should go by comparing the ids of the current + // layers with the new ids + int target_layer; + int previous_id = kmp_hw_thread_t::UNKNOWN_ID; + int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID; + + // Start from the highest layer and work down to find target layer + // If new layer is equal to another layer then put the new layer above + for (target_layer = 0; target_layer < depth; ++target_layer) { + bool layers_equal = true; + bool strictly_above_target_layer = false; + for (int i = 0; i < num_hw_threads; ++i) { + int id = hw_threads[i].ids[target_layer]; + int new_id = ids[i]; + if (id != previous_id && new_id == previous_new_id) { + // Found the layer we are strictly above + strictly_above_target_layer = true; + layers_equal = false; + break; + } else if (id == previous_id && new_id != previous_new_id) { + // Found a layer we are below. Move to next layer and check. + layers_equal = false; + break; + } + previous_id = id; + previous_new_id = new_id; + } + if (strictly_above_target_layer || layers_equal) + break; + } + + // Found the layer we are above. Now move everything to accommodate the new + // layer. And put the new ids and type into the topology. 
+ for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + types[j] = types[i]; + types[target_layer] = type; + for (int k = 0; k < num_hw_threads; ++k) { + for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + hw_threads[k].ids[j] = hw_threads[k].ids[i]; + hw_threads[k].ids[target_layer] = ids[k]; + } + equivalent[type] = type; + depth++; +} + +#if KMP_GROUP_AFFINITY +// Insert the Windows Processor Group structure into the topology +void kmp_topology_t::_insert_windows_proc_groups() { + // Do not insert the processor group structure for a single group + if (__kmp_num_proc_groups == 1) + return; + kmp_affin_mask_t *mask; + int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads); + KMP_CPU_ALLOC(mask); + for (int i = 0; i < num_hw_threads; ++i) { + KMP_CPU_ZERO(mask); + KMP_CPU_SET(hw_threads[i].os_id, mask); + ids[i] = __kmp_get_proc_group(mask); + } + KMP_CPU_FREE(mask); + _insert_layer(KMP_HW_PROC_GROUP, ids); + __kmp_free(ids); +} +#endif + // Remove layers that don't add information to the topology. // This is done by having the layer take on the id = UNKNOWN_ID (-1) void kmp_topology_t::_remove_radix1_layers() { int preference[KMP_HW_LAST]; int top_index1, top_index2; // Set up preference associative array - preference[KMP_HW_PROC_GROUP] = 110; - preference[KMP_HW_SOCKET] = 100; + preference[KMP_HW_SOCKET] = 110; + preference[KMP_HW_PROC_GROUP] = 100; preference[KMP_HW_CORE] = 95; preference[KMP_HW_THREAD] = 90; preference[KMP_HW_NUMA] = 85; @@ -305,6 +394,7 @@ void kmp_topology_t::_gather_enumeration_information() { count[i] = 0; ratio[i] = 0; } + int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; for (int layer = 0; layer < depth; ++layer) { @@ -320,6 +410,29 @@ void kmp_topology_t::_gather_enumeration_information() { ratio[l] = max[l]; max[l] = 1; } + // Figure out the number of different core types + // and efficiencies for hybrid CPUs + if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) { + if (hw_thread.attrs.is_core_eff_valid() && + hw_thread.attrs.core_eff >= num_core_efficiencies) { + // Because efficiencies can range from 0 to max efficiency - 1, + // the number of efficiencies is max efficiency + 1 + num_core_efficiencies = hw_thread.attrs.core_eff + 1; + } + if (hw_thread.attrs.is_core_type_valid()) { + bool found = false; + for (int j = 0; j < num_core_types; ++j) { + if (hw_thread.attrs.get_core_type() == core_types[j]) { + found = true; + break; + } + } + if (!found) { + KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES); + core_types[num_core_types++] = hw_thread.attrs.get_core_type(); + } + } + } break; } } @@ -333,6 +446,42 @@ void kmp_topology_t::_gather_enumeration_information() { } } +int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr, + int above_level, + bool find_all) const { + int current, current_max; + int previous_id[KMP_HW_LAST]; + for (int i = 0; i < depth; ++i) + previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; + int core_level = get_level(KMP_HW_CORE); + if (find_all) + above_level = -1; + KMP_ASSERT(above_level < core_level); + current_max = 0; + current = 0; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) { + if (current > current_max) + current_max = current; + current = hw_thread.attrs.contains(attr); + } else { + for (int level = above_level + 1; level <= core_level; ++level) { + if 
(hw_thread.ids[level] != previous_id[level]) { + if (hw_thread.attrs.contains(attr)) + current++; + break; + } + } + } + for (int level = 0; level < depth; ++level) + previous_id[level] = hw_thread.ids[level]; + } + if (current > current_max) + current_max = current; + return current_max; +} + // Find out if the topology is uniform void kmp_topology_t::_discover_uniformity() { int num = 1; @@ -406,7 +555,7 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, kmp_topology_t *retval; // Allocate all data in one large allocation size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc + - sizeof(int) * ndepth * 3; + sizeof(int) * (size_t)KMP_HW_LAST * 3; char *bytes = (char *)__kmp_allocate(size); retval = (kmp_topology_t *)bytes; if (nproc > 0) { @@ -419,8 +568,12 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, int *arr = (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc); retval->types = (kmp_hw_t *)arr; - retval->ratio = arr + ndepth; - retval->count = arr + 2 * ndepth; + retval->ratio = arr + (size_t)KMP_HW_LAST; + retval->count = arr + 2 * (size_t)KMP_HW_LAST; + retval->num_core_efficiencies = 0; + retval->num_core_types = 0; + for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) + retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN; KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; } for (int i = 0; i < ndepth; ++i) { retval->types[i] = types[i]; @@ -478,6 +631,13 @@ void kmp_topology_t::dump() const { } printf("\n"); + printf("* num_core_eff: %d\n", num_core_efficiencies); + printf("* num_core_types: %d\n", num_core_types); + printf("* core_types: "); + for (int i = 0; i < num_core_types; ++i) + printf("%3d ", core_types[i]); + printf("\n"); + printf("* equivalent map:\n"); KMP_FOREACH_HW_TYPE(i) { const char *key = __kmp_hw_get_keyword(i); @@ -571,6 +731,29 @@ void kmp_topology_t::print(const char *env_var) const { } KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); + // Hybrid topology information + if (__kmp_is_hybrid_cpu()) { + for (int i = 0; i < num_core_types; ++i) { + kmp_hw_core_type_t core_type = core_types[i]; + kmp_hw_attr_t attr; + attr.clear(); + attr.set_core_type(core_type); + int ncores = get_ncores_with_attr(attr); + if (ncores > 0) { + KMP_INFORM(TopologyHybrid, env_var, ncores, + __kmp_hw_get_core_type_string(core_type)); + KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS) + for (int eff = 0; eff < num_core_efficiencies; ++eff) { + attr.set_core_eff(eff); + int ncores_with_eff = get_ncores_with_attr(attr); + if (ncores_with_eff > 0) { + KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff); + } + } + } + } + } + if (num_hw_threads <= 0) { __kmp_str_buf_free(&buf); return; @@ -585,6 +768,10 @@ void kmp_topology_t::print(const char *env_var) const { __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); } + if (__kmp_is_hybrid_cpu()) + __kmp_str_buf_print( + &buf, "(%s)", + __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type())); KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); } @@ -592,6 +779,9 @@ void kmp_topology_t::print(const char *env_var) const { } void kmp_topology_t::canonicalize() { +#if KMP_GROUP_AFFINITY + _insert_windows_proc_groups(); +#endif _remove_radix1_layers(); _gather_enumeration_information(); _discover_uniformity(); @@ -640,6 +830,25 @@ void kmp_topology_t::canonicalize() { __kmp_hw_get_catalog_string(gran_type)); 
__kmp_affinity_gran = gran_type; } +#if KMP_GROUP_AFFINITY + // If more than one processor group exists, and the level of + // granularity specified by the user is too coarse, then the + // granularity must be adjusted "down" to processor group affinity + // because threads can only exist within one processor group. + // For example, if a user sets granularity=socket and there are two + // processor groups that cover a socket, then the runtime must + // restrict the granularity down to the processor group level. + if (__kmp_num_proc_groups > 1) { + int gran_depth = __kmp_topology->get_level(gran_type); + int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP); + if (gran_depth >= 0 && proc_group_depth >= 0 && + gran_depth < proc_group_depth) { + KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY", + __kmp_hw_get_catalog_string(__kmp_affinity_gran)); + __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP; + } + } +#endif __kmp_affinity_gran_levels = 0; for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) __kmp_affinity_gran_levels++; @@ -673,6 +882,56 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, _discover_uniformity(); } +// Represents running sub IDs for a single core attribute where +// attribute values have SIZE possibilities. +template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t { + int last_level; // last level in topology to consider for sub_ids + int sub_id[SIZE]; // The sub ID for a given attribute value + int prev_sub_id[KMP_HW_LAST]; + IndexFunc indexer; + +public: + kmp_sub_ids_t(int last_level) : last_level(last_level) { + KMP_ASSERT(last_level < KMP_HW_LAST); + for (size_t i = 0; i < SIZE; ++i) + sub_id[i] = -1; + for (size_t i = 0; i < KMP_HW_LAST; ++i) + prev_sub_id[i] = -1; + } + void update(const kmp_hw_thread_t &hw_thread) { + int idx = indexer(hw_thread); + KMP_ASSERT(idx < (int)SIZE); + for (int level = 0; level <= last_level; ++level) { + if (hw_thread.sub_ids[level] != prev_sub_id[level]) { + if (level < last_level) + sub_id[idx] = -1; + sub_id[idx]++; + break; + } + } + for (int level = 0; level <= last_level; ++level) + prev_sub_id[level] = hw_thread.sub_ids[level]; + } + int get_sub_id(const kmp_hw_thread_t &hw_thread) const { + return sub_id[indexer(hw_thread)]; + } +}; + +static kmp_str_buf_t * +__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, + bool plural) { + __kmp_str_buf_init(buf); + if (attr.is_core_type_valid()) + __kmp_str_buf_print(buf, "%s %s", + __kmp_hw_get_core_type_string(attr.get_core_type()), + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural)); + else + __kmp_str_buf_print(buf, "%s eff=%d", + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural), + attr.get_core_eff()); + return buf; +} + // Apply the KMP_HW_SUBSET envirable to the topology // Returns true if KMP_HW_SUBSET filtered any processors // otherwise, returns false @@ -681,18 +940,27 @@ bool kmp_topology_t::filter_hw_subset() { if (!__kmp_hw_subset) return false; + // First, sort the KMP_HW_SUBSET items by the machine topology + __kmp_hw_subset->sort(); + // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology + bool using_core_types = false; + bool using_core_effs = false; int hw_subset_depth = __kmp_hw_subset->get_depth(); kmp_hw_t specified[KMP_HW_LAST]; + int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth); KMP_ASSERT(hw_subset_depth > 0); KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; } + int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < 
hw_subset_depth; ++i) { int max_count; - int num = __kmp_hw_subset->at(i).num; - int offset = __kmp_hw_subset->at(i).offset; - kmp_hw_t type = __kmp_hw_subset->at(i).type; + const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i); + int num = item.num[0]; + int offset = item.offset[0]; + kmp_hw_t type = item.type; kmp_hw_t equivalent_type = equivalent[type]; int level = get_level(type); + topology_levels[i] = level; // Check to see if current layer is in detected machine topology if (equivalent_type != KMP_HW_UNKNOWN) { @@ -703,8 +971,8 @@ bool kmp_topology_t::filter_hw_subset() { return false; } - // Check to see if current layer has already been specified - // either directly or through an equivalent type + // Check to see if current layer has already been + // specified either directly or through an equivalent type if (specified[equivalent_type] != KMP_HW_UNKNOWN) { KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), __kmp_hw_get_catalog_string(specified[equivalent_type])); @@ -712,66 +980,247 @@ bool kmp_topology_t::filter_hw_subset() { } specified[equivalent_type] = type; - // Check to see if layers are in order - if (i + 1 < hw_subset_depth) { - kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type); - if (next_type == KMP_HW_UNKNOWN) { - KMP_WARNING( - AffHWSubsetNotExistGeneric, - __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type)); - return false; - } - int next_topology_level = get_level(next_type); - if (level > next_topology_level) { - KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type), - __kmp_hw_get_catalog_string(next_type)); - return false; - } - } - // Check to see if each layer's num & offset parameters are valid max_count = get_ratio(level); - if (max_count < 0 || num + offset > max_count) { + if (max_count < 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { bool plural = (num > 1); KMP_WARNING(AffHWSubsetManyGeneric, __kmp_hw_get_catalog_string(type, plural)); return false; } + + // Check to see if core attributes are consistent + if (core_level == level) { + // Determine which core attributes are specified + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_type_valid()) + using_core_types = true; + if (item.attr[j].is_core_eff_valid()) + using_core_effs = true; + } + + // Check if using a single core attribute on non-hybrid arch. + // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute. + // + // Check if using multiple core attributes on non-hyrbid arch. + // Ignore all of KMP_HW_SUBSET if this is the case. 
+ if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) { + if (item.num_attrs == 1) { + if (using_core_effs) { + KMP_WARNING(AffHWSubsetIgnoringAttr, "efficiency"); + } else { + KMP_WARNING(AffHWSubsetIgnoringAttr, "core_type"); + } + using_core_effs = false; + using_core_types = false; + } else { + KMP_WARNING(AffHWSubsetAttrsNonHybrid); + return false; + } + } + + // Check if using both core types and core efficiencies together + if (using_core_types && using_core_effs) { + KMP_WARNING(AffHWSubsetIncompat, "core_type", "efficiency"); + return false; + } + + // Check that core efficiency values are valid + if (using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_eff_valid()) { + int core_eff = item.attr[j].get_core_eff(); + if (core_eff < 0 || core_eff >= num_core_efficiencies) { + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff()); + __kmp_msg(kmp_ms_warning, + KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str), + KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1), + __kmp_msg_null); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + // Check that the number of requested cores with attributes is valid + if (using_core_types || using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + int num = item.num[j]; + int offset = item.offset[j]; + int level_above = core_level - 1; + if (level_above >= 0) { + max_count = get_ncores_with_attr_per(item.attr[j], level_above); + if (max_count <= 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0); + KMP_WARNING(AffHWSubsetManyGeneric, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + if ((using_core_types || using_core_effs) && item.num_attrs > 1) { + for (int j = 0; j < item.num_attrs; ++j) { + // Ambiguous use of specific core attribute + generic core + // e.g., 4c & 3c:intel_core or 4c & 3c:eff1 + if (!item.attr[j]) { + kmp_hw_attr_t other_attr; + for (int k = 0; k < item.num_attrs; ++k) { + if (item.attr[k] != item.attr[j]) { + other_attr = item.attr[k]; + break; + } + } + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0); + KMP_WARNING(AffHWSubsetIncompat, + __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str); + __kmp_str_buf_free(&buf); + return false; + } + // Allow specifying a specific core type or core eff exactly once + for (int k = 0; k < j; ++k) { + if (!item.attr[j] || !item.attr[k]) + continue; + if (item.attr[k] == item.attr[j]) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, + item.num[j] > 0); + KMP_WARNING(AffHWSubsetAttrRepeat, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + } } - // Apply the filtered hardware subset - int new_index = 0; + struct core_type_indexer { + int operator()(const kmp_hw_thread_t &t) const { + switch (t.attrs.get_core_type()) { +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return 1; + case KMP_HW_CORE_TYPE_CORE: + return 2; +#endif + case KMP_HW_CORE_TYPE_UNKNOWN: + return 0; + } + KMP_ASSERT(0); + return 0; + } + }; + struct core_eff_indexer { + int operator()(const kmp_hw_thread_t &t) const { + return t.attrs.get_core_eff(); + } + }; + + kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids( + core_level); + kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids( + core_level); + + 
// Determine which hardware threads should be filtered. + int num_filtered = 0; + bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Update type_sub_id + if (using_core_types) + core_type_sub_ids.update(hw_thread); + if (using_core_effs) + core_eff_sub_ids.update(hw_thread); + // Check to see if this hardware thread should be filtered bool should_be_filtered = false; - for (int level = 0, hw_subset_index = 0; - level < depth && hw_subset_index < hw_subset_depth; ++level) { - kmp_hw_t topology_type = types[level]; - auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index); - kmp_hw_t hw_subset_type = hw_subset_item.type; - if (topology_type != hw_subset_type) + for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth; + ++hw_subset_index) { + const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index); + int level = topology_levels[hw_subset_index]; + if (level == -1) continue; - int num = hw_subset_item.num; - int offset = hw_subset_item.offset; - hw_subset_index++; - if (hw_thread.sub_ids[level] < offset || - hw_thread.sub_ids[level] >= offset + num) { - should_be_filtered = true; - break; + if ((using_core_effs || using_core_types) && level == core_level) { + // Look for the core attribute in KMP_HW_SUBSET which corresponds + // to this hardware thread's core attribute. Use this num,offset plus + // the running sub_id for the particular core attribute of this hardware + // thread to determine if the hardware thread should be filtered or not. + int attr_idx; + kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type(); + int core_eff = hw_thread.attrs.get_core_eff(); + for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) { + if (using_core_types && + hw_subset_item.attr[attr_idx].get_core_type() == core_type) + break; + if (using_core_effs && + hw_subset_item.attr[attr_idx].get_core_eff() == core_eff) + break; + } + // This core attribute isn't in the KMP_HW_SUBSET so always filter it. 
+ if (attr_idx == hw_subset_item.num_attrs) { + should_be_filtered = true; + break; + } + int sub_id; + int num = hw_subset_item.num[attr_idx]; + int offset = hw_subset_item.offset[attr_idx]; + if (using_core_types) + sub_id = core_type_sub_ids.get_sub_id(hw_thread); + else + sub_id = core_eff_sub_ids.get_sub_id(hw_thread); + if (sub_id < offset || + (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) { + should_be_filtered = true; + break; + } + } else { + int num = hw_subset_item.num[0]; + int offset = hw_subset_item.offset[0]; + if (hw_thread.sub_ids[level] < offset || + (num != kmp_hw_subset_t::USE_ALL && + hw_thread.sub_ids[level] >= offset + num)) { + should_be_filtered = true; + break; + } } } - if (!should_be_filtered) { + // Collect filtering information + filtered[i] = should_be_filtered; + if (should_be_filtered) + num_filtered++; + } + + // One last check that we shouldn't allow filtering entire machine + if (num_filtered == num_hw_threads) { + KMP_WARNING(AffHWSubsetAllFiltered); + __kmp_free(filtered); + return false; + } + + // Apply the filter + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + if (!filtered[i]) { if (i != new_index) - hw_threads[new_index] = hw_thread; + hw_threads[new_index] = hw_threads[i]; new_index++; } else { #if KMP_AFFINITY_SUPPORTED - KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask); + KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask); #endif __kmp_avail_proc--; } } + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); num_hw_threads = new_index; @@ -780,6 +1229,7 @@ bool kmp_topology_t::filter_hw_subset() { _discover_uniformity(); _set_globals(); _set_last_level_cache(); + __kmp_free(filtered); return true; } @@ -986,7 +1436,67 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, return buf; } -void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { +// Return (possibly empty) affinity mask representing the offline CPUs +// Caller must free the mask +kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { + kmp_affin_mask_t *offline; + KMP_CPU_ALLOC(offline); + KMP_CPU_ZERO(offline); +#if KMP_OS_LINUX + int n, begin_cpu, end_cpu; + kmp_safe_raii_file_t offline_file; + auto skip_ws = [](FILE *f) { + int c; + do { + c = fgetc(f); + } while (isspace(c)); + if (c != EOF) + ungetc(c, f); + }; + // File contains CSV of integer ranges representing the offline CPUs + // e.g., 1,2,4-7,9,11-15 + int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r"); + if (status != 0) + return offline; + while (!feof(offline_file)) { + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &begin_cpu); + if (n != 1) + break; + skip_ws(offline_file); + int c = fgetc(offline_file); + if (c == EOF || c == ',') { + // Just single CPU + end_cpu = begin_cpu; + } else if (c == '-') { + // Range of CPUs + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &end_cpu); + if (n != 1) + break; + skip_ws(offline_file); + c = fgetc(offline_file); // skip ',' + } else { + // Syntax problem + break; + } + // Ensure a valid range of CPUs + if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 || + end_cpu >= __kmp_xproc || begin_cpu > end_cpu) { + continue; + } + // Insert [begin_cpu, end_cpu] into offline mask + for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) { + KMP_CPU_SET(cpu, offline); + } + } +#endif + return offline; +} + +// Return the number of available procs +int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { + int avail_proc = 0; KMP_CPU_ZERO(mask); #if KMP_GROUP_AFFINITY @@ -999,6 
+1509,7 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { int num = __kmp_GetActiveProcessorCount(group); for (i = 0; i < num; i++) { KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); + avail_proc++; } } } else @@ -1007,10 +1518,18 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { { int proc; + kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus(); for (proc = 0; proc < __kmp_xproc; proc++) { + // Skip offline CPUs + if (KMP_CPU_ISSET(proc, offline_cpus)) + continue; KMP_CPU_SET(proc, mask); + avail_proc++; } + KMP_CPU_FREE(offline_cpus); } + + return avail_proc; } // All of the __kmp_affinity_create_*_map() routines should allocate the @@ -1156,6 +1675,45 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { return true; } + // Handle multiple types of cores if they exist on the system + int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0); + + typedef struct kmp_hwloc_cpukinds_info_t { + int efficiency; + kmp_hw_core_type_t core_type; + hwloc_bitmap_t mask; + } kmp_hwloc_cpukinds_info_t; + kmp_hwloc_cpukinds_info_t *cpukinds = nullptr; + + if (nr_cpu_kinds > 0) { + unsigned nr_infos; + struct hwloc_info_s *infos; + cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate( + sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds); + for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) { + cpukinds[idx].efficiency = -1; + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN; + cpukinds[idx].mask = hwloc_bitmap_alloc(); + if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask, + &cpukinds[idx].efficiency, &nr_infos, &infos, + 0) == 0) { + for (unsigned i = 0; i < nr_infos; ++i) { + if (__kmp_str_match("CoreType", 8, infos[i].name)) { +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_str_match("IntelAtom", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM; + break; + } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE; + break; + } +#endif + } + } + } + } + } + root = hwloc_get_root_obj(tp); // Figure out the depth and types in the topology @@ -1215,6 +1773,20 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hw_thread.clear(); hw_thread.ids[index] = pu->logical_index; hw_thread.os_id = pu->os_index; + // If multiple core types, then set that attribute for the hardware thread + if (cpukinds) { + int cpukind_index = -1; + for (int i = 0; i < nr_cpu_kinds; ++i) { + if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) { + cpukind_index = i; + break; + } + } + if (cpukind_index >= 0) { + hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type); + hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency); + } + } index--; } obj = pu; @@ -1258,6 +1830,13 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { if (included) hw_thread_index++; } + + // Free the core types information + if (cpukinds) { + for (int idx = 0; idx < nr_cpu_kinds; ++idx) + hwloc_bitmap_free(cpukinds[idx].mask); + __kmp_free(cpukinds); + } __kmp_topology->sort_ids(); return true; } @@ -1782,6 +2361,26 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { return true; } +// Hybrid cpu detection using CPUID.1A +// Thread should be pinned to processor already +static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency, + unsigned *native_model_id) { + kmp_cpuid buf; + __kmp_x86_cpuid(0x1a, 0, &buf); + *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 
31>(buf.eax); + switch (*type) { + case KMP_HW_CORE_TYPE_ATOM: + *efficiency = 0; + break; + case KMP_HW_CORE_TYPE_CORE: + *efficiency = 1; + break; + default: + *efficiency = 0; + } + *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); +} + // Intel(R) microarchitecture code name Nehalem, Dunnington and later // architectures support a newer interface for specifying the x2APIC Ids, // based on CPUID.B or CPUID.1F @@ -2051,6 +2650,15 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; } } + // Hybrid information + if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { + kmp_hw_core_type_t type; + unsigned native_model_id; + int efficiency; + __kmp_get_hybrid_info(&type, &efficiency, &native_model_id); + hw_thread.attrs.set_core_type(type); + hw_thread.attrs.set_core_eff(efficiency); + } hw_thread_index++; } KMP_ASSERT(hw_thread_index > 0); @@ -2386,7 +2994,10 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; - KMP_ASSERT(nodeIdIndex + level <= maxIndex); + // validate the input before using level: + if (level > (unsigned)__kmp_xproc) { // level is too big + level = __kmp_xproc; + } if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; threadInfo[num_avail][nodeIdIndex + level] = val; @@ -3497,8 +4108,8 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_affin_fullMask); KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); } - __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); - __kmp_avail_proc = __kmp_xproc; + __kmp_avail_proc = + __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); #if KMP_OS_WINDOWS // Set the process affinity mask since threads' affinity // masks must be subset of process mask in Windows* OS @@ -4145,14 +4756,19 @@ int __kmp_aux_set_affinity(void **mask) { int __kmp_aux_get_affinity(void **mask) { int gtid; int retval; +#if KMP_OS_WINDOWS || KMP_DEBUG kmp_info_t *th; - +#endif if (!KMP_AFFINITY_CAPABLE()) { return -1; } gtid = __kmp_entry_gtid(); +#if KMP_OS_WINDOWS || KMP_DEBUG th = __kmp_threads[gtid]; +#else + (void)gtid; // unused variable +#endif KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); KA_TRACE( |