| author | mikhnenko <[email protected]> | 2025-07-15 20:05:43 +0300 |
|---|---|---|
| committer | mikhnenko <[email protected]> | 2025-07-15 20:52:16 +0300 |
| commit | a40bd4f45bbc18fd95b1596e655b8942ceb2cf4b (patch) | |
| tree | bce599ca02c778c277198de6d131d37db71997d0 /contrib/libs/cxxsupp/openmp/kmp_affinity.cpp | |
| parent | 728e0eaef4dc1f1152d2c3a4cc1bbdf597f3ef3d (diff) | |
Update contrib/libs/cxxsupp/openmp to 20.1.7
commit_hash:722dd5fe79203d22ad4a0be288ac0caeb6b3dd68
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_affinity.cpp')
-rw-r--r-- | contrib/libs/cxxsupp/openmp/kmp_affinity.cpp | 2109 |
1 file changed, 1489 insertions, 620 deletions
diff --git a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp index b9a8d49d8da..624fb3b0761 100644 --- a/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp +++ b/contrib/libs/cxxsupp/openmp/kmp_affinity.cpp @@ -38,6 +38,43 @@ static hierarchy_info machine_hierarchy; void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); } +#if KMP_AFFINITY_SUPPORTED +// Helper class to see if place lists further restrict the fullMask +class kmp_full_mask_modifier_t { + kmp_affin_mask_t *mask; + +public: + kmp_full_mask_modifier_t() { + KMP_CPU_ALLOC(mask); + KMP_CPU_ZERO(mask); + } + ~kmp_full_mask_modifier_t() { + KMP_CPU_FREE(mask); + mask = nullptr; + } + void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); } + // If the new full mask is different from the current full mask, + // then switch them. Returns true if full mask was affected, false otherwise. + bool restrict_to_mask() { + // See if the new mask further restricts or changes the full mask + if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask)) + return false; + return __kmp_topology->restrict_to_mask(mask); + } +}; + +static inline const char * +__kmp_get_affinity_env_var(const kmp_affinity_t &affinity, + bool for_binding = false) { + if (affinity.flags.omp_places) { + if (for_binding) + return "OMP_PROC_BIND"; + return "OMP_PLACES"; + } + return affinity.env_var; +} +#endif // KMP_AFFINITY_SUPPORTED + void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { kmp_uint32 depth; // The test below is true if affinity is available, but set to "none". Need to @@ -90,8 +127,12 @@ const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread)); case KMP_HW_PROC_GROUP: return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup)); + case KMP_HW_UNKNOWN: + case KMP_HW_LAST: + return KMP_I18N_STR(Unknown); } - return KMP_I18N_STR(Unknown); + KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); + KMP_BUILTIN_UNREACHABLE; } const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { @@ -120,13 +161,18 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { return ((plural) ? "threads" : "thread"); case KMP_HW_PROC_GROUP: return ((plural) ? "proc_groups" : "proc_group"); + case KMP_HW_UNKNOWN: + case KMP_HW_LAST: + return ((plural) ? "unknowns" : "unknown"); } - return ((plural) ? "unknowns" : "unknown"); + KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); + KMP_BUILTIN_UNREACHABLE; } const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { switch (type) { case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: return "unknown"; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: @@ -135,19 +181,19 @@ const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { return "Intel(R) Core(TM) processor"; #endif } - return "unknown"; + KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration"); + KMP_BUILTIN_UNREACHABLE; } #if KMP_AFFINITY_SUPPORTED // If affinity is supported, check the affinity // verbose and warning flags before printing warning -#define KMP_AFF_WARNING(...) \ - if (__kmp_affinity_verbose || \ - (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { \ +#define KMP_AFF_WARNING(s, ...) \ + if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \ KMP_WARNING(__VA_ARGS__); \ } #else -#define KMP_AFF_WARNING KMP_WARNING +#define KMP_AFF_WARNING(s, ...) 
KMP_WARNING(__VA_ARGS__) #endif //////////////////////////////////////////////////////////////////////////////// @@ -157,7 +203,26 @@ int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b; int depth = __kmp_topology->get_depth(); for (int level = 0; level < depth; ++level) { - if (ahwthread->ids[level] < bhwthread->ids[level]) + // Reverse sort (higher efficiencies earlier in list) cores by core + // efficiency if available. + if (__kmp_is_hybrid_cpu() && + __kmp_topology->get_type(level) == KMP_HW_CORE && + ahwthread->attrs.is_core_eff_valid() && + bhwthread->attrs.is_core_eff_valid()) { + if (ahwthread->attrs.get_core_eff() < bhwthread->attrs.get_core_eff()) + return 1; + if (ahwthread->attrs.get_core_eff() > bhwthread->attrs.get_core_eff()) + return -1; + } + if (ahwthread->ids[level] == bhwthread->ids[level]) + continue; + // If the hardware id is unknown for this level, then place hardware thread + // further down in the sorted list as it should take last priority + if (ahwthread->ids[level] == UNKNOWN_ID) + return 1; + else if (bhwthread->ids[level] == UNKNOWN_ID) + return -1; + else if (ahwthread->ids[level] < bhwthread->ids[level]) return -1; else if (ahwthread->ids[level] > bhwthread->ids[level]) return 1; @@ -175,9 +240,10 @@ int kmp_hw_thread_t::compare_compact(const void *a, const void *b) { const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a; const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b; int depth = __kmp_topology->get_depth(); - KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); - KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth); - for (i = 0; i < __kmp_affinity_compact; i++) { + int compact = __kmp_topology->compact; + KMP_DEBUG_ASSERT(compact >= 0); + KMP_DEBUG_ASSERT(compact <= depth); + for (i = 0; i < compact; i++) { int j = depth - i - 1; if (aa->sub_ids[j] < bb->sub_ids[j]) return -1; @@ -185,7 +251,7 @@ int kmp_hw_thread_t::compare_compact(const void *a, const void *b) { return 1; } for (; i < depth; i++) { - int j = i - __kmp_affinity_compact; + int j = i - compact; if (aa->sub_ids[j] < bb->sub_ids[j]) return -1; if (aa->sub_ids[j] > bb->sub_ids[j]) @@ -199,7 +265,7 @@ void kmp_hw_thread_t::print() const { int depth = __kmp_topology->get_depth(); printf("%4d ", os_id); for (int i = 0; i < depth; ++i) { - printf("%4d ", ids[i]); + printf("%4d (%d) ", ids[i], sub_ids[i]); } if (attrs) { if (attrs.is_core_type_valid()) @@ -207,6 +273,8 @@ void kmp_hw_thread_t::print() const { if (attrs.is_core_eff_valid()) printf(" (eff=%d)", attrs.get_core_eff()); } + if (leader) + printf(" (leader)"); printf("\n"); } @@ -215,7 +283,7 @@ void kmp_hw_thread_t::print() const { // Add a layer to the topology based on the ids. 
Assume the topology // is perfectly nested (i.e., so no object has more than one parent) -void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) { +void kmp_topology_t::insert_layer(kmp_hw_t type, const int *ids) { // Figure out where the layer should go by comparing the ids of the current // layers with the new ids int target_layer; @@ -276,8 +344,11 @@ void kmp_topology_t::_insert_windows_proc_groups() { ids[i] = __kmp_get_proc_group(mask); } KMP_CPU_FREE(mask); - _insert_layer(KMP_HW_PROC_GROUP, ids); + insert_layer(KMP_HW_PROC_GROUP, ids); __kmp_free(ids); + + // sort topology after adding proc groups + __kmp_topology->sort_ids(); } #endif @@ -413,10 +484,13 @@ void kmp_topology_t::_gather_enumeration_information() { int id = hw_thread.ids[layer]; if (id != previous_id[layer]) { // Add an additional increment to each count - for (int l = layer; l < depth; ++l) - count[l]++; + for (int l = layer; l < depth; ++l) { + if (hw_thread.ids[l] != kmp_hw_thread_t::UNKNOWN_ID) + count[l]++; + } // Keep track of topology layer ratio statistics - max[layer]++; + if (hw_thread.ids[layer] != kmp_hw_thread_t::UNKNOWN_ID) + max[layer]++; for (int l = layer + 1; l < depth; ++l) { if (max[l] > ratio[l]) ratio[l] = max[l]; @@ -584,6 +658,7 @@ kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, retval->count = arr + 2 * (size_t)KMP_HW_LAST; retval->num_core_efficiencies = 0; retval->num_core_types = 0; + retval->compact = 0; for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN; KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; } @@ -674,7 +749,11 @@ void kmp_topology_t::print(const char *env_var) const { kmp_hw_t print_types[KMP_HW_LAST + 2]; // Num Available Threads - KMP_INFORM(AvailableOSProc, env_var, num_hw_threads); + if (num_hw_threads) { + KMP_INFORM(AvailableOSProc, env_var, num_hw_threads); + } else { + KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc); + } // Uniform or not if (is_uniform()) { @@ -776,6 +855,8 @@ void kmp_topology_t::print(const char *env_var) const { for (int i = 0; i < num_hw_threads; i++) { __kmp_str_buf_clear(&buf); for (int level = 0; level < depth; ++level) { + if (hw_threads[i].ids[level] == kmp_hw_thread_t::UNKNOWN_ID) + continue; kmp_hw_t type = types[level]; __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); @@ -790,41 +871,45 @@ void kmp_topology_t::print(const char *env_var) const { __kmp_str_buf_free(&buf); } -void kmp_topology_t::canonicalize() { -#if KMP_GROUP_AFFINITY - _insert_windows_proc_groups(); -#endif - _remove_radix1_layers(); - _gather_enumeration_information(); - _discover_uniformity(); - _set_sub_ids(); - _set_globals(); - _set_last_level_cache(); - -#if KMP_MIC_SUPPORTED - // Manually Add L2 = Tile equivalence - if (__kmp_mic_type == mic3) { - if (get_level(KMP_HW_L2) != -1) - set_equivalent_type(KMP_HW_TILE, KMP_HW_L2); - else if (get_level(KMP_HW_TILE) != -1) - set_equivalent_type(KMP_HW_L2, KMP_HW_TILE); - } -#endif - - // Perform post canonicalization checking - KMP_ASSERT(depth > 0); - for (int level = 0; level < depth; ++level) { - // All counts, ratios, and types must be valid - KMP_ASSERT(count[level] > 0 && ratio[level] > 0); - KMP_ASSERT_VALID_HW_TYPE(types[level]); - // Detected types must point to themselves - KMP_ASSERT(equivalent[types[level]] == types[level]); - } - #if KMP_AFFINITY_SUPPORTED +void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const { + 
const char *env_var = __kmp_get_affinity_env_var(affinity); + // If requested hybrid CPU attributes for granularity (either OMP_PLACES or + // KMP_AFFINITY), but none exist, then reset granularity and have below method + // select a granularity and warn user. + if (!__kmp_is_hybrid_cpu()) { + if (affinity.core_attr_gran.valid) { + // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores + // instead + KMP_AFF_WARNING( + affinity, AffIgnoringNonHybrid, env_var, + __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)); + affinity.gran = KMP_HW_CORE; + affinity.gran_levels = -1; + affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN; + affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0; + } else if (affinity.flags.core_types_gran || + affinity.flags.core_effs_gran) { + // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead + if (affinity.flags.omp_places) { + KMP_AFF_WARNING( + affinity, AffIgnoringNonHybrid, env_var, + __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)); + } else { + // KMP_AFFINITY=granularity=core_type|core_eff,... + KMP_AFF_WARNING(affinity, AffGranularityBad, env_var, + "Intel(R) Hybrid Technology core attribute", + __kmp_hw_get_catalog_string(KMP_HW_CORE)); + } + affinity.gran = KMP_HW_CORE; + affinity.gran_levels = -1; + affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN; + affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0; + } + } // Set the number of affinity granularity levels - if (__kmp_affinity_gran_levels < 0) { - kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran); + if (affinity.gran_levels < 0) { + kmp_hw_t gran_type = get_equivalent_type(affinity.gran); // Check if user's granularity request is valid if (gran_type == KMP_HW_UNKNOWN) { // First try core, then thread, then package @@ -837,10 +922,10 @@ void kmp_topology_t::canonicalize() { } KMP_ASSERT(gran_type != KMP_HW_UNKNOWN); // Warn user what granularity setting will be used instead - KMP_AFF_WARNING(AffGranularityBad, "KMP_AFFINITY", - __kmp_hw_get_catalog_string(__kmp_affinity_gran), + KMP_AFF_WARNING(affinity, AffGranularityBad, env_var, + __kmp_hw_get_catalog_string(affinity.gran), __kmp_hw_get_catalog_string(gran_type)); - __kmp_affinity_gran = gran_type; + affinity.gran = gran_type; } #if KMP_GROUP_AFFINITY // If more than one processor group exists, and the level of @@ -855,17 +940,49 @@ void kmp_topology_t::canonicalize() { int proc_group_depth = get_level(KMP_HW_PROC_GROUP); if (gran_depth >= 0 && proc_group_depth >= 0 && gran_depth < proc_group_depth) { - KMP_AFF_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY", - __kmp_hw_get_catalog_string(__kmp_affinity_gran)); - __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP; + KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var, + __kmp_hw_get_catalog_string(affinity.gran)); + affinity.gran = gran_type = KMP_HW_PROC_GROUP; } } #endif - __kmp_affinity_gran_levels = 0; + affinity.gran_levels = 0; for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) - __kmp_affinity_gran_levels++; + affinity.gran_levels++; + } +} +#endif + +void kmp_topology_t::canonicalize() { +#if KMP_GROUP_AFFINITY + _insert_windows_proc_groups(); +#endif + _remove_radix1_layers(); + _gather_enumeration_information(); + _discover_uniformity(); + _set_sub_ids(); + _set_globals(); + _set_last_level_cache(); + +#if KMP_MIC_SUPPORTED + // Manually Add L2 = Tile equivalence + if (__kmp_mic_type == mic3) { + if (get_level(KMP_HW_L2) != -1) + set_equivalent_type(KMP_HW_TILE, KMP_HW_L2); + 
else if (get_level(KMP_HW_TILE) != -1) + set_equivalent_type(KMP_HW_L2, KMP_HW_TILE); + } +#endif + + // Perform post canonicalization checking + KMP_ASSERT(depth > 0); + for (int level = 0; level < depth; ++level) { + // All counts, ratios, and types must be valid + KMP_ASSERT(count[level] > 0 && ratio[level] > 0); + KMP_ASSERT_VALID_HW_TYPE(types[level]); + // Detected types must point to themselves + KMP_ASSERT(equivalent[types[level]] == types[level]); } -#endif // KMP_AFFINITY_SUPPORTED } // Canonicalize an explicit packages X cores/pkg X threads/core topology @@ -894,41 +1011,7 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, _discover_uniformity(); } -// Represents running sub IDs for a single core attribute where -// attribute values have SIZE possibilities. -template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t { - int last_level; // last level in topology to consider for sub_ids - int sub_id[SIZE]; // The sub ID for a given attribute value - int prev_sub_id[KMP_HW_LAST]; - IndexFunc indexer; - -public: - kmp_sub_ids_t(int last_level) : last_level(last_level) { - KMP_ASSERT(last_level < KMP_HW_LAST); - for (size_t i = 0; i < SIZE; ++i) - sub_id[i] = -1; - for (size_t i = 0; i < KMP_HW_LAST; ++i) - prev_sub_id[i] = -1; - } - void update(const kmp_hw_thread_t &hw_thread) { - int idx = indexer(hw_thread); - KMP_ASSERT(idx < (int)SIZE); - for (int level = 0; level <= last_level; ++level) { - if (hw_thread.sub_ids[level] != prev_sub_id[level]) { - if (level < last_level) - sub_id[idx] = -1; - sub_id[idx]++; - break; - } - } - for (int level = 0; level <= last_level; ++level) - prev_sub_id[level] = hw_thread.sub_ids[level]; - } - int get_sub_id(const kmp_hw_thread_t &hw_thread) const { - return sub_id[indexer(hw_thread)]; - } -}; - +#if KMP_AFFINITY_SUPPORTED static kmp_str_buf_t * __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, bool plural) { @@ -944,6 +1027,41 @@ __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, return buf; } +bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) { + // Apply the filter + bool affected; + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + int os_id = hw_threads[i].os_id; + if (KMP_CPU_ISSET(os_id, mask)) { + if (i != new_index) + hw_threads[new_index] = hw_threads[i]; + new_index++; + } else { + KMP_CPU_CLR(os_id, __kmp_affin_fullMask); + __kmp_avail_proc--; + } + } + + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); + affected = (num_hw_threads != new_index); + num_hw_threads = new_index; + + // Post hardware subset canonicalization + if (affected) { + _gather_enumeration_information(); + _discover_uniformity(); + _set_globals(); + _set_last_level_cache(); +#if KMP_OS_WINDOWS + // Copy filtered full mask if topology has single processor group + if (__kmp_num_proc_groups <= 1) +#endif + __kmp_affin_origMask->copy(__kmp_affin_fullMask); + } + return affected; +} + // Apply the KMP_HW_SUBSET envirable to the topology // Returns true if KMP_HW_SUBSET filtered any processors // otherwise, returns false @@ -955,9 +1073,12 @@ bool kmp_topology_t::filter_hw_subset() { // First, sort the KMP_HW_SUBSET items by the machine topology __kmp_hw_subset->sort(); + __kmp_hw_subset->canonicalize(__kmp_topology); + // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology bool using_core_types = false; bool using_core_effs = false; + bool is_absolute = __kmp_hw_subset->is_absolute(); int hw_subset_depth = 
__kmp_hw_subset->get_depth(); kmp_hw_t specified[KMP_HW_LAST]; int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth); @@ -978,7 +1099,7 @@ bool kmp_topology_t::filter_hw_subset() { if (equivalent_type != KMP_HW_UNKNOWN) { __kmp_hw_subset->at(i).type = equivalent_type; } else { - KMP_AFF_WARNING(AffHWSubsetNotExistGeneric, + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric, __kmp_hw_get_catalog_string(type)); return false; } @@ -986,7 +1107,8 @@ bool kmp_topology_t::filter_hw_subset() { // Check to see if current layer has already been // specified either directly or through an equivalent type if (specified[equivalent_type] != KMP_HW_UNKNOWN) { - KMP_AFF_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers, + __kmp_hw_get_catalog_string(type), __kmp_hw_get_catalog_string(specified[equivalent_type])); return false; } @@ -994,12 +1116,14 @@ bool kmp_topology_t::filter_hw_subset() { // Check to see if each layer's num & offset parameters are valid max_count = get_ratio(level); - if (max_count < 0 || - (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { - bool plural = (num > 1); - KMP_AFF_WARNING(AffHWSubsetManyGeneric, - __kmp_hw_get_catalog_string(type, plural)); - return false; + if (!is_absolute) { + if (max_count < 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { + bool plural = (num > 1); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, + __kmp_hw_get_catalog_string(type, plural)); + return false; + } } // Check to see if core attributes are consistent @@ -1020,21 +1144,24 @@ bool kmp_topology_t::filter_hw_subset() { if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) { if (item.num_attrs == 1) { if (using_core_effs) { - KMP_AFF_WARNING(AffHWSubsetIgnoringAttr, "efficiency"); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr, + "efficiency"); } else { - KMP_AFF_WARNING(AffHWSubsetIgnoringAttr, "core_type"); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr, + "core_type"); } using_core_effs = false; using_core_types = false; } else { - KMP_AFF_WARNING(AffHWSubsetAttrsNonHybrid); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid); return false; } } // Check if using both core types and core efficiencies together if (using_core_types && using_core_effs) { - KMP_AFF_WARNING(AffHWSubsetIncompat, "core_type", "efficiency"); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type", + "efficiency"); return false; } @@ -1059,7 +1186,7 @@ bool kmp_topology_t::filter_hw_subset() { } // Check that the number of requested cores with attributes is valid - if (using_core_types || using_core_effs) { + if ((using_core_types || using_core_effs) && !is_absolute) { for (int j = 0; j < item.num_attrs; ++j) { int num = item.num[j]; int offset = item.offset[j]; @@ -1070,7 +1197,7 @@ bool kmp_topology_t::filter_hw_subset() { (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { kmp_str_buf_t buf; __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0); - KMP_AFF_WARNING(AffHWSubsetManyGeneric, buf.str); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str); __kmp_str_buf_free(&buf); return false; } @@ -1092,7 +1219,7 @@ bool kmp_topology_t::filter_hw_subset() { } kmp_str_buf_t buf; __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0); - KMP_AFF_WARNING(AffHWSubsetIncompat, + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, __kmp_hw_get_catalog_string(KMP_HW_CORE), 
buf.str); __kmp_str_buf_free(&buf); return false; @@ -1105,7 +1232,7 @@ bool kmp_topology_t::filter_hw_subset() { kmp_str_buf_t buf; __kmp_hw_get_catalog_core_string(item.attr[j], &buf, item.num[j] > 0); - KMP_AFF_WARNING(AffHWSubsetAttrRepeat, buf.str); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str); __kmp_str_buf_free(&buf); return false; } @@ -1115,43 +1242,92 @@ bool kmp_topology_t::filter_hw_subset() { } } - struct core_type_indexer { - int operator()(const kmp_hw_thread_t &t) const { - switch (t.attrs.get_core_type()) { -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 - case KMP_HW_CORE_TYPE_ATOM: - return 1; - case KMP_HW_CORE_TYPE_CORE: - return 2; -#endif - case KMP_HW_CORE_TYPE_UNKNOWN: - return 0; - } - KMP_ASSERT(0); - return 0; + // For keeping track of sub_ids for an absolute KMP_HW_SUBSET + // or core attributes (core type or efficiency) + int prev_sub_ids[KMP_HW_LAST]; + int abs_sub_ids[KMP_HW_LAST]; + int core_eff_sub_ids[KMP_HW_MAX_NUM_CORE_EFFS]; + int core_type_sub_ids[KMP_HW_MAX_NUM_CORE_TYPES]; + for (size_t i = 0; i < KMP_HW_LAST; ++i) { + abs_sub_ids[i] = -1; + prev_sub_ids[i] = -1; + } + for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_EFFS; ++i) + core_eff_sub_ids[i] = -1; + for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) + core_type_sub_ids[i] = -1; + + // Determine which hardware threads should be filtered. + + // Helpful to determine if a topology layer is targeted by an absolute subset + auto is_targeted = [&](int level) { + if (is_absolute) { + for (int i = 0; i < hw_subset_depth; ++i) + if (topology_levels[i] == level) + return true; + return false; } + // If not absolute KMP_HW_SUBSET, then every layer is seen as targeted + return true; }; - struct core_eff_indexer { - int operator()(const kmp_hw_thread_t &t) const { - return t.attrs.get_core_eff(); + + // Helpful to index into core type sub Ids array + auto get_core_type_index = [](const kmp_hw_thread_t &t) { + switch (t.attrs.get_core_type()) { + case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: + return 0; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return 1; + case KMP_HW_CORE_TYPE_CORE: + return 2; +#endif } + KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration"); + KMP_BUILTIN_UNREACHABLE; }; - kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids( - core_level); - kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids( - core_level); + // Helpful to index into core efficiencies sub Ids array + auto get_core_eff_index = [](const kmp_hw_thread_t &t) { + return t.attrs.get_core_eff(); + }; - // Determine which hardware threads should be filtered. 
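The templated kmp_sub_ids_t helper above is replaced by plain counter arrays indexed through small lambdas. The following standalone sketch illustrates that counting pattern (the core_type_t and hw_thread_t types here are illustrative, not the runtime's own): a per-type sub-ID is bumped whenever a new core of that type is encountered in the sorted thread list.

```cpp
// Standalone sketch of the sub-ID counting pattern: an index function maps
// each hardware thread to a slot, and a counter array tracks a running
// sub-ID per slot. Names are illustrative, not the runtime's own types.
#include <array>
#include <cstdio>
#include <vector>

enum class core_type_t { unknown, efficiency, performance };

struct hw_thread_t {
  int core_id;      // id of the physical core this thread sits on
  core_type_t type; // hybrid core type of that core
};

int main() {
  // Two "performance" cores (ids 0,1) and one "efficiency" core (id 2),
  // each with two hardware threads, already sorted by core id.
  std::vector<hw_thread_t> threads = {
      {0, core_type_t::performance}, {0, core_type_t::performance},
      {1, core_type_t::performance}, {1, core_type_t::performance},
      {2, core_type_t::efficiency},  {2, core_type_t::efficiency}};

  auto type_index = [](const hw_thread_t &t) { return static_cast<int>(t.type); };

  std::array<int, 3> type_sub_id{-1, -1, -1}; // running sub-ID per core type
  int prev_core = -1;
  for (const hw_thread_t &t : threads) {
    if (t.core_id != prev_core) {   // new core encountered
      ++type_sub_id[type_index(t)]; // bump the counter for its type
      prev_core = t.core_id;
    }
    std::printf("core %d -> %s sub-id %d\n", t.core_id,
                t.type == core_type_t::performance ? "perf" : "eff",
                type_sub_id[type_index(t)]);
  }
  // With a subset request such as "two cores of a given type", a thread is
  // kept only if its per-type sub-ID falls inside [offset, offset + num).
}
```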
int num_filtered = 0; - bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads); + kmp_affin_mask_t *filtered_mask; + KMP_CPU_ALLOC(filtered_mask); + KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; - // Update type_sub_id - if (using_core_types) - core_type_sub_ids.update(hw_thread); - if (using_core_effs) - core_eff_sub_ids.update(hw_thread); + + // Figure out the absolute sub ids and core eff/type sub ids + if (is_absolute || using_core_effs || using_core_types) { + for (int level = 0; level < get_depth(); ++level) { + if (hw_thread.sub_ids[level] != prev_sub_ids[level]) { + bool found_targeted = false; + for (int j = level; j < get_depth(); ++j) { + bool targeted = is_targeted(j); + if (!found_targeted && targeted) { + found_targeted = true; + abs_sub_ids[j]++; + if (j == core_level && using_core_effs) + core_eff_sub_ids[get_core_eff_index(hw_thread)]++; + if (j == core_level && using_core_types) + core_type_sub_ids[get_core_type_index(hw_thread)]++; + } else if (targeted) { + abs_sub_ids[j] = 0; + if (j == core_level && using_core_effs) + core_eff_sub_ids[get_core_eff_index(hw_thread)] = 0; + if (j == core_level && using_core_types) + core_type_sub_ids[get_core_type_index(hw_thread)] = 0; + } + } + break; + } + } + for (int level = 0; level < get_depth(); ++level) + prev_sub_ids[level] = hw_thread.sub_ids[level]; + } // Check to see if this hardware thread should be filtered bool should_be_filtered = false; @@ -1186,71 +1362,60 @@ bool kmp_topology_t::filter_hw_subset() { int num = hw_subset_item.num[attr_idx]; int offset = hw_subset_item.offset[attr_idx]; if (using_core_types) - sub_id = core_type_sub_ids.get_sub_id(hw_thread); + sub_id = core_type_sub_ids[get_core_type_index(hw_thread)]; else - sub_id = core_eff_sub_ids.get_sub_id(hw_thread); + sub_id = core_eff_sub_ids[get_core_eff_index(hw_thread)]; if (sub_id < offset || (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) { should_be_filtered = true; break; } } else { + int sub_id; int num = hw_subset_item.num[0]; int offset = hw_subset_item.offset[0]; - if (hw_thread.sub_ids[level] < offset || - (num != kmp_hw_subset_t::USE_ALL && - hw_thread.sub_ids[level] >= offset + num)) { + if (is_absolute) + sub_id = abs_sub_ids[level]; + else + sub_id = hw_thread.sub_ids[level]; + if (hw_thread.ids[level] == kmp_hw_thread_t::UNKNOWN_ID || + sub_id < offset || + (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) { should_be_filtered = true; break; } } } // Collect filtering information - filtered[i] = should_be_filtered; - if (should_be_filtered) + if (should_be_filtered) { + KMP_CPU_CLR(hw_thread.os_id, filtered_mask); num_filtered++; + } } // One last check that we shouldn't allow filtering entire machine if (num_filtered == num_hw_threads) { - KMP_AFF_WARNING(AffHWSubsetAllFiltered); - __kmp_free(filtered); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered); return false; } // Apply the filter - int new_index = 0; - for (int i = 0; i < num_hw_threads; ++i) { - if (!filtered[i]) { - if (i != new_index) - hw_threads[new_index] = hw_threads[i]; - new_index++; - } else { -#if KMP_AFFINITY_SUPPORTED - KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask); -#endif - __kmp_avail_proc--; - } - } - - KMP_DEBUG_ASSERT(new_index <= num_hw_threads); - num_hw_threads = new_index; - - // Post hardware subset canonicalization - _gather_enumeration_information(); - _discover_uniformity(); - _set_globals(); - 
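The new filtering path collects the surviving OS ids in an affinity mask and then calls restrict_to_mask(), which compacts the hardware-thread array in place. A minimal sketch of that compaction, with std::bitset standing in for kmp_affin_mask_t:

```cpp
// Sketch of the "restrict to mask" compaction: keep only the hardware threads
// whose OS id is still set in a mask, shifting survivors to the front.
#include <bitset>
#include <cstdio>
#include <vector>

struct hw_thread_t {
  int os_id;
};

int main() {
  std::vector<hw_thread_t> hw_threads = {{0}, {1}, {2}, {3}, {4}, {5}};
  std::bitset<64> allowed; // the filtered mask: keep 0,1,4,5 only
  for (int id : {0, 1, 4, 5})
    allowed.set(id);

  int new_index = 0;
  for (size_t i = 0; i < hw_threads.size(); ++i) {
    if (allowed.test(hw_threads[i].os_id)) {
      if ((int)i != new_index)
        hw_threads[new_index] = hw_threads[i]; // compact in place
      ++new_index;
    }
  }
  bool affected = (new_index != (int)hw_threads.size());
  hw_threads.resize(new_index);

  std::printf("affected=%d, %zu threads remain\n", (int)affected,
              hw_threads.size());
  // The runtime additionally re-derives counts/ratios afterwards and copies
  // the filtered mask back into its "original" mask where appropriate.
}
```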
_set_last_level_cache(); - __kmp_free(filtered); + restrict_to_mask(filtered_mask); return true; } -bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const { +bool kmp_topology_t::is_close(int hwt1, int hwt2, + const kmp_affinity_t &stgs) const { + int hw_level = stgs.gran_levels; if (hw_level >= depth) return true; bool retval = true; const kmp_hw_thread_t &t1 = hw_threads[hwt1]; const kmp_hw_thread_t &t2 = hw_threads[hwt2]; + if (stgs.flags.core_types_gran) + return t1.attrs.get_core_type() == t2.attrs.get_core_type(); + if (stgs.flags.core_effs_gran) + return t1.attrs.get_core_eff() == t2.attrs.get_core_eff(); for (int i = 0; i < (depth - hw_level); ++i) { if (t1.ids[i] != t2.ids[i]) return false; @@ -1260,30 +1425,6 @@ bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const { //////////////////////////////////////////////////////////////////////////////// -#if KMP_AFFINITY_SUPPORTED -class kmp_affinity_raii_t { - kmp_affin_mask_t *mask; - bool restored; - -public: - kmp_affinity_raii_t() : restored(false) { - KMP_CPU_ALLOC(mask); - KMP_ASSERT(mask != NULL); - __kmp_get_system_affinity(mask, TRUE); - } - void restore() { - __kmp_set_system_affinity(mask, TRUE); - KMP_CPU_FREE(mask); - restored = true; - } - ~kmp_affinity_raii_t() { - if (!restored) { - __kmp_set_system_affinity(mask, TRUE); - KMP_CPU_FREE(mask); - } - } -}; - bool KMPAffinity::picked_api = false; void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } @@ -1301,7 +1442,7 @@ void KMPAffinity::pick_api() { // Only use Hwloc if affinity isn't explicitly disabled and // user requests Hwloc topology method if (__kmp_affinity_top_method == affinity_top_method_hwloc && - __kmp_affinity_type != affinity_disabled) { + __kmp_affinity.type != affinity_disabled) { affinity_dispatch = new KMPHwlocAffinity(); } else #endif @@ -1448,15 +1589,13 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, return buf; } -// Return (possibly empty) affinity mask representing the offline CPUs -// Caller must free the mask -kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { - kmp_affin_mask_t *offline; - KMP_CPU_ALLOC(offline); - KMP_CPU_ZERO(offline); +static kmp_affin_mask_t *__kmp_parse_cpu_list(const char *path) { + kmp_affin_mask_t *mask; + KMP_CPU_ALLOC(mask); + KMP_CPU_ZERO(mask); #if KMP_OS_LINUX int n, begin_cpu, end_cpu; - kmp_safe_raii_file_t offline_file; + kmp_safe_raii_file_t file; auto skip_ws = [](FILE *f) { int c; do { @@ -1465,29 +1604,29 @@ kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { if (c != EOF) ungetc(c, f); }; - // File contains CSV of integer ranges representing the offline CPUs + // File contains CSV of integer ranges representing the CPUs // e.g., 1,2,4-7,9,11-15 - int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r"); + int status = file.try_open(path, "r"); if (status != 0) - return offline; - while (!feof(offline_file)) { - skip_ws(offline_file); - n = fscanf(offline_file, "%d", &begin_cpu); + return mask; + while (!feof(file)) { + skip_ws(file); + n = fscanf(file, "%d", &begin_cpu); if (n != 1) break; - skip_ws(offline_file); - int c = fgetc(offline_file); + skip_ws(file); + int c = fgetc(file); if (c == EOF || c == ',') { // Just single CPU end_cpu = begin_cpu; } else if (c == '-') { // Range of CPUs - skip_ws(offline_file); - n = fscanf(offline_file, "%d", &end_cpu); + skip_ws(file); + n = fscanf(file, "%d", &end_cpu); if (n != 1) break; - skip_ws(offline_file); - c = fgetc(offline_file); // skip ',' + skip_ws(file); + c 
= fgetc(file); // skip ',' } else { // Syntax problem break; @@ -1497,13 +1636,19 @@ kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { end_cpu >= __kmp_xproc || begin_cpu > end_cpu) { continue; } - // Insert [begin_cpu, end_cpu] into offline mask + // Insert [begin_cpu, end_cpu] into mask for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) { - KMP_CPU_SET(cpu, offline); + KMP_CPU_SET(cpu, mask); } } #endif - return offline; + return mask; +} + +// Return (possibly empty) affinity mask representing the offline CPUs +// Caller must free the mask +kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { + return __kmp_parse_cpu_list("/sys/devices/system/cpu/offline"); } // Return the number of available procs @@ -1592,6 +1737,7 @@ static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) { case HWLOC_OBJ_PU: return KMP_HW_THREAD; case HWLOC_OBJ_GROUP: +#if HWLOC_API_VERSION >= 0x00020000 if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE) return KMP_HW_DIE; else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE) @@ -1600,6 +1746,7 @@ static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) { return KMP_HW_MODULE; else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP) return KMP_HW_PROC_GROUP; +#endif return KMP_HW_UNKNOWN; #if HWLOC_API_VERSION >= 0x00020100 case HWLOC_OBJ_DIE: @@ -1663,14 +1810,14 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hwloc_topology_t tp = __kmp_hwloc_topology; *msg_id = kmp_i18n_null; - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); } if (!KMP_AFFINITY_CAPABLE()) { // Hack to try and infer the machine topology using only the data // available from hwloc on the current thread, and __kmp_xproc. 
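The generalized __kmp_parse_cpu_list() reads comma-separated CPU ranges such as "1,2,4-7,9,11-15" from a sysfs file. A self-contained sketch of the same parsing, operating on a string rather than a FILE*:

```cpp
// Sketch of CPU-list parsing for files such as /sys/devices/system/cpu/offline:
// a comma-separated list of single CPUs and closed ranges.
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

std::vector<int> parse_cpu_list(const std::string &list) {
  std::vector<int> cpus;
  std::stringstream ss(list);
  std::string item;
  while (std::getline(ss, item, ',')) {
    if (item.empty())
      continue;
    size_t dash = item.find('-');
    int begin = std::stoi(item.substr(0, dash));
    int end = (dash == std::string::npos) ? begin
                                          : std::stoi(item.substr(dash + 1));
    for (int cpu = begin; cpu <= end; ++cpu) // insert [begin, end]
      cpus.push_back(cpu);
  }
  return cpus;
}

int main() {
  for (int cpu : parse_cpu_list("1,2,4-7,9,11-15"))
    std::printf("%d ", cpu); // 1 2 4 5 6 7 9 11 12 13 14 15
  std::printf("\n");
}
```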
- KMP_ASSERT(__kmp_affinity_type == affinity_none); + KMP_ASSERT(__kmp_affinity.type == affinity_none); // hwloc only guarantees existance of PU object, so check PACKAGE and CORE hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); if (o != NULL) @@ -1682,6 +1829,8 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU); else __kmp_nThreadsPerCore = 1; // no CORE found + if (__kmp_nThreadsPerCore == 0) + __kmp_nThreadsPerCore = 1; __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; if (nCoresPerPkg == 0) nCoresPerPkg = 1; // to prevent possible division by 0 @@ -1689,6 +1838,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { return true; } +#if HWLOC_API_VERSION >= 0x00020400 // Handle multiple types of cores if they exist on the system int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0); @@ -1727,19 +1877,14 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { } } } +#endif root = hwloc_get_root_obj(tp); // Figure out the depth and types in the topology depth = 0; - pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin()); - KMP_ASSERT(pu); - obj = pu; - types[depth] = KMP_HW_THREAD; - hwloc_types[depth] = obj->type; - depth++; - while (obj != root && obj != NULL) { - obj = obj->parent; + obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin()); + while (obj && obj != root) { #if HWLOC_API_VERSION >= 0x00020000 if (obj->memory_arity) { hwloc_obj_t memory; @@ -1761,6 +1906,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hwloc_types[depth] = obj->type; depth++; } + obj = obj->parent; } KMP_ASSERT(depth > 0); @@ -1787,7 +1933,9 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hw_thread.clear(); hw_thread.ids[index] = pu->logical_index; hw_thread.os_id = pu->os_index; + hw_thread.original_idx = hw_thread_index; // If multiple core types, then set that attribute for the hardware thread +#if HWLOC_API_VERSION >= 0x00020400 if (cpukinds) { int cpukind_index = -1; for (int i = 0; i < nr_cpu_kinds; ++i) { @@ -1801,6 +1949,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency); } } +#endif index--; } obj = pu; @@ -1825,7 +1974,6 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hw_thread.ids[index + 1] = sub_id; index--; } - prev = memory; } prev = obj; } @@ -1845,12 +1993,14 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { hw_thread_index++; } +#if HWLOC_API_VERSION >= 0x00020400 // Free the core types information if (cpukinds) { for (int idx = 0; idx < nr_cpu_kinds; ++idx) hwloc_bitmap_free(cpukinds[idx].mask); __kmp_free(cpukinds); } +#endif __kmp_topology->sort_ids(); return true; } @@ -1864,15 +2014,15 @@ static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) { int depth = 3; kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD}; - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { KMP_INFORM(UsingFlatOS, "KMP_AFFINITY"); } - // Even if __kmp_affinity_type == affinity_none, this routine might still - // called to set __kmp_ncores, as well as + // Even if __kmp_affinity.type == affinity_none, this routine might still + // be called to set __kmp_ncores, as well as // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 
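When the runtime is not affinity-capable, the hwloc path falls back to deriving the counts arithmetically from __kmp_xproc, now also guarding against a zero threads-per-core value. A small illustration of that arithmetic with made-up numbers:

```cpp
// Sketch of the fallback arithmetic: derive package/core/thread counts from
// the total logical CPU count and whatever per-object counts were reported,
// guarding against zero before dividing. Numbers below are illustrative.
#include <cstdio>

int main() {
  int xproc = 16;           // total logical CPUs visible to the process
  int cores_per_pkg = 8;    // e.g. cores found under the first package
  int threads_per_core = 0; // e.g. no PU object was found under the core

  if (threads_per_core == 0)
    threads_per_core = 1; // never divide by zero
  if (cores_per_pkg == 0)
    cores_per_pkg = 1;

  int ncores = xproc / threads_per_core;
  int npackages = (xproc + cores_per_pkg - 1) / cores_per_pkg; // round up

  std::printf("packages=%d cores=%d threads/core=%d\n", npackages, ncores,
              threads_per_core);
}
```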
if (!KMP_AFFINITY_CAPABLE()) { - KMP_ASSERT(__kmp_affinity_type == affinity_none); + KMP_ASSERT(__kmp_affinity.type == affinity_none); __kmp_ncores = nPackages = __kmp_xproc; __kmp_nThreadsPerCore = nCoresPerPkg = 1; return true; @@ -1897,12 +2047,13 @@ static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) { kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct); hw_thread.clear(); hw_thread.os_id = i; + hw_thread.original_idx = avail_ct; hw_thread.ids[0] = i; hw_thread.ids[1] = 0; hw_thread.ids[2] = 0; avail_ct++; } - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); } return true; @@ -1919,13 +2070,13 @@ static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) { kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD}; const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR); - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); } // If we aren't affinity capable, then use flat topology if (!KMP_AFFINITY_CAPABLE()) { - KMP_ASSERT(__kmp_affinity_type == affinity_none); + KMP_ASSERT(__kmp_affinity.type == affinity_none); nPackages = __kmp_num_proc_groups; __kmp_nThreadsPerCore = 1; __kmp_ncores = __kmp_xproc; @@ -1942,11 +2093,13 @@ static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) { if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { continue; } - kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++); + kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct); hw_thread.clear(); hw_thread.os_id = i; + hw_thread.original_idx = avail_ct; hw_thread.ids[0] = i / BITS_PER_GROUP; hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP; + avail_ct++; } return true; } @@ -2002,15 +2155,43 @@ static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, return 0; } -class kmp_cache_info_t { +class cpuid_cache_info_t { public: struct info_t { - unsigned level, mask; + unsigned level = 0; + unsigned mask = 0; + bool operator==(const info_t &rhs) const { + return level == rhs.level && mask == rhs.mask; + } + bool operator!=(const info_t &rhs) const { return !operator==(rhs); } }; - kmp_cache_info_t() : depth(0) { get_leaf4_levels(); } + cpuid_cache_info_t() : depth(0) { + table[MAX_CACHE_LEVEL].level = 0; + table[MAX_CACHE_LEVEL].mask = 0; + } size_t get_depth() const { return depth; } info_t &operator[](size_t index) { return table[index]; } const info_t &operator[](size_t index) const { return table[index]; } + bool operator==(const cpuid_cache_info_t &rhs) const { + if (rhs.depth != depth) + return false; + for (size_t i = 0; i < depth; ++i) + if (table[i] != rhs.table[i]) + return false; + return true; + } + bool operator!=(const cpuid_cache_info_t &rhs) const { + return !operator==(rhs); + } + // Get cache information assocaited with L1, L2, L3 cache, etc. 
+ // If level does not exist, then return the "NULL" level (level 0) + const info_t &get_level(unsigned level) const { + for (size_t i = 0; i < depth; ++i) { + if (table[i].level == level) + return table[i]; + } + return table[MAX_CACHE_LEVEL]; + } static kmp_hw_t get_topology_type(unsigned level) { KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL); @@ -2024,13 +2205,6 @@ public: } return KMP_HW_UNKNOWN; } - -private: - static const int MAX_CACHE_LEVEL = 3; - - size_t depth; - info_t table[MAX_CACHE_LEVEL]; - void get_leaf4_levels() { unsigned level = 0; while (depth < MAX_CACHE_LEVEL) { @@ -2055,6 +2229,11 @@ private: level++; } } + static const int MAX_CACHE_LEVEL = 3; + +private: + size_t depth; + info_t table[MAX_CACHE_LEVEL + 1]; }; // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use @@ -2065,7 +2244,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { kmp_cpuid buf; *msg_id = kmp_i18n_null; - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); } @@ -2084,7 +2263,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { if (!KMP_AFFINITY_CAPABLE()) { // Hack to try and infer the machine topology using only the data // available from cpuid on the current thread, and __kmp_xproc. - KMP_ASSERT(__kmp_affinity_type == affinity_none); + KMP_ASSERT(__kmp_affinity.type == affinity_none); // Get an upper bound on the number of threads per package using cpuid(1). // On some OS/chps combinations where HT is supported by the chip but is @@ -2136,7 +2315,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { // From here on, we can assume that it is safe to call // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if - // __kmp_affinity_type = affinity_none. + // __kmp_affinity.type = affinity_none. // Save the affinity mask for the current thread. kmp_affinity_raii_t previous_affinity; @@ -2362,6 +2541,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { hw_thread.ids[idx++] = threadInfo[i].threadId; } hw_thread.os_id = os; + hw_thread.original_idx = i; } __kmp_free(threadInfo); @@ -2417,15 +2597,13 @@ enum { INTEL_LEVEL_TYPE_INVALID = 0, // Package level INTEL_LEVEL_TYPE_SMT = 1, INTEL_LEVEL_TYPE_CORE = 2, - INTEL_LEVEL_TYPE_TILE = 3, - INTEL_LEVEL_TYPE_MODULE = 4, + INTEL_LEVEL_TYPE_MODULE = 3, + INTEL_LEVEL_TYPE_TILE = 4, INTEL_LEVEL_TYPE_DIE = 5, INTEL_LEVEL_TYPE_LAST = 6, }; - -struct cpuid_level_info_t { - unsigned level_type, mask, mask_width, nitems, cache_mask; -}; +KMP_BUILD_ASSERT(INTEL_LEVEL_TYPE_LAST < sizeof(unsigned) * CHAR_BIT); +#define KMP_LEAF_1F_KNOWN_LEVELS ((1u << INTEL_LEVEL_TYPE_LAST) - 1u) static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { switch (intel_type) { @@ -2445,16 +2623,78 @@ static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { return KMP_HW_UNKNOWN; } -// This function takes the topology leaf, a levels array to store the levels -// detected and a bitmap of the known levels. 
-// Returns the number of levels in the topology -static unsigned -__kmp_x2apicid_get_levels(int leaf, - cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], - kmp_uint64 known_levels) { +static int __kmp_topology_type_2_intel_type(kmp_hw_t type) { + switch (type) { + case KMP_HW_SOCKET: + return INTEL_LEVEL_TYPE_INVALID; + case KMP_HW_THREAD: + return INTEL_LEVEL_TYPE_SMT; + case KMP_HW_CORE: + return INTEL_LEVEL_TYPE_CORE; + case KMP_HW_TILE: + return INTEL_LEVEL_TYPE_TILE; + case KMP_HW_MODULE: + return INTEL_LEVEL_TYPE_MODULE; + case KMP_HW_DIE: + return INTEL_LEVEL_TYPE_DIE; + default: + return INTEL_LEVEL_TYPE_INVALID; + } +} + +struct cpuid_level_info_t { + unsigned level_type, mask, mask_width, nitems, cache_mask; +}; + +class cpuid_topo_desc_t { + unsigned desc = 0; + +public: + void clear() { desc = 0; } + bool contains(int intel_type) const { + KMP_DEBUG_ASSERT(intel_type >= 0 && intel_type < INTEL_LEVEL_TYPE_LAST); + if ((1u << intel_type) & desc) + return true; + return false; + } + bool contains_topology_type(kmp_hw_t type) const { + KMP_DEBUG_ASSERT(type >= 0 && type < KMP_HW_LAST); + int intel_type = __kmp_topology_type_2_intel_type(type); + return contains(intel_type); + } + bool contains(cpuid_topo_desc_t rhs) const { + return ((desc | rhs.desc) == desc); + } + void add(int intel_type) { desc |= (1u << intel_type); } + void add(cpuid_topo_desc_t rhs) { desc |= rhs.desc; } +}; + +struct cpuid_proc_info_t { + // Topology info + int os_id; + unsigned apic_id; + unsigned depth; + // Hybrid info + unsigned native_model_id; + int efficiency; + kmp_hw_core_type_t type; + cpuid_topo_desc_t description; + + cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; +}; + +// This function takes the topology leaf, an info pointer to store the levels +// detected, and writable descriptors for the total topology. +// Returns whether total types, depth, or description were modified. +static bool __kmp_x2apicid_get_levels(int leaf, cpuid_proc_info_t *info, + kmp_hw_t total_types[KMP_HW_LAST], + int *total_depth, + cpuid_topo_desc_t *total_description) { unsigned level, levels_index; unsigned level_type, mask_width, nitems; kmp_cpuid buf; + cpuid_level_info_t(&levels)[INTEL_LEVEL_TYPE_LAST] = info->levels; + bool retval = false; // New algorithm has known topology layers act as highest unknown topology // layers when unknown topology layers exist. 
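The new cpuid_topo_desc_t encodes which CPUID level types a processor reported as one bit per type, so membership and subset checks reduce to single integer operations. A standalone sketch of that descriptor pattern (the level numbering mirrors the INTEL_LEVEL_TYPE_* values shown above):

```cpp
// Sketch of the bitmask "topology descriptor" pattern: each known level type
// occupies one bit of an unsigned integer.
#include <cstdio>

enum level_type { LVL_INVALID = 0, LVL_SMT = 1, LVL_CORE = 2,
                  LVL_MODULE = 3, LVL_TILE = 4, LVL_DIE = 5, LVL_LAST = 6 };

struct topo_desc {
  unsigned desc = 0;
  void add(int t) { desc |= (1u << t); }
  bool contains(int t) const { return (desc & (1u << t)) != 0; }
  // true if every level in rhs is also present in *this
  bool contains(const topo_desc &rhs) const { return (desc | rhs.desc) == desc; }
};

int main() {
  const unsigned known_levels = (1u << LVL_LAST) - 1u; // all defined levels

  topo_desc total, proc;
  total.add(LVL_SMT);
  total.add(LVL_CORE);
  proc.add(LVL_SMT);
  proc.add(LVL_CORE);
  proc.add(LVL_DIE); // this processor reports an extra DIE level

  std::printf("known mask = 0x%x\n", known_levels);
  std::printf("total contains proc? %d\n", (int)total.contains(proc)); // 0
  // When a processor reports a level the total description lacks, the new
  // level is spliced into the total type list and the descriptor is extended.
}
```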
@@ -2469,10 +2709,12 @@ __kmp_x2apicid_get_levels(int leaf, level_type = __kmp_extract_bits<8, 15>(buf.ecx); mask_width = __kmp_extract_bits<0, 4>(buf.eax); nitems = __kmp_extract_bits<0, 15>(buf.ebx); - if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) - return 0; + if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) { + info->depth = 0; + return retval; + } - if (known_levels & (1ull << level_type)) { + if (KMP_LEAF_1F_KNOWN_LEVELS & (1u << level_type)) { // Add a new level to the topology KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); levels[levels_index].level_type = level_type; @@ -2488,6 +2730,26 @@ __kmp_x2apicid_get_levels(int leaf, } level++; } while (level_type != INTEL_LEVEL_TYPE_INVALID); + KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); + info->description.clear(); + info->depth = levels_index; + + // If types, depth, and total_description are uninitialized, + // then initialize them now + if (*total_depth == 0) { + *total_depth = info->depth; + total_description->clear(); + for (int i = *total_depth - 1, j = 0; i >= 0; --i, ++j) { + total_types[j] = + __kmp_intel_type_2_topology_type(info->levels[i].level_type); + total_description->add(info->levels[i].level_type); + } + retval = true; + } + + // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first + if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID) + return 0; // Set the masks to & with apicid for (unsigned i = 0; i < levels_index; ++i) { @@ -2497,42 +2759,65 @@ __kmp_x2apicid_get_levels(int leaf, for (unsigned j = 0; j < i; ++j) levels[i].mask ^= levels[j].mask; } else { - KMP_DEBUG_ASSERT(levels_index > 0); + KMP_DEBUG_ASSERT(i > 0); levels[i].mask = (-1) << levels[i - 1].mask_width; levels[i].cache_mask = 0; } + info->description.add(info->levels[i].level_type); } - return levels_index; + + // If this processor has level type not on other processors, then make + // sure to include it in total types, depth, and description. + // One assumption here is that the first type, i.e. socket, is known. + // Another assumption is that types array is always large enough to fit any + // new layers since its length is KMP_HW_LAST. + if (!total_description->contains(info->description)) { + for (int i = info->depth - 1, j = 0; i >= 0; --i, ++j) { + // If this level is known already, then skip it. + if (total_description->contains(levels[i].level_type)) + continue; + // Unknown level, insert before last known level + kmp_hw_t curr_type = + __kmp_intel_type_2_topology_type(levels[i].level_type); + KMP_ASSERT(j != 0 && "Bad APIC Id information"); + // Move over all known levels to make room for new level + for (int k = info->depth - 1; k >= j; --k) { + KMP_DEBUG_ASSERT(k + 1 < KMP_HW_LAST); + total_types[k + 1] = total_types[k]; + } + // Insert new level + total_types[j] = curr_type; + (*total_depth)++; + } + total_description->add(info->description); + retval = true; + } + return retval; } static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { - cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; - unsigned levels_index; kmp_cpuid buf; - kmp_uint64 known_levels; - int topology_leaf, highest_leaf, apic_id; + int topology_leaf, highest_leaf; int num_leaves; + int depth = 0; + cpuid_topo_desc_t total_description; static int leaves[] = {0, 0}; - kmp_i18n_id_t leaf_message_id; + // If affinity is disabled, __kmp_avail_proc may be zero + int ninfos = (__kmp_avail_proc > 0 ? 
__kmp_avail_proc : 1); + cpuid_proc_info_t *proc_info = (cpuid_proc_info_t *)__kmp_allocate( + (sizeof(cpuid_proc_info_t) + sizeof(cpuid_cache_info_t)) * ninfos); + cpuid_cache_info_t *cache_info = (cpuid_cache_info_t *)(proc_info + ninfos); - KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); + kmp_i18n_id_t leaf_message_id; *msg_id = kmp_i18n_null; - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); } - // Figure out the known topology levels - known_levels = 0ull; - for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { - if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { - known_levels |= (1ull << i); - } - } - // Get the highest cpuid leaf supported __kmp_x86_cpuid(0, 0, &buf); highest_leaf = buf.eax; @@ -2566,16 +2851,18 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { if (buf.ebx == 0) continue; topology_leaf = leaf; - levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); - if (levels_index == 0) + __kmp_x2apicid_get_levels(leaf, &proc_info[0], types, &depth, + &total_description); + if (depth == 0) continue; break; } - if (topology_leaf == -1 || levels_index == 0) { + if (topology_leaf == -1 || depth == 0) { *msg_id = leaf_message_id; + __kmp_free(proc_info); return false; } - KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); + KMP_ASSERT(depth <= INTEL_LEVEL_TYPE_LAST); // The algorithm used starts by setting the affinity to each available thread // and retrieving info from the cpuid instruction, so if we are not capable of @@ -2585,46 +2872,23 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { if (!KMP_AFFINITY_CAPABLE()) { // Hack to try and infer the machine topology using only the data // available from cpuid on the current thread, and __kmp_xproc. - KMP_ASSERT(__kmp_affinity_type == affinity_none); - for (unsigned i = 0; i < levels_index; ++i) { - if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { - __kmp_nThreadsPerCore = levels[i].nitems; - } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { - nCoresPerPkg = levels[i].nitems; + KMP_ASSERT(__kmp_affinity.type == affinity_none); + for (int i = 0; i < depth; ++i) { + if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { + __kmp_nThreadsPerCore = proc_info[0].levels[i].nitems; + } else if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { + nCoresPerPkg = proc_info[0].levels[i].nitems; } } __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; + __kmp_free(proc_info); return true; } - // Allocate the data structure to be returned. 
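__kmp_x2apicid_get_levels() derives each level's id by masking the x2APIC id with that level's bits and shifting by the cumulative width of the levels below it. The following sketch shows the decomposition with invented mask widths:

```cpp
// Sketch of decomposing an x2APIC id using cumulative mask widths reported by
// CPUID leaf 0xB/0x1F. The widths here are made up for illustration.
#include <cstdio>

int main() {
  unsigned apic_id = 0x5B; // example APIC id
  unsigned smt_width = 1;  // bit  [0]    -> thread within core
  unsigned core_width = 4; // bits [0..3] -> core within package (cumulative)

  unsigned smt_mask = (1u << smt_width) - 1u;
  unsigned core_mask = ((1u << core_width) - 1u) ^ smt_mask; // isolate core bits
  unsigned pkg_mask = ~((1u << core_width) - 1u);            // everything above

  unsigned smt_id = apic_id & smt_mask;
  unsigned core_id = (apic_id & core_mask) >> smt_width;
  unsigned pkg_id = (apic_id & pkg_mask) >> core_width;

  std::printf("apic=0x%x -> pkg=%u core=%u smt=%u\n", apic_id, pkg_id, core_id,
              smt_id); // apic=0x5b -> pkg=5 core=5 smt=1
}
```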
- int depth = levels_index; - for (int i = depth - 1, j = 0; i >= 0; --i, ++j) - types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); - __kmp_topology = - kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); - - // Insert equivalent cache types if they exist - kmp_cache_info_t cache_info; - for (size_t i = 0; i < cache_info.get_depth(); ++i) { - const kmp_cache_info_t::info_t &info = cache_info[i]; - unsigned cache_mask = info.mask; - unsigned cache_level = info.level; - for (unsigned j = 0; j < levels_index; ++j) { - unsigned hw_cache_mask = levels[j].cache_mask; - kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); - if (hw_cache_mask == cache_mask && j < levels_index - 1) { - kmp_hw_t type = - __kmp_intel_type_2_topology_type(levels[j + 1].level_type); - __kmp_topology->set_equivalent_type(cache_type, type); - } - } - } - // From here on, we can assume that it is safe to call // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if - // __kmp_affinity_type = affinity_none. + // __kmp_affinity.type = affinity_none. // Save the affinity mask for the current thread. kmp_affinity_raii_t previous_affinity; @@ -2633,56 +2897,167 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { // to it, and obtaining the pertinent information using the cpuid instr. unsigned int proc; int hw_thread_index = 0; - KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { - cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; - unsigned my_levels_index; + bool uniform_caches = true; + KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { // Skip this proc if it is not included in the machine model. if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { continue; } KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); + // Gather topology information __kmp_affinity_dispatch->bind_thread(proc); - - // New algorithm __kmp_x86_cpuid(topology_leaf, 0, &buf); - apic_id = buf.edx; - kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); - my_levels_index = - __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); - if (my_levels_index == 0 || my_levels_index != levels_index) { + proc_info[hw_thread_index].os_id = proc; + proc_info[hw_thread_index].apic_id = buf.edx; + __kmp_x2apicid_get_levels(topology_leaf, &proc_info[hw_thread_index], types, + &depth, &total_description); + if (proc_info[hw_thread_index].depth == 0) { *msg_id = kmp_i18n_str_InvalidCpuidInfo; + __kmp_free(proc_info); return false; } - hw_thread.clear(); - hw_thread.os_id = proc; - // Put in topology information - for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { - hw_thread.ids[idx] = apic_id & my_levels[j].mask; - if (j > 0) { - hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; - } - } + // Gather cache information and insert afterwards + cache_info[hw_thread_index].get_leaf4_levels(); + if (uniform_caches && hw_thread_index > 0) + if (cache_info[0] != cache_info[hw_thread_index]) + uniform_caches = false; // Hybrid information if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { - kmp_hw_core_type_t type; - unsigned native_model_id; - int efficiency; - __kmp_get_hybrid_info(&type, &efficiency, &native_model_id); - hw_thread.attrs.set_core_type(type); - hw_thread.attrs.set_core_eff(efficiency); + __kmp_get_hybrid_info(&proc_info[hw_thread_index].type, + &proc_info[hw_thread_index].efficiency, + &proc_info[hw_thread_index].native_model_id); } hw_thread_index++; } KMP_ASSERT(hw_thread_index > 0); + previous_affinity.restore(); + + // 
Allocate the data structure to be returned. + __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); + + // Create topology Ids and hybrid types in __kmp_topology + for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) { + kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + hw_thread.clear(); + hw_thread.os_id = proc_info[i].os_id; + hw_thread.original_idx = i; + unsigned apic_id = proc_info[i].apic_id; + // Put in topology information + for (int j = 0, idx = depth - 1; j < depth; ++j, --idx) { + if (!(proc_info[i].description.contains_topology_type( + __kmp_topology->get_type(j)))) { + hw_thread.ids[idx] = kmp_hw_thread_t::UNKNOWN_ID; + } else { + hw_thread.ids[idx] = apic_id & proc_info[i].levels[j].mask; + if (j > 0) { + hw_thread.ids[idx] >>= proc_info[i].levels[j - 1].mask_width; + } + } + } + hw_thread.attrs.set_core_type(proc_info[i].type); + hw_thread.attrs.set_core_eff(proc_info[i].efficiency); + } + __kmp_topology->sort_ids(); + + // Change Ids to logical Ids + for (int j = 0; j < depth - 1; ++j) { + int new_id = 0; + int prev_id = __kmp_topology->at(0).ids[j]; + int curr_id = __kmp_topology->at(0).ids[j + 1]; + __kmp_topology->at(0).ids[j + 1] = new_id; + for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) { + kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + if (hw_thread.ids[j] == prev_id && hw_thread.ids[j + 1] == curr_id) { + hw_thread.ids[j + 1] = new_id; + } else if (hw_thread.ids[j] == prev_id && + hw_thread.ids[j + 1] != curr_id) { + curr_id = hw_thread.ids[j + 1]; + hw_thread.ids[j + 1] = ++new_id; + } else { + prev_id = hw_thread.ids[j]; + curr_id = hw_thread.ids[j + 1]; + hw_thread.ids[j + 1] = ++new_id; + } + } + } + + // First check for easy cache placement. This occurs when caches are + // equivalent to a layer in the CPUID leaf 0xb or 0x1f topology. + if (uniform_caches) { + for (size_t i = 0; i < cache_info[0].get_depth(); ++i) { + unsigned cache_mask = cache_info[0][i].mask; + unsigned cache_level = cache_info[0][i].level; + KMP_ASSERT(cache_level <= cpuid_cache_info_t::MAX_CACHE_LEVEL); + kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(cache_level); + __kmp_topology->set_equivalent_type(cache_type, cache_type); + for (int j = 0; j < depth; ++j) { + unsigned hw_cache_mask = proc_info[0].levels[j].cache_mask; + if (hw_cache_mask == cache_mask && j < depth - 1) { + kmp_hw_t type = __kmp_intel_type_2_topology_type( + proc_info[0].levels[j + 1].level_type); + __kmp_topology->set_equivalent_type(cache_type, type); + } + } + } + } else { + // If caches are non-uniform, then record which caches exist. + for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) { + for (size_t j = 0; j < cache_info[i].get_depth(); ++j) { + unsigned cache_level = cache_info[i][j].level; + kmp_hw_t cache_type = + cpuid_cache_info_t::get_topology_type(cache_level); + if (__kmp_topology->get_equivalent_type(cache_type) == KMP_HW_UNKNOWN) + __kmp_topology->set_equivalent_type(cache_type, cache_type); + } + } + } + + // See if any cache level needs to be added manually through cache Ids + bool unresolved_cache_levels = false; + for (unsigned level = 1; level <= cpuid_cache_info_t::MAX_CACHE_LEVEL; + ++level) { + kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(level); + // This also filters out caches which may not be in the topology + // since the equivalent type might be KMP_HW_UNKNOWN. 
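After sorting, the raw hardware ids are renumbered into logical ids: the child counter advances whenever either the child id or its parent id changes. A minimal sketch of that pass over (parent, child) id pairs:

```cpp
// Sketch of the "change Ids to logical Ids" pass: plain pairs stand in for
// two adjacent levels of a kmp_hw_thread_t ids array, already sorted.
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // (parent id, raw child id), sorted by parent then child.
  std::vector<std::pair<int, int>> ids = {
      {0, 10}, {0, 10}, {0, 14}, {1, 7}, {1, 7}, {1, 9}};

  int new_id = 0;
  int prev_parent = ids[0].first;
  int prev_child = ids[0].second;
  ids[0].second = new_id;
  for (size_t i = 1; i < ids.size(); ++i) {
    if (ids[i].first == prev_parent && ids[i].second == prev_child) {
      ids[i].second = new_id;     // same child as before
    } else if (ids[i].first == prev_parent) {
      prev_child = ids[i].second; // new child under the same parent
      ids[i].second = ++new_id;
    } else {
      prev_parent = ids[i].first; // new parent: keep counting upward
      prev_child = ids[i].second;
      ids[i].second = ++new_id;
    }
  }
  for (auto &p : ids)
    std::printf("(%d,%d) ", p.first, p.second);
  std::printf("\n"); // (0,0) (0,0) (0,1) (1,2) (1,2) (1,3)
}
```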
+ if (__kmp_topology->get_equivalent_type(cache_type) == cache_type) { + unresolved_cache_levels = true; + break; + } + } + + // Insert unresolved cache layers into machine topology using cache Ids + if (unresolved_cache_levels) { + int num_hw_threads = __kmp_topology->get_num_hw_threads(); + int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads); + for (unsigned l = 1; l <= cpuid_cache_info_t::MAX_CACHE_LEVEL; ++l) { + kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(l); + if (__kmp_topology->get_equivalent_type(cache_type) != cache_type) + continue; + for (int i = 0; i < num_hw_threads; ++i) { + int original_idx = __kmp_topology->at(i).original_idx; + ids[i] = kmp_hw_thread_t::UNKNOWN_ID; + const cpuid_cache_info_t::info_t &info = + cache_info[original_idx].get_level(l); + // if cache level not in topology for this processor, then skip + if (info.level == 0) + continue; + ids[i] = info.mask & proc_info[original_idx].apic_id; + } + __kmp_topology->insert_layer(cache_type, ids); + } + } + if (!__kmp_topology->check_ids()) { kmp_topology_t::deallocate(__kmp_topology); __kmp_topology = nullptr; *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; + __kmp_free(proc_info); return false; } + __kmp_free(proc_info); return true; } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ @@ -2716,14 +3091,16 @@ static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, // Set the array sizes for the hierarchy layers static void __kmp_dispatch_set_hierarchy_values() { // Set the maximum number of L1's to number of cores - // Set the maximum number of L2's to to either number of cores / 2 for + // Set the maximum number of L2's to either number of cores / 2 for // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing // Or the number of cores for Intel(R) Xeon(R) processors // Set the maximum number of NUMA nodes and L3's to number of packages __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; -#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ +#if KMP_ARCH_X86_64 && \ + (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \ + KMP_OS_WINDOWS) && \ KMP_MIC_SUPPORTED if (__kmp_mic_type >= mic3) __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; @@ -2738,7 +3115,9 @@ static void __kmp_dispatch_set_hierarchy_values() { __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_nThreadsPerCore; -#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ +#if KMP_ARCH_X86_64 && \ + (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \ + KMP_OS_WINDOWS) && \ KMP_MIC_SUPPORTED if (__kmp_mic_type >= mic3) __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = @@ -2800,15 +3179,51 @@ static inline const char *__kmp_cpuinfo_get_envvar() { return envvar; } +static bool __kmp_package_id_from_core_siblings_list(unsigned **threadInfo, + unsigned num_avail, + unsigned idx) { + if (!KMP_AFFINITY_CAPABLE()) + return false; + + char path[256]; + KMP_SNPRINTF(path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/core_siblings_list", + threadInfo[idx][osIdIndex]); + kmp_affin_mask_t *siblings = __kmp_parse_cpu_list(path); + for (unsigned i = 0; i < num_avail; ++i) { + unsigned cpu_id = threadInfo[i][osIdIndex]; + KMP_ASSERT(cpu_id < __kmp_affin_mask_size * CHAR_BIT); + if 
(!KMP_CPU_ISSET(cpu_id, siblings)) + continue; + if (threadInfo[i][pkgIdIndex] == UINT_MAX) { + // Arbitrarily pick the first index we encounter, it only matters that + // the value is the same for all siblings. + threadInfo[i][pkgIdIndex] = idx; + } else if (threadInfo[i][pkgIdIndex] != idx) { + // Contradictory sibling lists. + KMP_CPU_FREE(siblings); + return false; + } + } + KMP_ASSERT(threadInfo[idx][pkgIdIndex] != UINT_MAX); + KMP_CPU_FREE(siblings); + return true; +} + // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the -// affinity map. +// affinity map. On AIX, the map is obtained through system SRAD (Scheduler +// Resource Allocation Domain). static bool __kmp_affinity_create_cpuinfo_map(int *line, kmp_i18n_id_t *const msg_id) { + *msg_id = kmp_i18n_null; + +#if KMP_OS_AIX + unsigned num_records = __kmp_xproc; +#else const char *filename = __kmp_cpuinfo_get_filename(); const char *envvar = __kmp_cpuinfo_get_envvar(); - *msg_id = kmp_i18n_null; - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); } @@ -2865,6 +3280,7 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, *msg_id = kmp_i18n_str_CantRewindCpuinfo; return false; } +#endif // KMP_OS_AIX // Allocate the array of records to store the proc info in. The dummy // element at the end makes the logic in filling them out easier to code. @@ -2894,8 +3310,96 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, INIT_PROC_INFO(threadInfo[i]); } +#if KMP_OS_AIX + int smt_threads; + lpar_info_format1_t cpuinfo; + unsigned num_avail = __kmp_xproc; + + if (__kmp_affinity.flags.verbose) + KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology"); + + // Get the number of SMT threads per core. + smt_threads = syssmt(GET_NUMBER_SMT_SETS, 0, 0, NULL); + + // Allocate a resource set containing available system resourses. + rsethandle_t sys_rset = rs_alloc(RS_SYSTEM); + if (sys_rset == NULL) { + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + // Allocate a resource set for the SRAD info. + rsethandle_t srad = rs_alloc(RS_EMPTY); + if (srad == NULL) { + rs_free(sys_rset); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + // Get the SRAD system detail level. + int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0); + if (sradsdl < 0) { + rs_free(sys_rset); + rs_free(srad); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + // Get the number of RADs at that SRAD SDL. + int num_rads = rs_numrads(sys_rset, sradsdl, 0); + if (num_rads < 0) { + rs_free(sys_rset); + rs_free(srad); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + // Get the maximum number of procs that may be contained in a resource set. + int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0); + if (max_procs < 0) { + rs_free(sys_rset); + rs_free(srad); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + int cur_rad = 0; + int num_set = 0; + for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS; + ++srad_idx) { + // Check if the SRAD is available in the RSET. + if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0) + continue; + + for (int cpu = 0; cpu < max_procs; cpu++) { + // Set the info for the cpu if it is in the SRAD. 
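// Rough standalone approximation (an assumption, not the runtime's code) of the
// parsing that __kmp_parse_cpu_list() performs for the sysfs file read by
// __kmp_package_id_from_core_siblings_list above, e.g. a core_siblings_list
// whose contents look like "0-3,8-11\n".
#include <cstdio>
#include <set>
#include <sstream>
#include <string>

static std::set<int> parse_cpu_list(const std::string &text) {
  std::set<int> cpus;
  std::stringstream ss(text);
  std::string item;
  while (std::getline(ss, item, ',')) {
    int lo, hi;
    if (std::sscanf(item.c_str(), "%d-%d", &lo, &hi) == 2) {
      for (int c = lo; c <= hi; ++c) // a "lo-hi" range entry
        cpus.insert(c);
    } else if (std::sscanf(item.c_str(), "%d", &lo) == 1) {
      cpus.insert(lo); // a single-cpu entry
    }
  }
  return cpus;
}

int main() {
  for (int cpu : parse_cpu_list("0-3,8-11\n"))
    std::printf("%d ", cpu); // prints: 0 1 2 3 8 9 10 11
  std::printf("\n");
  return 0;
}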
+ if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) { + threadInfo[cpu][osIdIndex] = cpu; + threadInfo[cpu][pkgIdIndex] = cur_rad; + threadInfo[cpu][coreIdIndex] = cpu / smt_threads; + ++num_set; + if (num_set >= num_avail) { + // Done if all available CPUs have been set. + break; + } + } + } + ++cur_rad; + } + rs_free(sys_rset); + rs_free(srad); + + // The topology is already sorted. + +#else // !KMP_OS_AIX unsigned num_avail = 0; *line = 0; +#if KMP_ARCH_S390X + bool reading_s390x_sys_info = true; +#endif while (!feof(f)) { // Create an inner scoping level, so that all the goto targets at the end of // the loop appear in an outer scoping level. This avoids warnings about @@ -2931,7 +3435,31 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, } (*line)++; +#if KMP_ARCH_LOONGARCH64 + // The parsing logic of /proc/cpuinfo in this function highly depends on + // the blank lines between each processor info block. But on LoongArch a + // blank line exists before the first processor info block (i.e. after the + // "system type" line). This blank line was added because the "system + // type" line is unrelated to any of the CPUs. We must skip this line so + // that the original logic works on LoongArch. + if (*buf == '\n' && *line == 2) + continue; +#endif +#if KMP_ARCH_S390X + // s390x /proc/cpuinfo starts with a variable number of lines containing + // the overall system information. Skip them. + if (reading_s390x_sys_info) { + if (*buf == '\n') + reading_s390x_sys_info = false; + continue; + } +#endif + +#if KMP_ARCH_S390X + char s1[] = "cpu number"; +#else char s1[] = "processor"; +#endif if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { CHECK_LINE; char *p = strchr(buf + sizeof(s1) - 1, ':'); @@ -2957,6 +3485,23 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, threadInfo[num_avail][osIdIndex]); __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); +#if KMP_ARCH_S390X + // Disambiguate physical_package_id. + unsigned book_id; + KMP_SNPRINTF(path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/book_id", + threadInfo[num_avail][osIdIndex]); + __kmp_read_from_file(path, "%u", &book_id); + threadInfo[num_avail][pkgIdIndex] |= (book_id << 8); + + unsigned drawer_id; + KMP_SNPRINTF(path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/drawer_id", + threadInfo[num_avail][osIdIndex]); + __kmp_read_from_file(path, "%u", &drawer_id); + threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16); +#endif + KMP_SNPRINTF(path, sizeof(path), "/sys/devices/system/cpu/cpu%u/topology/core_id", threadInfo[num_avail][osIdIndex]); @@ -3040,21 +3585,17 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, return false; } - // Check for missing fields. The osId field must be there, and we - // currently require that the physical id field is specified, also. + // Check for missing fields. The osId field must be there. The physical + // id field will be checked later. if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingProcField; return false; } - if (threadInfo[0][pkgIdIndex] == UINT_MAX) { - CLEANUP_THREAD_INFO; - *msg_id = kmp_i18n_str_MissingPhysicalIDField; - return false; - } // Skip this proc if it is not included in the machine model. 
- if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], + if (KMP_AFFINITY_CAPABLE() && + !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) { INIT_PROC_INFO(threadInfo[num_avail]); continue; @@ -3080,6 +3621,18 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, } *line = 0; + // At least on powerpc, Linux may return -1 for physical_package_id. Try + // to reconstruct topology from core_siblings_list in that case. + for (i = 0; i < num_avail; ++i) { + if (threadInfo[i][pkgIdIndex] == UINT_MAX) { + if (!__kmp_package_id_from_core_siblings_list(threadInfo, num_avail, i)) { + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_MissingPhysicalIDField; + return false; + } + } + } + #if KMP_MIC && REDUCE_TEAM_SIZE unsigned teamSize = 0; #endif // KMP_MIC && REDUCE_TEAM_SIZE @@ -3096,6 +3649,8 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, qsort(threadInfo, num_avail, sizeof(*threadInfo), __kmp_affinity_cmp_ProcCpuInfo_phys_id); +#endif // KMP_OS_AIX + // The table is now sorted by pkgId / coreId / threadId, but we really don't // know the radix of any of the fields. pkgId's may be sparsely assigned among // the chips on a system. Although coreId's are usually assigned @@ -3210,7 +3765,7 @@ restart_radix_check: return false; } - // If the thread ids were not specified and we see entries entries that + // If the thread ids were not specified and we see entries that // are duplicates, start the loop over and assign the thread ids manually. assign_thread_ids = true; goto restart_radix_check; @@ -3239,7 +3794,7 @@ restart_radix_check: // not enabled. __kmp_ncores = totals[coreIdIndex]; if (!KMP_AFFINITY_CAPABLE()) { - KMP_ASSERT(__kmp_affinity_type == affinity_none); + KMP_ASSERT(__kmp_affinity.type == affinity_none); return true; } @@ -3301,10 +3856,10 @@ restart_radix_check: for (i = 0; i < num_avail; ++i) { unsigned os = threadInfo[i][osIdIndex]; int src_index; - int dst_index = 0; kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); hw_thread.clear(); hw_thread.os_id = os; + hw_thread.original_idx = i; idx = 0; for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { @@ -3318,7 +3873,6 @@ restart_radix_check: } else if (src_index == threadIdIndex) { hw_thread.ids[threadLevel] = threadInfo[i][src_index]; } - dst_index++; } } @@ -3329,6 +3883,32 @@ restart_radix_check: __kmp_free(counts); CLEANUP_THREAD_INFO; __kmp_topology->sort_ids(); + + int tlevel = __kmp_topology->get_level(KMP_HW_THREAD); + if (tlevel > 0) { + // If the thread level does not have ids, then put them in. + if (__kmp_topology->at(0).ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID) { + __kmp_topology->at(0).ids[tlevel] = 0; + } + for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) { + kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + if (hw_thread.ids[tlevel] != kmp_hw_thread_t::UNKNOWN_ID) + continue; + kmp_hw_thread_t &prev_hw_thread = __kmp_topology->at(i - 1); + // Check if socket, core, anything above thread level changed. 
+ // If the ids did change, then restart thread id at 0 + // Otherwise, set thread id to prev thread's id + 1 + for (int j = 0; j < tlevel; ++j) { + if (hw_thread.ids[j] != prev_hw_thread.ids[j]) { + hw_thread.ids[tlevel] = 0; + break; + } + } + if (hw_thread.ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID) + hw_thread.ids[tlevel] = prev_hw_thread.ids[tlevel] + 1; + } + } + if (!__kmp_topology->check_ids()) { kmp_topology_t::deallocate(__kmp_topology); __kmp_topology = nullptr; @@ -3341,16 +3921,25 @@ restart_radix_check: // Create and return a table of affinity masks, indexed by OS thread ID. // This routine handles OR'ing together all the affinity masks of threads // that are sufficiently close, if granularity > fine. -static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, - unsigned *numUnique) { +template <typename FindNextFunctionType> +static void __kmp_create_os_id_masks(unsigned *numUnique, + kmp_affinity_t &affinity, + FindNextFunctionType find_next) { // First form a table of affinity masks in order of OS thread id. int maxOsId; int i; int numAddrs = __kmp_topology->get_num_hw_threads(); int depth = __kmp_topology->get_depth(); + const char *env_var = __kmp_get_affinity_env_var(affinity); KMP_ASSERT(numAddrs); KMP_ASSERT(depth); + i = find_next(-1); + // If could not find HW thread location that satisfies find_next conditions, + // then return and fallback to increment find_next. + if (i >= numAddrs) + return; + maxOsId = 0; for (i = numAddrs - 1;; --i) { int osId = __kmp_topology->at(i).os_id; @@ -3360,14 +3949,14 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, if (i == 0) break; } - kmp_affin_mask_t *osId2Mask; - KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); - KMP_ASSERT(__kmp_affinity_gran_levels >= 0); - if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { - KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); + affinity.num_os_id_masks = maxOsId + 1; + KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks); + KMP_ASSERT(affinity.gran_levels >= 0); + if (affinity.flags.verbose && (affinity.gran_levels > 0)) { + KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels); } - if (__kmp_affinity_gran_levels >= (int)depth) { - KMP_AFF_WARNING(AffThreadsMayMigrate); + if (affinity.gran_levels >= (int)depth) { + KMP_AFF_WARNING(affinity, AffThreadsMayMigrate); } // Run through the table, forming the masks for all threads on each core. @@ -3380,22 +3969,25 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, kmp_affin_mask_t *sum; KMP_CPU_ALLOC_ON_STACK(sum); KMP_CPU_ZERO(sum); - KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); - for (i = 1; i < numAddrs; i++) { + + i = j = leader = find_next(-1); + KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); + kmp_full_mask_modifier_t full_mask; + for (i = find_next(i); i < numAddrs; i = find_next(i)) { // If this thread is sufficiently close to the leader (within the // granularity setting), then set the bit for this os thread in the // affinity mask for this group, and go on to the next thread. - if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { + if (__kmp_topology->is_close(leader, i, affinity)) { KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); continue; } // For every thread in this group, copy the mask to the thread's entry in - // the osId2Mask table. Mark the first address as a leader. - for (; j < i; j++) { + // the OS Id mask table. Mark the first address as a leader. 
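// Minimal model (assumed data, not the runtime's structures) of the grouping
// done in __kmp_create_os_id_masks above: with granularity "core", every
// hardware thread whose ids above the granularity level match the group
// leader's is OR'ed into one place mask, so a place covers a whole core rather
// than a single SMT thread.
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

struct hw_thread { int package, core, os_id; };

int main() {
  std::vector<hw_thread> threads = {
      {0, 0, 0}, {0, 0, 4}, {0, 1, 1}, {0, 1, 5},
      {1, 0, 2}, {1, 0, 6}, {1, 1, 3}, {1, 1, 7}};
  // key = ids above the granularity level; value = bitmask of OS proc ids
  std::map<std::pair<int, int>, unsigned long long> places;
  for (const hw_thread &t : threads)
    places[{t.package, t.core}] |= 1ull << t.os_id;
  for (const auto &p : places)
    std::printf("package %d core %d -> mask 0x%llx\n", p.first.first,
                p.first.second, p.second); // e.g. package 0 core 0 -> 0x11
  return 0;
}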
+ for (; j < i; j = find_next(j)) { int osId = __kmp_topology->at(j).os_id; KMP_DEBUG_ASSERT(osId <= maxOsId); - kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); + kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId); KMP_CPU_COPY(mask, sum); __kmp_topology->at(j).leader = (j == leader); } @@ -3403,25 +3995,30 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, // Start a new mask. leader = i; + full_mask.include(sum); KMP_CPU_ZERO(sum); KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); } // For every thread in last group, copy the mask to the thread's - // entry in the osId2Mask table. - for (; j < i; j++) { + // entry in the OS Id mask table. + for (; j < i; j = find_next(j)) { int osId = __kmp_topology->at(j).os_id; KMP_DEBUG_ASSERT(osId <= maxOsId); - kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); + kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId); KMP_CPU_COPY(mask, sum); __kmp_topology->at(j).leader = (j == leader); } + full_mask.include(sum); unique++; KMP_CPU_FREE_FROM_STACK(sum); - *maxIndex = maxOsId; + // See if the OS Id mask table further restricts or changes the full mask + if (full_mask.restrict_to_mask() && affinity.flags.verbose) { + __kmp_topology->print(env_var); + } + *numUnique = unique; - return osId2Mask; } // Stuff for the affinity proclist parsers. It's easier to declare these vars @@ -3454,7 +4051,7 @@ static int nextNewMask; { \ if (((_osId) > _maxOsId) || \ (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ - KMP_AFF_WARNING(AffIgnoreInvalidProcID, _osId); \ + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId); \ } else { \ ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ } \ @@ -3462,12 +4059,13 @@ static int nextNewMask; // Re-parse the proclist (for the explicit affinity type), and form the list // of affinity newMasks indexed by gtid. -static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, - unsigned int *out_numMasks, - const char *proclist, - kmp_affin_mask_t *osId2Mask, - int maxOsId) { +static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) { int i; + kmp_affin_mask_t **out_masks = &affinity.masks; + unsigned *out_numMasks = &affinity.num_masks; + const char *proclist = affinity.proclist; + kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; + int maxOsId = affinity.num_os_id_masks - 1; const char *scan = proclist; const char *next = proclist; @@ -3505,7 +4103,7 @@ static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, // Copy the mask for that osId to the sum (union) mask. if ((num > maxOsId) || (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { - KMP_AFF_WARNING(AffIgnoreInvalidProcID, num); + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num); KMP_CPU_ZERO(sumMask); } else { KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); @@ -3537,7 +4135,7 @@ static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, // Add the mask for that osId to the sum mask. 
if ((num > maxOsId) || (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { - KMP_AFF_WARNING(AffIgnoreInvalidProcID, num); + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num); } else { KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); setSize++; @@ -3672,10 +4270,11 @@ signed := + signed signed := - signed -----------------------------------------------------------------------------*/ static void __kmp_process_subplace_list(const char **scan, - kmp_affin_mask_t *osId2Mask, - int maxOsId, kmp_affin_mask_t *tempMask, + kmp_affinity_t &affinity, int maxOsId, + kmp_affin_mask_t *tempMask, int *setSize) { const char *next; + kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; for (;;) { int start, count, stride, i; @@ -3694,7 +4293,7 @@ static void __kmp_process_subplace_list(const char **scan, if (**scan == '}' || **scan == ',') { if ((start > maxOsId) || (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { - KMP_AFF_WARNING(AffIgnoreInvalidProcID, start); + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start); } else { KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); (*setSize)++; @@ -3723,7 +4322,7 @@ static void __kmp_process_subplace_list(const char **scan, for (i = 0; i < count; i++) { if ((start > maxOsId) || (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { - KMP_AFF_WARNING(AffIgnoreInvalidProcID, start); + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start); break; // don't proliferate warnings for large count } else { KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); @@ -3770,7 +4369,7 @@ static void __kmp_process_subplace_list(const char **scan, for (i = 0; i < count; i++) { if ((start > maxOsId) || (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { - KMP_AFF_WARNING(AffIgnoreInvalidProcID, start); + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start); break; // don't proliferate warnings for large count } else { KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); @@ -3789,21 +4388,22 @@ static void __kmp_process_subplace_list(const char **scan, } } -static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, +static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity, int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) { const char *next; + kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; // valid follow sets are '{' '!' and num SKIP_WS(*scan); if (**scan == '{') { (*scan)++; // skip '{' - __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); + __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize); KMP_ASSERT2(**scan == '}', "bad explicit places list"); (*scan)++; // skip '}' } else if (**scan == '!') { (*scan)++; // skip '!' 
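// Sketch (illustrative assumption) of how one "start:count:stride" subplace in
// an OMP_PLACES brace list expands into OS proc ids, mirroring the loop above
// that unions `count` masks starting at `start` and stepping by `stride`:
// "{0:4:2}" selects procs 0, 2, 4 and 6.
#include <cstdio>
#include <vector>

static std::vector<int> expand_subplace(int start, int count, int stride) {
  std::vector<int> procs;
  for (int i = 0; i < count; ++i)
    procs.push_back(start + i * stride);
  return procs;
}

int main() {
  for (int p : expand_subplace(/*start=*/0, /*count=*/4, /*stride=*/2))
    std::printf("%d ", p); // prints: 0 2 4 6
  std::printf("\n");
  return 0;
}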
- __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); + __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize); KMP_CPU_COMPLEMENT(maxOsId, tempMask); } else if ((**scan >= '0') && (**scan <= '9')) { next = *scan; @@ -3812,7 +4412,7 @@ static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, KMP_ASSERT(num >= 0); if ((num > maxOsId) || (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { - KMP_AFF_WARNING(AffIgnoreInvalidProcID, num); + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num); } else { KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); (*setSize)++; @@ -3824,12 +4424,13 @@ static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, } // static void -void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, - unsigned int *out_numMasks, - const char *placelist, - kmp_affin_mask_t *osId2Mask, - int maxOsId) { +void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) { int i, j, count, stride, sign; + kmp_affin_mask_t **out_masks = &affinity.masks; + unsigned *out_numMasks = &affinity.num_masks; + const char *placelist = affinity.proclist; + kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; + int maxOsId = affinity.num_os_id_masks - 1; const char *scan = placelist; const char *next = placelist; @@ -3849,7 +4450,7 @@ void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, int setSize = 0; for (;;) { - __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); + __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize); // valid follow sets are ',' ':' and EOL SKIP_WS(scan); @@ -3930,7 +4531,7 @@ void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, (!KMP_CPU_ISSET(j + stride, KMP_CPU_INDEX(osId2Mask, j + stride)))) { if (i < count - 1) { - KMP_AFF_WARNING(AffIgnoreInvalidProcID, j + stride); + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride); } continue; } @@ -4028,28 +4629,149 @@ static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level, static int *procarr = NULL; static int __kmp_aff_depth = 0; +static int *__kmp_osid_to_hwthread_map = NULL; + +static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask, + kmp_affinity_ids_t &ids, + kmp_affinity_attrs_t &attrs) { + if (!KMP_AFFINITY_CAPABLE()) + return; + + // Initiailze ids and attrs thread data + for (int i = 0; i < KMP_HW_LAST; ++i) + ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID; + attrs = KMP_AFFINITY_ATTRS_UNKNOWN; + + // Iterate through each os id within the mask and determine + // the topology id and attribute information + int cpu; + int depth = __kmp_topology->get_depth(); + KMP_CPU_SET_ITERATE(cpu, mask) { + int osid_idx = __kmp_osid_to_hwthread_map[cpu]; + ids.os_id = cpu; + const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx); + for (int level = 0; level < depth; ++level) { + kmp_hw_t type = __kmp_topology->get_type(level); + int id = hw_thread.sub_ids[level]; + if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) { + ids.ids[type] = id; + } else { + // This mask spans across multiple topology units, set it as such + // and mark every level below as such as well. 
+ ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID; + for (; level < depth; ++level) { + kmp_hw_t type = __kmp_topology->get_type(level); + ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID; + } + } + } + if (!attrs.valid) { + attrs.core_type = hw_thread.attrs.get_core_type(); + attrs.core_eff = hw_thread.attrs.get_core_eff(); + attrs.valid = 1; + } else { + // This mask spans across multiple attributes, set it as such + if (attrs.core_type != hw_thread.attrs.get_core_type()) + attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN; + if (attrs.core_eff != hw_thread.attrs.get_core_eff()) + attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF; + } + } +} + +static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) { + if (!KMP_AFFINITY_CAPABLE()) + return; + const kmp_affin_mask_t *mask = th->th.th_affin_mask; + kmp_affinity_ids_t &ids = th->th.th_topology_ids; + kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs; + __kmp_affinity_get_mask_topology_info(mask, ids, attrs); +} + +// Assign the topology information to each place in the place list +// A thread can then grab not only its affinity mask, but the topology +// information associated with that mask. e.g., Which socket is a thread on +static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) { + if (!KMP_AFFINITY_CAPABLE()) + return; + if (affinity.type != affinity_none) { + KMP_ASSERT(affinity.num_os_id_masks); + KMP_ASSERT(affinity.os_id_masks); + } + KMP_ASSERT(affinity.num_masks); + KMP_ASSERT(affinity.masks); + KMP_ASSERT(__kmp_affin_fullMask); + + int max_cpu = __kmp_affin_fullMask->get_max_cpu(); + int num_hw_threads = __kmp_topology->get_num_hw_threads(); + + // Allocate thread topology information + if (!affinity.ids) { + affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate( + sizeof(kmp_affinity_ids_t) * affinity.num_masks); + } + if (!affinity.attrs) { + affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate( + sizeof(kmp_affinity_attrs_t) * affinity.num_masks); + } + if (!__kmp_osid_to_hwthread_map) { + // Want the +1 because max_cpu should be valid index into map + __kmp_osid_to_hwthread_map = + (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1)); + } + + // Create the OS proc to hardware thread map + for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) { + int os_id = __kmp_topology->at(hw_thread).os_id; + if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask)) + __kmp_osid_to_hwthread_map[os_id] = hw_thread; + } + + for (unsigned i = 0; i < affinity.num_masks; ++i) { + kmp_affinity_ids_t &ids = affinity.ids[i]; + kmp_affinity_attrs_t &attrs = affinity.attrs[i]; + kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i); + __kmp_affinity_get_mask_topology_info(mask, ids, attrs); + } +} + +// Called when __kmp_topology is ready +static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) { + // Initialize other data structures which depend on the topology + if (__kmp_topology && __kmp_topology->get_num_hw_threads()) { + machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); + __kmp_affinity_get_topology_info(affinity); +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore(); +#endif + } +} // Create a one element mask array (set of places) which only contains the // initial process's affinity mask -static void __kmp_create_affinity_none_places() { +static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) { KMP_ASSERT(__kmp_affin_fullMask != NULL); - KMP_ASSERT(__kmp_affinity_type == affinity_none); - __kmp_affinity_num_masks = 
1; - KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); - kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0); + KMP_ASSERT(affinity.type == affinity_none); + KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); + affinity.num_masks = 1; + KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks); + kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0); KMP_CPU_COPY(dest, __kmp_affin_fullMask); + __kmp_aux_affinity_initialize_other_data(affinity); } -static void __kmp_aux_affinity_initialize(void) { - if (__kmp_affinity_masks != NULL) { - KMP_ASSERT(__kmp_affin_fullMask != NULL); - return; - } - +static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) { // Create the "full" mask - this defines all of the processors that we // consider to be in the machine model. If respect is set, then it is the // initialization thread's affinity mask. Otherwise, it is all processors that // we know about on the machine. + int verbose = affinity.flags.verbose; + const char *env_var = affinity.env_var; + + // Already initialized + if (__kmp_affin_fullMask && __kmp_affin_origMask) + return; + if (__kmp_affin_fullMask == NULL) { KMP_CPU_ALLOC(__kmp_affin_fullMask); } @@ -4060,7 +4782,7 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); // Make a copy before possible expanding to the entire machine mask __kmp_affin_origMask->copy(__kmp_affin_fullMask); - if (__kmp_affinity_respect_mask) { + if (affinity.flags.respect) { // Count the number of available processors. unsigned i; __kmp_avail_proc = 0; @@ -4071,24 +4793,24 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_avail_proc++; } if (__kmp_avail_proc > __kmp_xproc) { - KMP_AFF_WARNING(ErrorInitializeAffinity); - __kmp_affinity_type = affinity_none; + KMP_AFF_WARNING(affinity, ErrorInitializeAffinity); + affinity.type = affinity_none; KMP_AFFINITY_DISABLE(); return; } - if (__kmp_affinity_verbose) { + if (verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask); - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); + KMP_INFORM(InitOSProcSetRespect, env_var, buf); } } else { - if (__kmp_affinity_verbose) { + if (verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask); - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); + KMP_INFORM(InitOSProcSetNotRespect, env_var, buf); } __kmp_avail_proc = __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); @@ -4103,8 +4825,13 @@ static void __kmp_aux_affinity_initialize(void) { #endif } } +} +static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) { + bool success = false; + const char *env_var = affinity.env_var; kmp_i18n_id_t msg_id = kmp_i18n_null; + int verbose = affinity.flags.verbose; // For backward compatibility, setting KMP_CPUINFO_FILE => // KMP_TOPOLOGY_METHOD=cpuinfo @@ -4113,7 +4840,6 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_affinity_top_method = affinity_top_method_cpuinfo; } - bool success = false; if (__kmp_affinity_top_method == affinity_top_method_all) { // In the default code path, errors are not fatal - we just try using // another method. 
We only emit a warning message if affinity is on, or the @@ -4123,11 +4849,11 @@ static void __kmp_aux_affinity_initialize(void) { __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { if (!__kmp_hwloc_error) { success = __kmp_affinity_create_hwloc_map(&msg_id); - if (!success && __kmp_affinity_verbose) { - KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); + if (!success && verbose) { + KMP_INFORM(AffIgnoringHwloc, env_var); } - } else if (__kmp_affinity_verbose) { - KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); + } else if (verbose) { + KMP_INFORM(AffIgnoringHwloc, env_var); } } #endif @@ -4135,24 +4861,24 @@ static void __kmp_aux_affinity_initialize(void) { #if KMP_ARCH_X86 || KMP_ARCH_X86_64 if (!success) { success = __kmp_affinity_create_x2apicid_map(&msg_id); - if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); } } if (!success) { success = __kmp_affinity_create_apicid_map(&msg_id); - if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); } } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ -#if KMP_OS_LINUX +#if KMP_OS_LINUX || KMP_OS_AIX if (!success) { int line = 0; success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); - if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); } } #endif /* KMP_OS_LINUX */ @@ -4160,16 +4886,16 @@ static void __kmp_aux_affinity_initialize(void) { #if KMP_GROUP_AFFINITY if (!success && (__kmp_num_proc_groups > 1)) { success = __kmp_affinity_create_proc_group_map(&msg_id); - if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); } } #endif /* KMP_GROUP_AFFINITY */ if (!success) { success = __kmp_affinity_create_flat_map(&msg_id); - if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); } KMP_ASSERT(success); } @@ -4241,130 +4967,187 @@ static void __kmp_aux_affinity_initialize(void) { // Early exit if topology could not be created if (!__kmp_topology) { if (KMP_AFFINITY_CAPABLE()) { - KMP_AFF_WARNING(ErrorInitializeAffinity); + KMP_AFF_WARNING(affinity, ErrorInitializeAffinity); } if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && __kmp_ncores > 0) { __kmp_topology = kmp_topology_t::allocate(0, 0, NULL); __kmp_topology->canonicalize(nPackages, nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_ncores); - if (__kmp_affinity_verbose) { - __kmp_topology->print("KMP_AFFINITY"); + if (verbose) { + __kmp_topology->print(env_var); } } - __kmp_affinity_type = affinity_none; - __kmp_create_affinity_none_places(); -#if KMP_USE_HIER_SCHED - __kmp_dispatch_set_hierarchy_values(); -#endif - KMP_AFFINITY_DISABLE(); - return; + return false; } - // 
Canonicalize, print (if requested), apply KMP_HW_SUBSET, and - // initialize other data structures which depend on the topology + // Canonicalize, print (if requested), apply KMP_HW_SUBSET __kmp_topology->canonicalize(); - if (__kmp_affinity_verbose) - __kmp_topology->print("KMP_AFFINITY"); + if (verbose) + __kmp_topology->print(env_var); bool filtered = __kmp_topology->filter_hw_subset(); - if (filtered) { -#if KMP_OS_WINDOWS - // Copy filtered full mask if topology has single processor group - if (__kmp_num_proc_groups <= 1) -#endif - __kmp_affin_origMask->copy(__kmp_affin_fullMask); - } - if (filtered && __kmp_affinity_verbose) + if (filtered && verbose) __kmp_topology->print("KMP_HW_SUBSET"); - machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); - KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); + return success; +} + +static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) { + bool is_regular_affinity = (&affinity == &__kmp_affinity); + bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity); + const char *env_var = __kmp_get_affinity_env_var(affinity); + + if (affinity.flags.initialized) { + KMP_ASSERT(__kmp_affin_fullMask != NULL); + return; + } + + if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask)) + __kmp_aux_affinity_initialize_masks(affinity); + + if (is_regular_affinity && !__kmp_topology) { + bool success = __kmp_aux_affinity_initialize_topology(affinity); + if (success) { + KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); + } else { + affinity.type = affinity_none; + KMP_AFFINITY_DISABLE(); + } + } + // If KMP_AFFINITY=none, then only create the single "none" place // which is the process's initial affinity mask or the number of // hardware threads depending on respect,norespect - if (__kmp_affinity_type == affinity_none) { - __kmp_create_affinity_none_places(); + if (affinity.type == affinity_none) { + __kmp_create_affinity_none_places(affinity); #if KMP_USE_HIER_SCHED __kmp_dispatch_set_hierarchy_values(); #endif + affinity.flags.initialized = TRUE; return; } + + __kmp_topology->set_granularity(affinity); int depth = __kmp_topology->get_depth(); // Create the table of masks, indexed by thread Id. - unsigned maxIndex; - unsigned numUnique; - kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique); - if (__kmp_affinity_gran_levels == 0) { - KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); - } - - switch (__kmp_affinity_type) { + unsigned numUnique = 0; + int numAddrs = __kmp_topology->get_num_hw_threads(); + // If OMP_PLACES=cores:<attribute> specified, then attempt + // to make OS Id mask table using those attributes + if (affinity.core_attr_gran.valid) { + __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) { + KMP_ASSERT(idx >= -1); + for (int i = idx + 1; i < numAddrs; ++i) + if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran)) + return i; + return numAddrs; + }); + if (!affinity.os_id_masks) { + const char *core_attribute; + if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF) + core_attribute = "core_efficiency"; + else + core_attribute = "core_type"; + KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var, + core_attribute, + __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)) + } + } + // If core attributes did not work, or none were specified, + // then make OS Id mask table using typical incremental way with + // checking for validity of each id at granularity level specified. 
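// Minimal sketch (assumed names and data) of the find_next pattern used above:
// __kmp_create_os_id_masks walks hardware threads through a caller-supplied
// "next matching index" functor, so the same mask-building loop can skip
// threads that fail a core-attribute or granularity filter; numAddrs acts as
// the end sentinel, exactly as in the lambdas passed in the patch.
#include <cstdio>
#include <vector>

template <typename FindNext>
static void visit_filtered(int numAddrs, FindNext find_next) {
  for (int i = find_next(-1); i < numAddrs; i = find_next(i))
    std::printf("visit hw thread %d\n", i);
}

int main() {
  // Hypothetical per-thread attribute used only for this illustration.
  std::vector<bool> is_efficiency_core = {false, true, true, false,
                                          true,  false, true, true};
  int numAddrs = (int)is_efficiency_core.size();
  visit_filtered(numAddrs, [&](int idx) {
    for (int i = idx + 1; i < numAddrs; ++i)
      if (is_efficiency_core[i]) // only visit threads matching the attribute
        return i;
    return numAddrs; // no further match: signal the end
  });
  return 0;
}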
+ if (!affinity.os_id_masks) { + int gran = affinity.gran_levels; + int gran_level = depth - 1 - affinity.gran_levels; + if (gran >= 0 && gran_level >= 0 && gran_level < depth) { + __kmp_create_os_id_masks( + &numUnique, affinity, [depth, numAddrs, &affinity](int idx) { + KMP_ASSERT(idx >= -1); + int gran = affinity.gran_levels; + int gran_level = depth - 1 - affinity.gran_levels; + for (int i = idx + 1; i < numAddrs; ++i) + if ((gran >= depth) || + (gran < depth && __kmp_topology->at(i).ids[gran_level] != + kmp_hw_thread_t::UNKNOWN_ID)) + return i; + return numAddrs; + }); + } + } + // Final attempt to make OS Id mask table using typical incremental way. + if (!affinity.os_id_masks) { + __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) { + KMP_ASSERT(idx >= -1); + return idx + 1; + }); + } + + switch (affinity.type) { case affinity_explicit: - KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); - if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { - __kmp_affinity_process_proclist( - &__kmp_affinity_masks, &__kmp_affinity_num_masks, - __kmp_affinity_proclist, osId2Mask, maxIndex); + KMP_DEBUG_ASSERT(affinity.proclist != NULL); + if (is_hidden_helper_affinity || + __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { + __kmp_affinity_process_proclist(affinity); } else { - __kmp_affinity_process_placelist( - &__kmp_affinity_masks, &__kmp_affinity_num_masks, - __kmp_affinity_proclist, osId2Mask, maxIndex); - } - if (__kmp_affinity_num_masks == 0) { - KMP_AFF_WARNING(AffNoValidProcID); - __kmp_affinity_type = affinity_none; - __kmp_create_affinity_none_places(); + __kmp_affinity_process_placelist(affinity); + } + if (affinity.num_masks == 0) { + KMP_AFF_WARNING(affinity, AffNoValidProcID); + affinity.type = affinity_none; + __kmp_create_affinity_none_places(affinity); + affinity.flags.initialized = TRUE; return; } break; // The other affinity types rely on sorting the hardware threads according to - // some permutation of the machine topology tree. Set __kmp_affinity_compact - // and __kmp_affinity_offset appropriately, then jump to a common code + // some permutation of the machine topology tree. Set affinity.compact + // and affinity.offset appropriately, then jump to a common code // fragment to do the sort and create the array of affinity masks. 
case affinity_logical: - __kmp_affinity_compact = 0; - if (__kmp_affinity_offset) { - __kmp_affinity_offset = - __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; + affinity.compact = 0; + if (affinity.offset) { + affinity.offset = + __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc; } goto sortTopology; case affinity_physical: if (__kmp_nThreadsPerCore > 1) { - __kmp_affinity_compact = 1; - if (__kmp_affinity_compact >= depth) { - __kmp_affinity_compact = 0; + affinity.compact = 1; + if (affinity.compact >= depth) { + affinity.compact = 0; } } else { - __kmp_affinity_compact = 0; + affinity.compact = 0; } - if (__kmp_affinity_offset) { - __kmp_affinity_offset = - __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; + if (affinity.offset) { + affinity.offset = + __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc; } goto sortTopology; case affinity_scatter: - if (__kmp_affinity_compact >= depth) { - __kmp_affinity_compact = 0; + if (affinity.compact >= depth) { + affinity.compact = 0; } else { - __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; + affinity.compact = depth - 1 - affinity.compact; } goto sortTopology; case affinity_compact: - if (__kmp_affinity_compact >= depth) { - __kmp_affinity_compact = depth - 1; + if (affinity.compact >= depth) { + affinity.compact = depth - 1; } goto sortTopology; case affinity_balanced: - if (depth <= 1) { - KMP_AFF_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); - __kmp_affinity_type = affinity_none; - __kmp_create_affinity_none_places(); + if (depth <= 1 || is_hidden_helper_affinity) { + KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var); + affinity.type = affinity_none; + __kmp_create_affinity_none_places(affinity); + affinity.flags.initialized = TRUE; return; } else if (!__kmp_topology->is_uniform()) { // Save the depth for further usage @@ -4379,8 +5162,10 @@ static void __kmp_aux_affinity_initialize(void) { int nproc = ncores * maxprocpercore; if ((nproc < 2) || (nproc < __kmp_avail_proc)) { - KMP_AFF_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); - __kmp_affinity_type = affinity_none; + KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var); + affinity.type = affinity_none; + __kmp_create_affinity_none_places(affinity); + affinity.flags.initialized = TRUE; return; } @@ -4405,48 +5190,57 @@ static void __kmp_aux_affinity_initialize(void) { procarr[core * maxprocpercore + inlastcore] = proc; } } - if (__kmp_affinity_compact >= depth) { - __kmp_affinity_compact = depth - 1; + if (affinity.compact >= depth) { + affinity.compact = depth - 1; } sortTopology: // Allocate the gtid->affinity mask table. - if (__kmp_affinity_dups) { - __kmp_affinity_num_masks = __kmp_avail_proc; + if (affinity.flags.dups) { + affinity.num_masks = __kmp_avail_proc; } else { - __kmp_affinity_num_masks = numUnique; + affinity.num_masks = numUnique; } if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && (__kmp_affinity_num_places > 0) && - ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { - __kmp_affinity_num_masks = __kmp_affinity_num_places; + ((unsigned)__kmp_affinity_num_places < affinity.num_masks) && + !is_hidden_helper_affinity) { + affinity.num_masks = __kmp_affinity_num_places; } - KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); + KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks); // Sort the topology table according to the current setting of - // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
- __kmp_topology->sort_compact(); + // affinity.compact, then fill out affinity.masks. + __kmp_topology->sort_compact(affinity); { int i; unsigned j; int num_hw_threads = __kmp_topology->get_num_hw_threads(); + kmp_full_mask_modifier_t full_mask; for (i = 0, j = 0; i < num_hw_threads; i++) { - if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) { + if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) { continue; } int osId = __kmp_topology->at(i).os_id; - kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); - kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); + kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId); + if (KMP_CPU_ISEMPTY(src)) + continue; + kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j); KMP_ASSERT(KMP_CPU_ISSET(osId, src)); KMP_CPU_COPY(dest, src); - if (++j >= __kmp_affinity_num_masks) { + full_mask.include(src); + if (++j >= affinity.num_masks) { break; } } - KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); + KMP_DEBUG_ASSERT(j == affinity.num_masks); + // See if the places list further restricts or changes the full mask + if (full_mask.restrict_to_mask() && affinity.flags.verbose) { + __kmp_topology->print(env_var); + } } // Sort the topology back using ids __kmp_topology->sort_ids(); @@ -4455,56 +5249,64 @@ static void __kmp_aux_affinity_initialize(void) { default: KMP_ASSERT2(0, "Unexpected affinity setting"); } - - KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); + __kmp_aux_affinity_initialize_other_data(affinity); + affinity.flags.initialized = TRUE; } -void __kmp_affinity_initialize(void) { +void __kmp_affinity_initialize(kmp_affinity_t &affinity) { // Much of the code above was written assuming that if a machine was not - // affinity capable, then __kmp_affinity_type == affinity_none. We now - // explicitly represent this as __kmp_affinity_type == affinity_disabled. - // There are too many checks for __kmp_affinity_type == affinity_none - // in this code. Instead of trying to change them all, check if - // __kmp_affinity_type == affinity_disabled, and if so, slam it with - // affinity_none, call the real initialization routine, then restore - // __kmp_affinity_type to affinity_disabled. - int disabled = (__kmp_affinity_type == affinity_disabled); - if (!KMP_AFFINITY_CAPABLE()) { + // affinity capable, then affinity type == affinity_none. + // We now explicitly represent this as affinity type == affinity_disabled. + // There are too many checks for affinity type == affinity_none in this code. + // Instead of trying to change them all, check if + // affinity type == affinity_disabled, and if so, slam it with affinity_none, + // call the real initialization routine, then restore affinity type to + // affinity_disabled. 
+ int disabled = (affinity.type == affinity_disabled); + if (!KMP_AFFINITY_CAPABLE()) KMP_ASSERT(disabled); - } - if (disabled) { - __kmp_affinity_type = affinity_none; - } - __kmp_aux_affinity_initialize(); - if (disabled) { - __kmp_affinity_type = affinity_disabled; - } + if (disabled) + affinity.type = affinity_none; + __kmp_aux_affinity_initialize(affinity); + if (disabled) + affinity.type = affinity_disabled; } void __kmp_affinity_uninitialize(void) { - if (__kmp_affinity_masks != NULL) { - KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); - __kmp_affinity_masks = NULL; - } - if (__kmp_affin_fullMask != NULL) { - KMP_CPU_FREE(__kmp_affin_fullMask); - __kmp_affin_fullMask = NULL; + for (kmp_affinity_t *affinity : __kmp_affinities) { + if (affinity->masks != NULL) + KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks); + if (affinity->os_id_masks != NULL) + KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks); + if (affinity->proclist != NULL) + __kmp_free(affinity->proclist); + if (affinity->ids != NULL) + __kmp_free(affinity->ids); + if (affinity->attrs != NULL) + __kmp_free(affinity->attrs); + *affinity = KMP_AFFINITY_INIT(affinity->env_var); } if (__kmp_affin_origMask != NULL) { + if (KMP_AFFINITY_CAPABLE()) { +#if KMP_OS_AIX + // Uninitialize by unbinding the thread. + bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY); +#else + __kmp_set_system_affinity(__kmp_affin_origMask, FALSE); +#endif + } KMP_CPU_FREE(__kmp_affin_origMask); __kmp_affin_origMask = NULL; } - __kmp_affinity_num_masks = 0; - __kmp_affinity_type = affinity_default; __kmp_affinity_num_places = 0; - if (__kmp_affinity_proclist != NULL) { - __kmp_free(__kmp_affinity_proclist); - __kmp_affinity_proclist = NULL; - } if (procarr != NULL) { __kmp_free(procarr); procarr = NULL; } + if (__kmp_osid_to_hwthread_map) { + __kmp_free(__kmp_osid_to_hwthread_map); + __kmp_osid_to_hwthread_map = NULL; + } #if KMP_USE_HWLOC if (__kmp_hwloc_topology != NULL) { hwloc_topology_destroy(__kmp_hwloc_topology); @@ -4522,12 +5324,36 @@ void __kmp_affinity_uninitialize(void) { KMPAffinity::destroy_api(); } +static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity, + int *place, kmp_affin_mask_t **mask) { + int mask_idx; + bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); + if (is_hidden_helper) + // The first gtid is the regular primary thread, the second gtid is the main + // thread of hidden team which does not participate in task execution. 
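// Simplified model (an assumption, not the runtime's exact code path) of the
// place selection in __kmp_select_mask_by_gtid: a thread's mask index is
// derived from its gtid, shifted by the user-requested offset, and wrapped
// modulo the number of places, so successive threads cycle through the place
// list.
#include <cstdio>

static int select_place(int mask_idx, int offset, int num_masks) {
  return (mask_idx + offset) % num_masks;
}

int main() {
  const int num_masks = 4, offset = 1;
  for (int mask_idx = 0; mask_idx < 6; ++mask_idx)
    std::printf("thread %d -> place %d\n", mask_idx,
                select_place(mask_idx, offset, num_masks));
  // prints places 1 2 3 0 1 2 for threads 0..5
  return 0;
}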
+ mask_idx = gtid - 2; + else + mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); + KMP_DEBUG_ASSERT(affinity->num_masks > 0); + *place = (mask_idx + affinity->offset) % affinity->num_masks; + *mask = KMP_CPU_INDEX(affinity->masks, *place); +} + +// This function initializes the per-thread data concerning affinity including +// the mask and topology information void __kmp_affinity_set_init_mask(int gtid, int isa_root) { + + kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); + + // Set the thread topology information to default of unknown + for (int id = 0; id < KMP_HW_LAST; ++id) + th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID; + th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN; + if (!KMP_AFFINITY_CAPABLE()) { return; } - kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); if (th->th.th_affin_mask == NULL) { KMP_CPU_ALLOC(th->th.th_affin_mask); } else { @@ -4535,16 +5361,24 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { } // Copy the thread mask to the kmp_info_t structure. If - // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that - // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, - // then the full mask is the same as the mask of the initialization thread. + // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e. + // one that has all of the OS proc ids set, or if + // __kmp_affinity.flags.respect is set, then the full mask is the + // same as the mask of the initialization thread. kmp_affin_mask_t *mask; int i; + const kmp_affinity_t *affinity; + bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); + + if (is_hidden_helper) + affinity = &__kmp_hh_affinity; + else + affinity = &__kmp_affinity; - if (KMP_AFFINITY_NON_PROC_BIND) { - if ((__kmp_affinity_type == affinity_none) || - (__kmp_affinity_type == affinity_balanced) || - KMP_HIDDEN_HELPER_THREAD(gtid)) { + if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) { + if ((affinity->type == affinity_none) || + (affinity->type == affinity_balanced) || + KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) { #if KMP_GROUP_AFFINITY if (__kmp_num_proc_groups > 1) { return; @@ -4554,14 +5388,10 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { i = 0; mask = __kmp_affin_fullMask; } else { - int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); - KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); - i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; - mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); + __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask); } } else { - if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) || - (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { + if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) { #if KMP_GROUP_AFFINITY if (__kmp_num_proc_groups > 1) { return; @@ -4571,85 +5401,94 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { i = KMP_PLACE_ALL; mask = __kmp_affin_fullMask; } else { - // int i = some hash function or just a counter that doesn't - // always start at 0. Use adjusted gtid for now. 
- int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); - KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); - i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; - mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); + __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask); } } th->th.th_current_place = i; - if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) { + if (isa_root && !is_hidden_helper) { th->th.th_new_place = i; th->th.th_first_place = 0; - th->th.th_last_place = __kmp_affinity_num_masks - 1; + th->th.th_last_place = affinity->num_masks - 1; } else if (KMP_AFFINITY_NON_PROC_BIND) { // When using a Non-OMP_PROC_BIND affinity method, // set all threads' place-partition-var to the entire place list th->th.th_first_place = 0; - th->th.th_last_place = __kmp_affinity_num_masks - 1; + th->th.th_last_place = affinity->num_masks - 1; + } + // Copy topology information associated with the place + if (i >= 0) { + th->th.th_topology_ids = __kmp_affinity.ids[i]; + th->th.th_topology_attrs = __kmp_affinity.attrs[i]; } if (i == KMP_PLACE_ALL) { - KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", + KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n", gtid)); } else { - KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", + KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n", gtid, i)); } KMP_CPU_COPY(th->th.th_affin_mask, mask); +} - if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid) - /* to avoid duplicate printing (will be correctly printed on barrier) */ - && (__kmp_affinity_type == affinity_none || - (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { - char buf[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, - th->th.th_affin_mask); - KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), - __kmp_gettid(), gtid, buf); +void __kmp_affinity_bind_init_mask(int gtid) { + if (!KMP_AFFINITY_CAPABLE()) { + return; } + kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); + const kmp_affinity_t *affinity; + const char *env_var; + bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); -#if KMP_DEBUG - // Hidden helper thread affinity only printed for debug builds - if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) { + if (is_hidden_helper) + affinity = &__kmp_hh_affinity; + else + affinity = &__kmp_affinity; + env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true); + /* to avoid duplicate printing (will be correctly printed on barrier) */ + if (affinity->flags.verbose && (affinity->type == affinity_none || + (th->th.th_current_place != KMP_PLACE_ALL && + affinity->type != affinity_balanced)) && + !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, th->th.th_affin_mask); - KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)", - (kmp_int32)getpid(), __kmp_gettid(), gtid, buf); + KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(), + gtid, buf); } -#endif #if KMP_OS_WINDOWS // On Windows* OS, the process affinity mask might have changed. If the user // didn't request affinity and this call fails, just continue silently. // See CQ171393. - if (__kmp_affinity_type == affinity_none) { + if (affinity->type == affinity_none) { __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); } else #endif +#ifndef KMP_OS_AIX + // Do not set the full mask as the init mask on AIX. 
__kmp_set_system_affinity(th->th.th_affin_mask, TRUE); +#endif } -void __kmp_affinity_set_place(int gtid) { - if (!KMP_AFFINITY_CAPABLE()) { +void __kmp_affinity_bind_place(int gtid) { + // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND + if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) { return; } kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); - KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " + KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current " "place = %d)\n", gtid, th->th.th_new_place, th->th.th_current_place)); // Check that the new place is within this thread's partition. KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); KMP_ASSERT(th->th.th_new_place >= 0); - KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); + KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks); if (th->th.th_first_place <= th->th.th_last_place) { KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && (th->th.th_new_place <= th->th.th_last_place)); @@ -4661,11 +5500,11 @@ void __kmp_affinity_set_place(int gtid) { // Copy the thread mask to the kmp_info_t structure, // and set this thread's affinity. kmp_affin_mask_t *mask = - KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); + KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place); KMP_CPU_COPY(th->th.th_affin_mask, mask); th->th.th_current_place = th->th.th_new_place; - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, th->th.th_affin_mask); @@ -4733,7 +5572,7 @@ int __kmp_aux_set_affinity(void **mask) { th->th.th_current_place = KMP_PLACE_UNDEFINED; th->th.th_new_place = KMP_PLACE_UNDEFINED; th->th.th_first_place = 0; - th->th.th_last_place = __kmp_affinity_num_masks - 1; + th->th.th_last_place = __kmp_affinity.num_masks - 1; // Turn off 4.0 affinity for the current tread at this parallel level. 
th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; @@ -4744,7 +5583,7 @@ int __kmp_aux_set_affinity(void **mask) { int __kmp_aux_get_affinity(void **mask) { int gtid; int retval; -#if KMP_OS_WINDOWS || KMP_DEBUG +#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG kmp_info_t *th; #endif if (!KMP_AFFINITY_CAPABLE()) { @@ -4752,7 +5591,7 @@ int __kmp_aux_get_affinity(void **mask) { } gtid = __kmp_entry_gtid(); -#if KMP_OS_WINDOWS || KMP_DEBUG +#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG th = __kmp_threads[gtid]; #else (void)gtid; // unused variable @@ -4775,7 +5614,7 @@ int __kmp_aux_get_affinity(void **mask) { } } -#if !KMP_OS_WINDOWS +#if !KMP_OS_WINDOWS && !KMP_OS_AIX retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); KA_TRACE( @@ -4795,7 +5634,7 @@ int __kmp_aux_get_affinity(void **mask) { KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); return 0; -#endif /* KMP_OS_WINDOWS */ +#endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */ } int __kmp_aux_get_affinity_max_proc() { @@ -4908,17 +5747,40 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); } +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +// Returns first os proc id with ATOM core +int __kmp_get_first_osid_with_ecore(void) { + int low = 0; + int high = __kmp_topology->get_num_hw_threads() - 1; + int mid = 0; + while (high - low > 1) { + mid = (high + low) / 2; + if (__kmp_topology->at(mid).attrs.get_core_type() == + KMP_HW_CORE_TYPE_CORE) { + low = mid + 1; + } else { + high = mid; + } + } + if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) { + return mid; + } + return -1; +} +#endif + // Dynamic affinity settings - Affinity balanced void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { KMP_DEBUG_ASSERT(th); bool fine_gran = true; int tid = th->th.th_info.ds.ds_tid; + const char *env_var = "KMP_AFFINITY"; // Do not perform balanced affinity for the hidden helper threads if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th))) return; - switch (__kmp_affinity_gran) { + switch (__kmp_affinity.gran) { case KMP_HW_THREAD: break; case KMP_HW_CORE: @@ -4976,12 +5838,13 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { KMP_CPU_SET(osID, mask); } } - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); - KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), - __kmp_gettid(), tid, buf); + KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(), + tid, buf); } + __kmp_affinity_get_thread_topology_info(th); __kmp_set_system_affinity(mask, TRUE); } else { // Non-uniform topology @@ -5142,17 +6005,19 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { __kmp_free(newarr); } - if (__kmp_affinity_verbose) { + if (__kmp_affinity.flags.verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); - KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), - __kmp_gettid(), tid, buf); + KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(), + tid, buf); } + __kmp_affinity_get_thread_topology_info(th); __kmp_set_system_affinity(mask, TRUE); } } -#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \ + KMP_OS_AIX // We don't need this entry for Windows because // there is GetProcessAffinityMask() api // @@ -5187,7 +6052,11 @@ 
extern "C" "set full mask for thread %d\n", gtid)); KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); +#if KMP_OS_AIX + return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY); +#else return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); +#endif } #endif |