| author | mikhnenko <[email protected]> | 2025-07-15 20:05:43 +0300 |
|---|---|---|
| committer | mikhnenko <[email protected]> | 2025-07-15 20:52:16 +0300 |
| commit | a40bd4f45bbc18fd95b1596e655b8942ceb2cf4b (patch) | |
| tree | bce599ca02c778c277198de6d131d37db71997d0 /contrib/libs/cxxsupp/openmp/kmp_collapse.cpp | |
| parent | 728e0eaef4dc1f1152d2c3a4cc1bbdf597f3ef3d (diff) | |
Update contrib/libs/cxxsupp/openmp to 20.1.7
commit_hash:722dd5fe79203d22ad4a0be288ac0caeb6b3dd68
Diffstat (limited to 'contrib/libs/cxxsupp/openmp/kmp_collapse.cpp')
| -rw-r--r-- | contrib/libs/cxxsupp/openmp/kmp_collapse.cpp | 1781 |
1 file changed, 1781 insertions, 0 deletions
diff --git a/contrib/libs/cxxsupp/openmp/kmp_collapse.cpp b/contrib/libs/cxxsupp/openmp/kmp_collapse.cpp
new file mode 100644
index 00000000000..f1bf04901dc
--- /dev/null
+++ b/contrib/libs/cxxsupp/openmp/kmp_collapse.cpp
@@ -0,0 +1,1781 @@
+/*
+ * kmp_collapse.cpp -- loop collapse feature
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+#include "kmp_collapse.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+// OMPTODO: different style of comments (see kmp_sched)
+// OMPTODO: OMPT/OMPD
+
+// Avoid inadvertently using a library-based abs:
+template <typename T> T __kmp_abs(const T val) {
+  return (val < 0) ? -val : val;
+}
+kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
+kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
+
+//----------------------------------------------------------------------------
+// Common functions for working with rectangular and non-rectangular loops
+//----------------------------------------------------------------------------
+
+template <typename T> int __kmp_sign(T val) {
+  return (T(0) < val) - (val < T(0));
+}
+
+template <typename T> class CollapseAllocator {
+  typedef T *pT;
+
+private:
+  static const size_t allocaSize = 32; // size limit for stack allocations
+                                       // (8 bytes x 4 nested loops)
+  char stackAlloc[allocaSize];
+  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
+  pT pTAlloc;
+
+public:
+  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
+    if (n > maxElemCount) {
+      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
+    }
+  }
+  ~CollapseAllocator() {
+    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
+      __kmp_free(pTAlloc);
+    }
+  }
+  T &operator[](int index) { return pTAlloc[index]; }
+  operator const pT() { return pTAlloc; }
+};
+
+//----------Loop canonicalization---------------------------------------------
+
+// For a loop nest (any shape):
+// convert != to < or >;
+// switch from using < or > to <= or >=.
+// The "bounds" array has to be allocated per thread.
+// All other internal functions will work only with canonicalized loops.
+template <typename T>
+void kmp_canonicalize_one_loop_XX(
+    ident_t *loc,
+    /*in/out*/ bounds_infoXX_template<T> *bounds) {
+
+  if (__kmp_env_consistency_check) {
+    if (bounds->step == 0) {
+      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
+                            loc);
+    }
+  }
+
+  if (bounds->comparison == comparison_t::comp_not_eq) {
+    // We can convert this to < or >, depending on the sign of the step:
+    if (bounds->step > 0) {
+      bounds->comparison = comparison_t::comp_less;
+    } else {
+      bounds->comparison = comparison_t::comp_greater;
+    }
+  }
+
+  if (bounds->comparison == comparison_t::comp_less) {
+    // Note: ub0 can be unsigned. Should be OK to hit overflow here,
+    // because ub0 + ub1*j should still be positive (otherwise the loop was
+    // not well formed).
+    bounds->ub0 -= 1;
+    bounds->comparison = comparison_t::comp_less_or_eq;
+  } else if (bounds->comparison == comparison_t::comp_greater) {
+    bounds->ub0 += 1;
+    bounds->comparison = comparison_t::comp_greater_or_eq;
+  }
+}
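For illustration, here is a minimal standalone sketch of the same two canonicalization steps on plain ints (the loop bounds are hypothetical and independent of the kmp_* types above):

```cpp
#include <cassert>

// Canonicalize "for (i = 0; i != 10; i += 2)" the same way as above:
// '!=' with a positive step first becomes '<', then '<' becomes '<='
// by decrementing the upper bound.
int main() {
  int lb0 = 0, ub0 = 10, step = 2;
  enum cmp_t { NOT_EQ, LESS, GREATER, LESS_OR_EQ, GREATER_OR_EQ };
  cmp_t cmp = NOT_EQ;

  if (cmp == NOT_EQ)
    cmp = (step > 0) ? LESS : GREATER;

  if (cmp == LESS) {
    ub0 -= 1;
    cmp = LESS_OR_EQ;
  } else if (cmp == GREATER) {
    ub0 += 1;
    cmp = GREATER_OR_EQ;
  }

  // Canonical form: i = 0; i <= 9; i += 2
  assert(lb0 == 0 && ub0 == 9 && cmp == LESS_OR_EQ);
  return 0;
}
```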
+
+// Canonicalize loop nest. original_bounds_nest is an array of length n.
+void kmp_canonicalize_loop_nest(ident_t *loc,
+                                /*in/out*/ bounds_info_t *original_bounds_nest,
+                                kmp_index_t n) {
+
+  for (kmp_index_t ind = 0; ind < n; ++ind) {
+    auto bounds = &(original_bounds_nest[ind]);
+
+    switch (bounds->loop_type) {
+    case loop_type_t::loop_type_int32:
+      kmp_canonicalize_one_loop_XX<kmp_int32>(
+          loc,
+          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
+      break;
+    case loop_type_t::loop_type_uint32:
+      kmp_canonicalize_one_loop_XX<kmp_uint32>(
+          loc,
+          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
+      break;
+    case loop_type_t::loop_type_int64:
+      kmp_canonicalize_one_loop_XX<kmp_int64>(
+          loc,
+          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
+      break;
+    case loop_type_t::loop_type_uint64:
+      kmp_canonicalize_one_loop_XX<kmp_uint64>(
+          loc,
+          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
+      break;
+    default:
+      KMP_ASSERT(false);
+    }
+  }
+}
+
+//----------Calculating trip count on one level-------------------------------
+
+// Calculate trip count on this loop level.
+// We do this either for a rectangular loop nest,
+// or after an adjustment bringing the loops to a parallelepiped shape.
+// This number should not depend on the value of the outer IV,
+// even if the formula has lb1 and ub1.
+// Note: for non-rectangular loops don't use span for this, it's too big.
+
+template <typename T>
+kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
+    /*in/out*/ bounds_infoXX_template<T> *bounds) {
+
+  if (bounds->comparison == comparison_t::comp_less_or_eq) {
+    if (bounds->ub0 < bounds->lb0) {
+      // Note: after this we don't need to calculate inner loops,
+      // but that should be an edge case:
+      bounds->trip_count = 0;
+    } else {
+      // ub - lb may exceed the signed type range; we need to cast to
+      // kmp_loop_nest_iv_t anyway:
+      bounds->trip_count =
+          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
+              __kmp_abs(bounds->step) +
+          1;
+    }
+  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
+    if (bounds->lb0 < bounds->ub0) {
+      // Note: after this we don't need to calculate inner loops,
+      // but that should be an edge case:
+      bounds->trip_count = 0;
+    } else {
+      // lb - ub may exceed the signed type range; we need to cast to
+      // kmp_loop_nest_iv_t anyway:
+      bounds->trip_count =
+          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
+              __kmp_abs(bounds->step) +
+          1;
+    }
+  } else {
+    KMP_ASSERT(false);
+  }
+  return bounds->trip_count;
+}
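And the trip-count formula for an already-canonical ">=" loop, again as a small self-contained check with made-up bounds:

```cpp
#include <cassert>
#include <cstdint>

// Trip count of the canonical loop "for (i = 20; i >= 3; i -= 4)":
// (lb0 - ub0) / |step| + 1, computed in an unsigned type as above.
int main() {
  int64_t lb0 = 20, ub0 = 3, step = -4;
  uint64_t trip_count =
      static_cast<uint64_t>(lb0 - ub0) / static_cast<uint64_t>(-step) + 1;
  assert(trip_count == 5); // i = 20, 16, 12, 8, 4
  return 0;
}
```

+
+// Calculate trip count on this loop level.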
+kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) { + + kmp_loop_nest_iv_t trip_count = 0; + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + trip_count = kmp_calculate_trip_count_XX<kmp_int32>( + /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds)); + break; + case loop_type_t::loop_type_uint32: + trip_count = kmp_calculate_trip_count_XX<kmp_uint32>( + /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds)); + break; + case loop_type_t::loop_type_int64: + trip_count = kmp_calculate_trip_count_XX<kmp_int64>( + /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds)); + break; + case loop_type_t::loop_type_uint64: + trip_count = kmp_calculate_trip_count_XX<kmp_uint64>( + /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds)); + break; + default: + KMP_ASSERT(false); + } + + return trip_count; +} + +//----------Trim original iv according to its type---------------------------- + +// Trim original iv according to its type. +// Return kmp_uint64 value which can be easily used in all internal calculations +// And can be statically cast back to original type in user code. +kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) { + kmp_uint64 res = 0; + + switch (loop_iv_type) { + case loop_type_t::loop_type_int8: + res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv)); + break; + case loop_type_t::loop_type_uint8: + res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv)); + break; + case loop_type_t::loop_type_int16: + res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv)); + break; + case loop_type_t::loop_type_uint16: + res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv)); + break; + case loop_type_t::loop_type_int32: + res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv)); + break; + case loop_type_t::loop_type_uint32: + res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv)); + break; + case loop_type_t::loop_type_int64: + res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv)); + break; + case loop_type_t::loop_type_uint64: + res = static_cast<kmp_uint64>(original_iv); + break; + default: + KMP_ASSERT(false); + } + + return res; +} + +//----------Compare two IVs (remember they have a type)----------------------- + +bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1, + kmp_uint64 original_iv2) { + bool res = false; + + switch (loop_iv_type) { + case loop_type_t::loop_type_int8: + res = static_cast<kmp_int8>(original_iv1) == + static_cast<kmp_int8>(original_iv2); + break; + case loop_type_t::loop_type_uint8: + res = static_cast<kmp_uint8>(original_iv1) == + static_cast<kmp_uint8>(original_iv2); + break; + case loop_type_t::loop_type_int16: + res = static_cast<kmp_int16>(original_iv1) == + static_cast<kmp_int16>(original_iv2); + break; + case loop_type_t::loop_type_uint16: + res = static_cast<kmp_uint16>(original_iv1) == + static_cast<kmp_uint16>(original_iv2); + break; + case loop_type_t::loop_type_int32: + res = static_cast<kmp_int32>(original_iv1) == + static_cast<kmp_int32>(original_iv2); + break; + case loop_type_t::loop_type_uint32: + res = static_cast<kmp_uint32>(original_iv1) == + static_cast<kmp_uint32>(original_iv2); + break; + case loop_type_t::loop_type_int64: + res = static_cast<kmp_int64>(original_iv1) == + static_cast<kmp_int64>(original_iv2); + break; + case loop_type_t::loop_type_uint64: + res = static_cast<kmp_uint64>(original_iv1) == + static_cast<kmp_uint64>(original_iv2); + break; + default: + 
KMP_ASSERT(false); + } + + return res; +} + +//----------Calculate original iv on one level-------------------------------- + +// Return true if the point fits into upper bounds on this level, +// false otherwise +template <typename T> +bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds, + const kmp_point_t original_ivs, + kmp_index_t ind) { + + T iv = static_cast<T>(original_ivs[ind]); + T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]); + + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) { + // The calculated point is outside of loop upper boundary: + return false; + } + + return true; +} + +// Calculate one iv corresponding to iteration on the level ind. +// Return true if it fits into lower-upper bounds on this level +// (if not, we need to re-calculate) +template <typename T> +bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds, + /*in/out*/ kmp_point_t original_ivs, + const kmp_iterations_t iterations, kmp_index_t ind, + bool start_with_lower_bound, bool checkBounds) { + + kmp_uint64 temp = 0; + T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]); + + if (start_with_lower_bound) { + // we moved to the next iteration on one of outer loops, should start + // with the lower bound here: + temp = bounds->lb0 + bounds->lb1 * outer_iv; + } else { + auto iteration = iterations[ind]; + temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step; + } + + // Now trim original iv according to its type: + original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); + + if (checkBounds) { + return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind); + } else { + return true; + } +} + +bool kmp_calc_one_iv(const bounds_info_t *bounds, + /*in/out*/ kmp_point_t original_ivs, + const kmp_iterations_t iterations, kmp_index_t ind, + bool start_with_lower_bound, bool checkBounds) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + return kmp_calc_one_iv_XX<kmp_int32>( + (bounds_infoXX_template<kmp_int32> *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + case loop_type_t::loop_type_uint32: + return kmp_calc_one_iv_XX<kmp_uint32>( + (bounds_infoXX_template<kmp_uint32> *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + case loop_type_t::loop_type_int64: + return kmp_calc_one_iv_XX<kmp_int64>( + (bounds_infoXX_template<kmp_int64> *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + case loop_type_t::loop_type_uint64: + return kmp_calc_one_iv_XX<kmp_uint64>( + (bounds_infoXX_template<kmp_uint64> *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + default: + KMP_ASSERT(false); + return false; + } +} + +//----------Calculate original iv on one level for rectangular loop nest------ + +// Calculate one iv corresponding to iteration on the level ind. 
+template <typename T>
+void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
+                                /*in/out*/ kmp_uint64 *original_ivs,
+                                const kmp_iterations_t iterations,
+                                kmp_index_t ind) {
+
+  auto iteration = iterations[ind];
+
+  kmp_uint64 temp =
+      bounds->lb0 +
+      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
+      iteration * bounds->step;
+
+  // Now trim original iv according to its type:
+  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
+}
+
+void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
+                             /*in/out*/ kmp_uint64 *original_ivs,
+                             const kmp_iterations_t iterations,
+                             kmp_index_t ind) {
+
+  switch (bounds->loop_type) {
+  case loop_type_t::loop_type_int32:
+    kmp_calc_one_iv_rectang_XX<kmp_int32>(
+        (bounds_infoXX_template<kmp_int32> *)(bounds),
+        /*in/out*/ original_ivs, iterations, ind);
+    break;
+  case loop_type_t::loop_type_uint32:
+    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
+        (bounds_infoXX_template<kmp_uint32> *)(bounds),
+        /*in/out*/ original_ivs, iterations, ind);
+    break;
+  case loop_type_t::loop_type_int64:
+    kmp_calc_one_iv_rectang_XX<kmp_int64>(
+        (bounds_infoXX_template<kmp_int64> *)(bounds),
+        /*in/out*/ original_ivs, iterations, ind);
+    break;
+  case loop_type_t::loop_type_uint64:
+    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
+        (bounds_infoXX_template<kmp_uint64> *)(bounds),
+        /*in/out*/ original_ivs, iterations, ind);
+    break;
+  default:
+    KMP_ASSERT(false);
+  }
+}
+
+//----------------------------------------------------------------------------
+// Rectangular loop nest
+//----------------------------------------------------------------------------
+
+//----------Canonicalize loop nest and calculate trip count-------------------
+
+// Canonicalize loop nest and calculate overall trip count.
+// "bounds_nest" has to be allocated per thread.
+// API will modify original bounds_nest array to bring it to a canonical form
+// (only <= and >=, no !=, <, >). If the original loop nest was already in a
+// canonical form there will be no changes to bounds in bounds_nest array
+// (only trip counts will be calculated).
+// Returns trip count of overall space.
+extern "C" kmp_loop_nest_iv_t
+__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
+                                 /*in/out*/ bounds_info_t *original_bounds_nest,
+                                 kmp_index_t n) {
+
+  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
+
+  kmp_loop_nest_iv_t total = 1;
+
+  for (kmp_index_t ind = 0; ind < n; ++ind) {
+    auto bounds = &(original_bounds_nest[ind]);
+
+    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
+    total *= trip_count;
+  }
+
+  return total;
+}
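The decode step in __kmpc_calc_original_ivs_rectang below is plain mixed-radix arithmetic over the per-loop trip counts. A standalone sketch with made-up trip counts (outer 3, inner 4):

```cpp
#include <cassert>
#include <cstdint>

// Decode a collapsed iteration number into per-loop iterations, innermost
// loop first, exactly like the div/mod loop in the function below.
int main() {
  uint64_t trip[2] = {3, 4}; // outer loop runs 3 times, inner loop 4 times
  uint64_t iterations[2];
  uint64_t new_iv = 7; // 8th point of the 3*4 = 12-point collapsed space

  for (int ind = 2; ind > 0;) {
    --ind;
    iterations[ind] = new_iv % trip[ind];
    new_iv /= trip[ind];
  }

  assert(new_iv == 0);
  assert(iterations[0] == 1 && iterations[1] == 3); // outer iter 1, inner 3
  return 0;
}
```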
+
+//----------Calculate old induction variables---------------------------------
+
+// Calculate old induction variables corresponding to overall new_iv.
+// Note: the original IV will be returned as if it had kmp_uint64 type and
+// will have to be converted back to the original type in user code.
+// Note: trip counts should be already calculated by
+// __kmpc_process_loop_nest_rectang.
+// OMPTODO: special case 2, 3 nested loops: either do a different
+// interface without array or possibly template this over n
+extern "C" void
+__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
+                                 const bounds_info_t *original_bounds_nest,
+                                 /*out*/ kmp_uint64 *original_ivs,
+                                 kmp_index_t n) {
+
+  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
+
+  // First, calc corresponding iteration in every original loop:
+  for (kmp_index_t ind = n; ind > 0;) {
+    --ind;
+    auto bounds = &(original_bounds_nest[ind]);
+
+    // should be optimized to OPDIVREM:
+    auto temp = new_iv / bounds->trip_count;
+    auto iteration = new_iv % bounds->trip_count;
+    new_iv = temp;
+
+    iterations[ind] = iteration;
+  }
+  KMP_ASSERT(new_iv == 0);
+
+  for (kmp_index_t ind = 0; ind < n; ++ind) {
+    auto bounds = &(original_bounds_nest[ind]);
+
+    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
+  }
+}
+
+//----------------------------------------------------------------------------
+// Non-rectangular loop nest
+//----------------------------------------------------------------------------
+
+//----------Calculate maximum possible span of iv values on one level---------
+
+// Calculate span for IV on this loop level for the "<=" case.
+// Note: it's for <= on this loop nest level, so the lower bound should be the
+// smallest value and the upper bound should be the biggest value. If the loop
+// won't execute, 'smallest' may be bigger than 'biggest', but we'd better not
+// switch them around.
+template <typename T>
+void kmp_calc_span_lessoreq_XX(
+    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
+    /* in/out*/ bounds_info_internal_t *bounds_nest) {
+
+  typedef typename traits_t<T>::unsigned_t UT;
+  // typedef typename traits_t<T>::signed_t ST;
+
+  // typedef typename big_span_t span_t;
+  typedef T span_t;
+
+  auto &bbounds = bounds->b;
+
+  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
+    // This dimension depends on one of the previous ones, so it can't be the
+    // outermost one.
+    bounds_info_internalXX_template<T> *previous =
+        reinterpret_cast<bounds_info_internalXX_template<T> *>(
+            &(bounds_nest[bbounds.outer_iv]));
+
+    // OMPTODO: assert that T is compatible with loop variable type on
+    // 'previous' loop
+
+    {
+      span_t bound_candidate1 =
+          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
+      span_t bound_candidate2 =
+          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
+      if (bound_candidate1 < bound_candidate2) {
+        bounds->span_smallest = bound_candidate1;
+      } else {
+        bounds->span_smallest = bound_candidate2;
+      }
+    }
+
+    {
+      // We can't adjust the upper bound with respect to step, because
+      // the lower bound might be off after adjustments
+
+      span_t bound_candidate1 =
+          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
+      span_t bound_candidate2 =
+          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
+      if (bound_candidate1 < bound_candidate2) {
+        bounds->span_biggest = bound_candidate2;
+      } else {
+        bounds->span_biggest = bound_candidate1;
+      }
+    }
+  } else {
+    // Rectangular:
+    bounds->span_smallest = bbounds.lb0;
+    bounds->span_biggest = bbounds.ub0;
+  }
+  if (!bounds->loop_bounds_adjusted) {
+    // Here it's safe to reduce the space to a multiple of the step.
+    // OMPTODO: check if the formula is correct.
+    // Also check if it would be safe to do this if we didn't adjust the left
+    // side.
+    bounds->span_biggest -=
+        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
+  }
+}
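A worked instance of the "<=" span rule above, for a dependent inner bound "j <= i" when the outer IV spans [0, 7] (all values hypothetical):

```cpp
#include <cassert>

// Span of "j <= 0 + 1*i" when the outer IV i spans [0, 7]: evaluate the
// bound expressions at both ends of the outer span and keep the minimum
// as 'smallest' and the maximum as 'biggest', as in the function above.
int main() {
  int prev_smallest = 0, prev_biggest = 7; // span of the outer IV
  int lb0 = 0, lb1 = 0, ub0 = 0, ub1 = 1;  // j = 0; j <= i

  int lo1 = lb0 + lb1 * prev_smallest, lo2 = lb0 + lb1 * prev_biggest;
  int hi1 = ub0 + ub1 * prev_smallest, hi2 = ub0 + ub1 * prev_biggest;

  int span_smallest = (lo1 < lo2) ? lo1 : lo2;
  int span_biggest = (hi1 < hi2) ? hi2 : hi1;

  assert(span_smallest == 0 && span_biggest == 7);
  return 0;
}
```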
+
+// Calculate span for IV on this loop level for the ">=" case.
+template <typename T>
+void kmp_calc_span_greateroreq_XX(
+    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
+    /* in/out*/ bounds_info_internal_t *bounds_nest) {
+
+  typedef typename traits_t<T>::unsigned_t UT;
+  // typedef typename traits_t<T>::signed_t ST;
+
+  // typedef typename big_span_t span_t;
+  typedef T span_t;
+
+  auto &bbounds = bounds->b;
+
+  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
+    // This dimension depends on one of the previous ones, so it can't be the
+    // outermost one.
+    bounds_info_internalXX_template<T> *previous =
+        reinterpret_cast<bounds_info_internalXX_template<T> *>(
+            &(bounds_nest[bbounds.outer_iv]));
+
+    // OMPTODO: assert that T is compatible with loop variable type on
+    // 'previous' loop
+
+    {
+      span_t bound_candidate1 =
+          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
+      span_t bound_candidate2 =
+          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
+      if (bound_candidate1 >= bound_candidate2) {
+        bounds->span_smallest = bound_candidate1;
+      } else {
+        bounds->span_smallest = bound_candidate2;
+      }
+    }
+
+    {
+      // We can't adjust the upper bound with respect to step, because
+      // the lower bound might be off after adjustments
+
+      span_t bound_candidate1 =
+          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
+      span_t bound_candidate2 =
+          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
+      if (bound_candidate1 >= bound_candidate2) {
+        bounds->span_biggest = bound_candidate2;
+      } else {
+        bounds->span_biggest = bound_candidate1;
+      }
+    }
+
+  } else {
+    // Rectangular:
+    bounds->span_biggest = bbounds.lb0;
+    bounds->span_smallest = bbounds.ub0;
+  }
+  if (!bounds->loop_bounds_adjusted) {
+    // Here it's safe to reduce the space to a multiple of the step.
+    // OMPTODO: check if the formula is correct.
+    // Also check if it would be safe to do this if we didn't adjust the left
+    // side.
+    bounds->span_biggest -=
+        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
+  }
+}
+
+// Calculate the maximum possible span for IV on this loop level.
+template <typename T>
+void kmp_calc_span_XX(
+    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
+    /* in/out*/ bounds_info_internal_t *bounds_nest) {
+
+  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
+    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
+  } else {
+    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
+    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
+  }
+}
+
+//----------All initial processing of the loop nest---------------------------
+
+// Calculate new bounds for this loop level.
+// To be able to work with the nest we need to get it to a parallelepiped
+// shape. We need to stay in the original range of values, so that there will
+// be no overflow; for that we'll adjust both upper and lower bounds as needed.
+template <typename T>
+void kmp_calc_new_bounds_XX(
+    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
+    /* in/out*/ bounds_info_internal_t *bounds_nest) {
+
+  auto &bbounds = bounds->b;
+
+  if (bbounds.lb1 == bbounds.ub1) {
+    // Already parallel, no need to adjust:
+    bounds->loop_bounds_adjusted = false;
+  } else {
+    bounds->loop_bounds_adjusted = true;
+
+    T old_lb1 = bbounds.lb1;
+    T old_ub1 = bbounds.ub1;
+
+    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
+      // With this shape we can adjust to a rectangle:
+      bbounds.lb1 = 0;
+      bbounds.ub1 = 0;
+    } else {
+      // Get upper and lower bounds to be parallel
+      // with values in the old range.
+      // Note: abs didn't work here.
+ if (((old_lb1 < 0) && (old_lb1 < old_ub1)) || + ((old_lb1 > 0) && (old_lb1 > old_ub1))) { + bbounds.lb1 = old_ub1; + } else { + bbounds.ub1 = old_lb1; + } + } + + // Now need to adjust lb0, ub0, otherwise in some cases space will shrink. + // The idea here that for this IV we are now getting the same span + // irrespective of the previous IV value. + bounds_info_internalXX_template<T> *previous = + reinterpret_cast<bounds_info_internalXX_template<T> *>( + &bounds_nest[bbounds.outer_iv]); + + if (bbounds.comparison == comparison_t::comp_less_or_eq) { + if (old_lb1 < bbounds.lb1) { + KMP_ASSERT(old_lb1 < 0); + // The length is good on outer_iv biggest number, + // can use it to find where to move the lower bound: + + T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest; + bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space? + // e.g. it was 0?? (same below) + } else if (old_lb1 > bbounds.lb1) { + // still need to move lower bound: + T add = (old_lb1 - bbounds.lb1) * previous->span_smallest; + bbounds.lb0 += add; + } + + if (old_ub1 > bbounds.ub1) { + KMP_ASSERT(old_ub1 > 0); + // The length is good on outer_iv biggest number, + // can use it to find where to move upper bound: + + T add = (old_ub1 - bbounds.ub1) * previous->span_biggest; + bbounds.ub0 += add; + } else if (old_ub1 < bbounds.ub1) { + // still need to move upper bound: + T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest; + bbounds.ub0 -= sub; + } + } else { + KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq); + if (old_lb1 < bbounds.lb1) { + KMP_ASSERT(old_lb1 < 0); + T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest; + bbounds.lb0 -= sub; + } else if (old_lb1 > bbounds.lb1) { + T add = (old_lb1 - bbounds.lb1) * previous->span_biggest; + bbounds.lb0 += add; + } + + if (old_ub1 > bbounds.ub1) { + KMP_ASSERT(old_ub1 > 0); + T add = (old_ub1 - bbounds.ub1) * previous->span_smallest; + bbounds.ub0 += add; + } else if (old_ub1 < bbounds.ub1) { + T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest; + bbounds.ub0 -= sub; + } + } + } +} + +// Do all processing for one canonicalized loop in the nest +// (assuming that outer loops already were processed): +template <typename T> +kmp_loop_nest_iv_t kmp_process_one_loop_XX( + /* in/out*/ bounds_info_internalXX_template<T> *bounds, + /*in/out*/ bounds_info_internal_t *bounds_nest) { + + kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b)); +} + +// Non-rectangular loop nest, canonicalized to use <= or >=. +// Process loop nest to have a parallelepiped shape, +// calculate biggest spans for IV's on all levels and calculate overall trip +// count. "bounds_nest" has to be allocated per thread. +// Returns overall trip count (for adjusted space). 
+kmp_loop_nest_iv_t kmp_process_loop_nest( + /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) { + + kmp_loop_nest_iv_t total = 1; + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(bounds_nest[ind]); + kmp_loop_nest_iv_t trip_count = 0; + + switch (bounds->b.loop_type) { + case loop_type_t::loop_type_int32: + trip_count = kmp_process_one_loop_XX<kmp_int32>( + /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds), + /*in/out*/ bounds_nest); + break; + case loop_type_t::loop_type_uint32: + trip_count = kmp_process_one_loop_XX<kmp_uint32>( + /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds), + /*in/out*/ bounds_nest); + break; + case loop_type_t::loop_type_int64: + trip_count = kmp_process_one_loop_XX<kmp_int64>( + /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds), + /*in/out*/ bounds_nest); + break; + case loop_type_t::loop_type_uint64: + trip_count = kmp_process_one_loop_XX<kmp_uint64>( + /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds), + /*in/out*/ bounds_nest); + break; + default: + KMP_ASSERT(false); + } + total *= trip_count; + } + + return total; +} + +//----------Calculate iterations (in the original or updated space)----------- + +// Calculate number of iterations in original or updated space resulting in +// original_ivs[ind] (only on this level, non-negative) +// (not counting initial iteration) +template <typename T> +kmp_loop_nest_iv_t +kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds, + const kmp_point_t original_ivs, + kmp_index_t ind) { + + kmp_loop_nest_iv_t iterations = 0; + + if (bounds->comparison == comparison_t::comp_less_or_eq) { + iterations = + (static_cast<T>(original_ivs[ind]) - bounds->lb0 - + bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) / + __kmp_abs(bounds->step); + } else { + KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq); + iterations = (bounds->lb0 + + bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) - + static_cast<T>(original_ivs[ind])) / + __kmp_abs(bounds->step); + } + + return iterations; +} + +// Calculate number of iterations in the original or updated space resulting in +// original_ivs[ind] (only on this level, non-negative) +kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds, + const kmp_point_t original_ivs, + kmp_index_t ind) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + return kmp_calc_number_of_iterations_XX<kmp_int32>( + (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind); + break; + case loop_type_t::loop_type_uint32: + return kmp_calc_number_of_iterations_XX<kmp_uint32>( + (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind); + break; + case loop_type_t::loop_type_int64: + return kmp_calc_number_of_iterations_XX<kmp_int64>( + (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind); + break; + case loop_type_t::loop_type_uint64: + return kmp_calc_number_of_iterations_XX<kmp_uint64>( + (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind); + break; + default: + KMP_ASSERT(false); + return 0; + } +} + +//----------Calculate new iv corresponding to original ivs-------------------- + +// We got a point in the original loop nest. +// Take updated bounds and calculate what new_iv will correspond to this point. +// When we are getting original IVs from new_iv, we have to adjust to fit into +// original loops bounds. 
Getting new_iv for the adjusted original IVs will help +// with making more chunks non-empty. +kmp_loop_nest_iv_t +kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest, + const kmp_point_t original_ivs, + kmp_index_t n) { + + kmp_loop_nest_iv_t new_iv = 0; + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(bounds_nest[ind].b); + + new_iv = new_iv * bounds->trip_count + + kmp_calc_number_of_iterations(bounds, original_ivs, ind); + } + + return new_iv; +} + +//----------Calculate original ivs for provided iterations-------------------- + +// Calculate original IVs for provided iterations, assuming iterations are +// calculated in the original space. +// Loop nest is in canonical form (with <= / >=). +bool kmp_calc_original_ivs_from_iterations( + const bounds_info_t *original_bounds_nest, kmp_index_t n, + /*in/out*/ kmp_point_t original_ivs, + /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) { + + kmp_index_t lengthened_ind = n; + + for (; ind < n;) { + auto bounds = &(original_bounds_nest[ind]); + bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations, + ind, (lengthened_ind < ind), true); + + if (!good) { + // The calculated iv value is too big (or too small for >=): + if (ind == 0) { + // Space is empty: + return false; + } else { + // Go to next iteration on the outer loop: + --ind; + ++iterations[ind]; + lengthened_ind = ind; + for (kmp_index_t i = ind + 1; i < n; ++i) { + iterations[i] = 0; + } + continue; + } + } + ++ind; + } + + return true; +} + +//----------Calculate original ivs for the beginning of the loop nest--------- + +// Calculate IVs for the beginning of the loop nest. +// Note: lower bounds of all loops may not work - +// if on some of the iterations of the outer loops inner loops are empty. +// Loop nest is in canonical form (with <= / >=). +bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest, + kmp_index_t n, + /*out*/ kmp_point_t original_ivs) { + + // Iterations in the original space, multiplied by step: + CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); + for (kmp_index_t ind = n; ind > 0;) { + --ind; + iterations[ind] = 0; + } + + // Now calculate the point: + bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n, + /*in/out*/ original_ivs, + /*in/out*/ iterations, 0); + return b; +} + +//----------Calculate next point in the original loop space------------------- + +// From current set of original IVs calculate next point. +// Return false if there is no next point in the loop bounds. 
+bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest, + kmp_index_t n, const kmp_point_t original_ivs, + /*out*/ kmp_point_t next_original_ivs) { + // Iterations in the original space, multiplied by step (so can be negative): + CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); + // First, calc corresponding iteration in every original loop: + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind); + } + + for (kmp_index_t ind = 0; ind < n; ++ind) { + next_original_ivs[ind] = original_ivs[ind]; + } + + // Next add one step to the iterations on the inner-most level, and see if we + // need to move up the nest: + kmp_index_t ind = n - 1; + ++iterations[ind]; + + bool b = kmp_calc_original_ivs_from_iterations( + original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind); + + return b; +} + +//----------Calculate chunk end in the original loop space-------------------- + +// For one level calculate old induction variable corresponding to overall +// new_iv for the chunk end. +// Return true if it fits into upper bound on this level +// (if not, we need to re-calculate) +template <typename T> +bool kmp_calc_one_iv_for_chunk_end_XX( + const bounds_infoXX_template<T> *bounds, + const bounds_infoXX_template<T> *updated_bounds, + /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations, + kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start, + const kmp_point_t original_ivs_start) { + + // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64> + // big_span_t; + + // OMPTODO: is it good enough, or do we need ST or do we need big_span_t? + T temp = 0; + + T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]); + + if (start_with_lower_bound) { + // we moved to the next iteration on one of outer loops, may as well use + // the lower bound here: + temp = bounds->lb0 + bounds->lb1 * outer_iv; + } else { + // Start in expanded space, but: + // - we need to hit original space lower bound, so need to account for + // that + // - we have to go into original space, even if that means adding more + // iterations than was planned + // - we have to go past (or equal to) previous point (which is the chunk + // starting point) + + auto iteration = iterations[ind]; + + auto step = bounds->step; + + // In case of >= it's negative: + auto accountForStep = + ((bounds->lb0 + bounds->lb1 * outer_iv) - + (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) % + step; + + temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv + + accountForStep + iteration * step; + + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) { + // Too small (or too big), didn't reach the original lower bound. 
Use
+      // heuristic:
+      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
+    }
+
+    if (compare_with_start) {
+
+      T start = static_cast<T>(original_ivs_start[ind]);
+
+      temp = kmp_fix_iv(bounds->loop_iv_type, temp);
+
+      // On all previous levels the start of the chunk is the same as the end,
+      // so we need to be really careful here:
+      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
+           (temp < start)) ||
+          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
+           (temp > start))) {
+        // The end of the chunk can't be smaller (for >= bigger) than its
+        // start. Use heuristic:
+        temp = start + iteration / 4 * step;
+      }
+    }
+  }
+
+  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
+
+  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
+       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
+      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
+       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
+    // Too big (or too small for >=).
+    return false;
+  }
+
+  return true;
+}
+
+// For one level calculate the old induction variable corresponding to overall
+// new_iv for the chunk end.
+bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
+                                   const bounds_info_t *updated_bounds,
+                                   /*in/out*/ kmp_point_t original_ivs,
+                                   const kmp_iterations_t iterations,
+                                   kmp_index_t ind, bool start_with_lower_bound,
+                                   bool compare_with_start,
+                                   const kmp_point_t original_ivs_start) {
+
+  switch (bounds->loop_type) {
+  case loop_type_t::loop_type_int32:
+    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
+        (bounds_infoXX_template<kmp_int32> *)(bounds),
+        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
+        /*in/out*/
+        original_ivs, iterations, ind, start_with_lower_bound,
+        compare_with_start, original_ivs_start);
+    break;
+  case loop_type_t::loop_type_uint32:
+    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
+        (bounds_infoXX_template<kmp_uint32> *)(bounds),
+        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
+        /*in/out*/
+        original_ivs, iterations, ind, start_with_lower_bound,
+        compare_with_start, original_ivs_start);
+    break;
+  case loop_type_t::loop_type_int64:
+    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
+        (bounds_infoXX_template<kmp_int64> *)(bounds),
+        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
+        /*in/out*/
+        original_ivs, iterations, ind, start_with_lower_bound,
+        compare_with_start, original_ivs_start);
+    break;
+  case loop_type_t::loop_type_uint64:
+    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
+        (bounds_infoXX_template<kmp_uint64> *)(bounds),
+        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
+        /*in/out*/
+        original_ivs, iterations, ind, start_with_lower_bound,
+        compare_with_start, original_ivs_start);
+    break;
+  default:
+    KMP_ASSERT(false);
+    return false;
+  }
+}
+
+// Calculate old induction variables corresponding to overall new_iv for the
+// chunk end. If due to space extension we are getting old IVs outside of the
+// boundaries, bring them into the boundaries. Need to do this in the runtime,
+// esp. on the lower bounds side. When getting the result we need to make sure
+// that the new chunk starts at the next position after the old chunk, not
+// overlapping with it (this is done elsewhere), and we need to make sure the
+// end of the chunk is further than the beginning of the chunk.
We don't need an exact ending point here, just +// something more-or-less close to the desired chunk length, bigger is fine +// (smaller would be fine, but we risk going into infinite loop, so do smaller +// only at the very end of the space). result: false if could not find the +// ending point in the original loop space. In this case the caller can use +// original upper bounds as the end of the chunk. Chunk won't be empty, because +// it'll have at least the starting point, which is by construction in the +// original space. +bool kmp_calc_original_ivs_for_chunk_end( + const bounds_info_t *original_bounds_nest, kmp_index_t n, + const bounds_info_internal_t *updated_bounds_nest, + const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv, + /*out*/ kmp_point_t original_ivs) { + + // Iterations in the expanded space: + CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); + // First, calc corresponding iteration in every modified loop: + for (kmp_index_t ind = n; ind > 0;) { + --ind; + auto &updated_bounds = updated_bounds_nest[ind]; + + // should be optimized to OPDIVREM: + auto new_ind = new_iv / updated_bounds.b.trip_count; + auto iteration = new_iv % updated_bounds.b.trip_count; + + new_iv = new_ind; + iterations[ind] = iteration; + } + KMP_DEBUG_ASSERT(new_iv == 0); + + kmp_index_t lengthened_ind = n; + kmp_index_t equal_ind = -1; + + // Next calculate the point, but in original loop nest. + for (kmp_index_t ind = 0; ind < n;) { + auto bounds = &(original_bounds_nest[ind]); + auto updated_bounds = &(updated_bounds_nest[ind].b); + + bool good = kmp_calc_one_iv_for_chunk_end( + bounds, updated_bounds, + /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind), + (equal_ind >= ind - 1), original_ivs_start); + + if (!good) { + // Too big (or too small for >=). + if (ind == 0) { + // Need to reduce to the end. + return false; + } else { + // Go to next iteration on outer loop: + --ind; + ++(iterations[ind]); + lengthened_ind = ind; + if (equal_ind >= lengthened_ind) { + // We've changed the number of iterations here, + // can't be same anymore: + equal_ind = lengthened_ind - 1; + } + for (kmp_index_t i = ind + 1; i < n; ++i) { + iterations[i] = 0; + } + continue; + } + } + + if ((equal_ind == ind - 1) && + (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind], + original_ivs_start[ind]))) { + equal_ind = ind; + } else if ((equal_ind > ind - 1) && + !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind], + original_ivs_start[ind]))) { + equal_ind = ind - 1; + } + ++ind; + } + + return true; +} + +//----------Calculate upper bounds for the last chunk------------------------- + +// Calculate one upper bound for the end. 
+template <typename T> +void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds, + /*in/out*/ kmp_point_t original_ivs, + kmp_index_t ind) { + + T temp = bounds->ub0 + + bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]); + + original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); +} + +void kmp_calc_one_iv_end(const bounds_info_t *bounds, + /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) { + + switch (bounds->loop_type) { + default: + KMP_ASSERT(false); + break; + case loop_type_t::loop_type_int32: + kmp_calc_one_iv_end_XX<kmp_int32>( + (bounds_infoXX_template<kmp_int32> *)(bounds), + /*in/out*/ original_ivs, ind); + break; + case loop_type_t::loop_type_uint32: + kmp_calc_one_iv_end_XX<kmp_uint32>( + (bounds_infoXX_template<kmp_uint32> *)(bounds), + /*in/out*/ original_ivs, ind); + break; + case loop_type_t::loop_type_int64: + kmp_calc_one_iv_end_XX<kmp_int64>( + (bounds_infoXX_template<kmp_int64> *)(bounds), + /*in/out*/ original_ivs, ind); + break; + case loop_type_t::loop_type_uint64: + kmp_calc_one_iv_end_XX<kmp_uint64>( + (bounds_infoXX_template<kmp_uint64> *)(bounds), + /*in/out*/ original_ivs, ind); + break; + } +} + +// Calculate upper bounds for the last loop iteration. Just use original upper +// bounds (adjusted when canonicalized to use <= / >=). No need to check that +// this point is in the original space (it's likely not) +void kmp_calc_original_ivs_for_end( + const bounds_info_t *const original_bounds_nest, kmp_index_t n, + /*out*/ kmp_point_t original_ivs) { + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind); + } +} + +/************************************************************************** + * Identify nested loop structure - loops come in the canonical form + * Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0} + * j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1} + * Upper Triangle matrix + * i = 0; i <= N; i++ {0,0}:{N,0} + * j = 0+1*i; j <= N; j++ {0,1}:{N,0} + * ************************************************************************/ +nested_loop_type_t +kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest, + /*in*/ kmp_index_t n) { + // only 2-level nested loops are supported + if (n != 2) { + return nested_loop_type_unkown; + } + // loops must be canonical + KMP_ASSERT( + (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) && + (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq)); + // check outer loop bounds: for triangular need to be {0,0}:{N,0} + kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].lb0_u64); + kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].ub0_u64); + kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].lb1_u64); + kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].ub1_u64); + if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) { + return nested_loop_type_unkown; + } + // check inner bounds to determine triangle type + kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].lb0_u64); + kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].ub0_u64); + kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + 
original_bounds_nest[1].lb1_u64); + kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].ub1_u64); + // lower triangle loop inner bounds need to be {0,0}:{0/-1,1} + if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 && + (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) { + return nested_loop_type_lower_triangular_matrix; + } + // upper triangle loop inner bounds need to be {0,1}:{N,0} + if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 && + inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) { + return nested_loop_type_upper_triangular_matrix; + } + return nested_loop_type_unkown; +} + +/************************************************************************** + * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf + * Start point is x so the result is always > sqrt(x) + * The method has uniform convergence, PRECISION is set to 0.1 + * ************************************************************************/ +#define level_of_precision 0.1 +double sqrt_newton_approx(/*in*/ kmp_uint64 x) { + double sqrt_old = 0.; + double sqrt_new = (double)x; + do { + sqrt_old = sqrt_new; + sqrt_new = (sqrt_old + x / sqrt_old) / 2; + } while ((sqrt_old - sqrt_new) > level_of_precision); + return sqrt_new; +} + +/************************************************************************** + * Handle lower triangle matrix in the canonical form + * i = 0; i <= N; i++ {0,0}:{N,0} + * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1} + * ************************************************************************/ +void kmp_handle_lower_triangle_matrix( + /*in*/ kmp_uint32 nth, + /*in*/ kmp_uint32 tid, + /*in */ kmp_index_t n, + /*in/out*/ bounds_info_t *original_bounds_nest, + /*out*/ bounds_info_t *chunk_bounds_nest) { + + // transfer loop types from the original loop to the chunks + for (kmp_index_t i = 0; i < n; ++i) { + chunk_bounds_nest[i] = original_bounds_nest[i]; + } + // cleanup iv variables + kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].ub0_u64); + kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].lb0_u64); + kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].ub0_u64); + // calculate the chunk's lower and upper bounds + // the total number of iterations in the loop is the sum of the arithmetic + // progression from the outer lower to outer upper bound (inclusive since the + // loop is canonical) note that less_than inner loops (inner_ub0 = -1) + // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0 + // + 1) -> N - 1 + kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0; + kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2; + // the current thread's number of iterations: + // each thread gets an equal number of iterations: total number of iterations + // divided by the number of threads plus, if there's a remainder, + // the first threads with the number up to the remainder get an additional + // iteration each to cover it + kmp_uint64 iter_current = + iter_total / nth + ((tid < (iter_total % nth)) ? 
1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with the tid below the remainder will have (iter_total/nth+1)
+  // elements, and so will all threads before them, so the cumulative number
+  // of iterations executed by all the previous threads will be the current
+  // thread's number of iterations multiplied by the number of previous
+  // threads, which is equal to the current thread's tid; threads with the
+  // number equal or above the remainder will have (iter_total/nth) elements,
+  // so the cumulative number of iterations previously executed is its number
+  // of iterations multiplied by the number of previous threads, which is
+  // again equal to the current thread's tid, PLUS all the remainder
+  // iterations that will have been executed by the previous threads
+  kmp_uint64 iter_before_current =
+      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+  // cumulative number of iterations executed with the current thread is
+  // the cumulative number executed before it plus its own
+  kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo) which is the max outer iv
+  // value that gives the number of iterations that is equal or just below the
+  // total number of iterations executed by the previous threads; for
+  // less_than (1-based) inner loops (inner_ub0 == -1) it is
+  // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0,
+  // for less_than_equal (0-based) inner loops (inner_ub == 0) it is
+  // lbo*(lbo+1)/2<=iter_before_current => lbo^2+lbo-2*iter_before_current<=0;
+  // both cases can be handled similarly using a parameter to control the
+  // equation sign
+  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
+  kmp_uint64 lower_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+                                      8 * iter_before_current) +
+                   inner_adjustment) /
+          2 -
+      inner_adjustment;
+  // calculate the inner loop lower bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed by the
+  // previous threads giving the starting point of this thread
+  kmp_uint64 lower_bound_inner =
+      iter_before_current -
+      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // inner bound except using the total number of iterations executed with the
+  // current thread
+  kmp_uint64 upper_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+                                      8 * iter_with_current) +
+                   inner_adjustment) /
+          2 -
+      inner_adjustment;
+  // calculate the inner loop upper bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed after
+  // the current thread giving the starting point of the next thread
+  kmp_uint64 upper_bound_inner =
+      iter_with_current -
+      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element so they point at the last
+  // iteration of the current thread rather than the first iteration of the
+  // next thread
+  if (upper_bound_inner == 0) {
+    // {n,0} => {n-1,n-1}
+    upper_bound_outer -= 1;
+    upper_bound_inner = upper_bound_outer;
+  } else {
+    // {n,m} => {n,m-1} (m!=0)
+    upper_bound_inner -= 1;
+  }
+
+  // assign the values, zeroing out lb1 and ub1 values since the iteration
+  // space is now one-dimensional
+  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
+  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
+  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
+  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
+  chunk_bounds_nest[0].lb1_u64 = 0;
+  chunk_bounds_nest[0].ub1_u64 = 0;
+  chunk_bounds_nest[1].lb1_u64 = 0;
+  chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
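To make the formulas above concrete, here is a worked instance for a hypothetical lower-triangular nest i = 0..4, j = 0..i (15 iterations) split across 3 threads; std::sqrt stands in for sqrt_newton_approx here, and its slight difference from the runtime's overestimate does not change these integer results:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

// Lower-triangle split for i = 0..4, j = 0..i (15 iterations),
// nth = 3 threads, shown for tid = 1.
int main() {
  uint64_t outer_iters = 5;
  uint64_t iter_total = outer_iters * (outer_iters + 1) / 2; // 15
  uint64_t nth = 3, tid = 1;

  uint64_t iter_current =
      iter_total / nth + ((tid < iter_total % nth) ? 1 : 0); // 5 each
  uint64_t iter_before =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : iter_total % nth);
  uint64_t iter_with = iter_before + iter_current; // 5 and 10

  int64_t adj = 1; // inner_adjustment for a 0-based (<=) inner loop

  uint64_t lbo =
      (uint64_t)(std::sqrt((double)(adj * adj + 8 * iter_before)) + adj) / 2 -
      adj;
  uint64_t lbi = iter_before - (lbo + adj) * lbo / 2;
  uint64_t ubo =
      (uint64_t)(std::sqrt((double)(adj * adj + 8 * iter_with)) + adj) / 2 -
      adj;
  uint64_t ubi = iter_with - (ubo + adj) * ubo / 2;
  if (ubi == 0) { // {n,0} => {n-1,n-1}
    ubo -= 1;
    ubi = ubo;
  } else { // {n,m} => {n,m-1}
    ubi -= 1;
  }

  // Thread 1 gets the contiguous chunk (i,j) = (2,2) .. (3,3): 5 iterations.
  assert(lbo == 2 && lbi == 2 && ubo == 3 && ubi == 3);
  return 0;
}
```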
+
+/**************************************************************************
+ * Handle upper triangle matrix in the canonical form
+ *  i = 0; i <= N; i++        {0,0}:{N,0}
+ *  j = 0+1*i; j <= N; j++    {0,1}:{N,0}
+ * ************************************************************************/
+void kmp_handle_upper_triangle_matrix(
+    /*in*/ kmp_uint32 nth,
+    /*in*/ kmp_uint32 tid,
+    /*in */ kmp_index_t n,
+    /*in/out*/ bounds_info_t *original_bounds_nest,
+    /*out*/ bounds_info_t *chunk_bounds_nest) {
+
+  // transfer loop types from the original loop to the chunks
+  for (kmp_index_t i = 0; i < n; ++i) {
+    chunk_bounds_nest[i] = original_bounds_nest[i];
+  }
+  // cleanup iv variables
+  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].lb0_u64);
+  [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
+      original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
+  // calculate the chunk's lower and upper bounds
+  // the total number of iterations in the loop is the sum of the arithmetic
+  // progression from the outer lower to outer upper bound (inclusive since
+  // the loop is canonical); note that less_than inner loops (inner_ub0 = -1)
+  // effectively make the progression 1-based, making
+  // N = (outer_ub0 - inner_lb0 + 1) -> N - 1
+  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
+  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+  // the current thread's number of iterations:
+  // each thread gets an equal number of iterations: total number of
+  // iterations divided by the number of threads plus, if there's a remainder,
+  // the first threads with the number up to the remainder get an additional
+  // iteration each to cover it
+  kmp_uint64 iter_current =
+      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with the tid below the remainder will have (iter_total/nth+1)
+  // elements, and so will all threads before them, so the cumulative number
+  // of iterations executed by all the previous threads will be the current
+  // thread's number of iterations multiplied by the number of previous
+  // threads, which is equal to the current thread's tid; threads with the
+  // number equal or above the remainder will have (iter_total/nth) elements,
+  // so the cumulative number of iterations previously executed is its number
+  // of iterations multiplied by the number of previous threads, which is
+  // again equal to the current thread's tid, PLUS all the remainder
+  // iterations that will have been executed by the previous threads
+  kmp_uint64 iter_before_current =
+      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+  // cumulative number of iterations executed with the current thread is
+  // the cumulative number executed before it plus its own
+  kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo) which is the max outer iv
+  // value that gives the number of iterations that is equal or just below the
+  // total number of iterations executed by the previous threads:
+  // lbo*(lbo+1)/2<=iter_before_current =>
+  // lbo^2+lbo-2*iter_before_current<=0
+  kmp_uint64 lower_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
+  // calculate the inner loop lower bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed by the
+  // previous threads giving the starting point of this thread
+  kmp_uint64 lower_bound_inner =
+      iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // inner bound except using the total number of iterations executed with the
+  // current thread
+  kmp_uint64 upper_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
+  // calculate the inner loop upper bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed after
+  // the current thread giving the starting point of the next thread
+  kmp_uint64 upper_bound_inner =
+      iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element so they point at the last
+  // iteration of the current thread rather than the first iteration of the
+  // next thread
+  if (upper_bound_inner == 0) {
+    // {n,0} => {n-1,n-1}
+    upper_bound_outer -= 1;
+    upper_bound_inner = upper_bound_outer;
+  } else {
+    // {n,m} => {n,m-1} (m!=0)
+    upper_bound_inner -= 1;
+  }
+
+  // assign the values, zeroing out lb1 and ub1 values since the iteration
+  // space is now one-dimensional
+  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
+  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
+  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
+  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
+  chunk_bounds_nest[0].lb1_u64 = 0;
+  chunk_bounds_nest[0].ub1_u64 = 0;
+  chunk_bounds_nest[1].lb1_u64 = 0;
+  chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
+//----------Init API for non-rectangular loops--------------------------------
+
+// Init API for collapsed loops (static, no chunks defined).
+// "bounds_nest" has to be allocated per thread.
+// API will modify original bounds_nest array to bring it to a canonical form
+// (only <= and >=, no !=, <, >). If the original loop nest was already in a
+// canonical form there will be no changes to bounds in bounds_nest array
+// (only trip counts will be calculated). 
+//----------Init API for non-rectangular loops--------------------------------
+
+// Init API for collapsed loops (static, no chunks defined).
+// "bounds_nest" has to be allocated per thread.
+// API will modify the original bounds_nest array to bring it to a canonical
+// form (only <= and >=, no !=, <, >). If the original loop nest was already in
+// a canonical form there will be no changes to the bounds in the bounds_nest
+// array (only trip counts will be calculated). Internally the API will expand
+// the space to a parallelogram/parallelepiped, calculate the total, calculate
+// bounds for the chunks in terms of the new IV, re-calculate them in terms of
+// the old IVs (especially important on the left side, to hit the lower bounds
+// and not step over), and pick the correct chunk for this thread (so it will
+// calculate chunks up to the needed one). It could be optimized to calculate
+// just this chunk, potentially a bit less well distributed among threads. It
+// is designed to make sure that threads will receive predictable chunks,
+// deterministically (so that the next nest of loops with similar
+// characteristics will get exactly the same chunks on the same threads).
+// Current contract: chunk_bounds_nest has only lb0 and ub0;
+// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
+// future.)
+extern "C" kmp_int32
+__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
+                          /*in/out*/ bounds_info_t *original_bounds_nest,
+                          /*out*/ bounds_info_t *chunk_bounds_nest,
+                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
+
+  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
+  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
+
+  if (__kmp_env_consistency_check) {
+    __kmp_push_workshare(gtid, ct_pdo, loc);
+  }
+
+  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
+
+  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
+
+  for (kmp_index_t i = 0; i < n; ++i) {
+    updated_bounds_nest[i].b = original_bounds_nest[i];
+  }
+
+  kmp_loop_nest_iv_t total =
+      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);
+
+  if (plastiter != NULL) {
+    *plastiter = FALSE;
+  }
+
+  if (total == 0) {
+    // Loop won't execute:
+    return FALSE;
+  }
+
+  // OMPTODO: DISTRIBUTE is not supported yet
+  __kmp_assert_valid_gtid(gtid);
+  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
+
+  kmp_info_t *th = __kmp_threads[gtid];
+  kmp_team_t *team = th->th.th_team;
+  kmp_uint32 nth = team->t.t_nproc; // Number of threads
+
+  KMP_DEBUG_ASSERT(tid < nth);
+
+  // Handle special cases
+  nested_loop_type_t loop_type =
+      kmp_identify_nested_loop_structure(original_bounds_nest, n);
+  if (loop_type == nested_loop_type_lower_triangular_matrix) {
+    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
+                                     chunk_bounds_nest);
+    return TRUE;
+  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
+    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
+                                     chunk_bounds_nest);
+    return TRUE;
+  }
+
+  CollapseAllocator<kmp_uint64> original_ivs_start(n);
+
+  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
+                                       /*out*/ original_ivs_start)) {
+    // Loop won't execute:
+    return FALSE;
+  }
+
+  // Not doing this optimization for one thread:
+  // (1) more to test
+  // (2) without it the current contract holds: chunk_bounds_nest has only lb0
+  // and ub0, while lb1 and ub1 are set to 0 and can be ignored.
+  // if (nth == 1) {
+  //   // One thread:
+  //   // Copy all info from original_bounds_nest, it'll be good enough.
+
+  //   for (kmp_index_t i = 0; i < n; ++i) {
+  //     chunk_bounds_nest[i] = original_bounds_nest[i];
+  //   }
+
+  //   if (plastiter != NULL) {
+  //     *plastiter = TRUE;
+  //   }
+  //   return TRUE;
+  //}
+
+  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
+      updated_bounds_nest, original_ivs_start, n);
+
+  bool last_iter = false;
+
+  for (; nth > 0;) {
+    // We could calculate the chunk size once, but recomputing it compensates
+    // for the fact that the original space is not a parallelepiped and some
+    // threads can be left without work:
+    KMP_DEBUG_ASSERT(total >= new_iv);
+
+    kmp_loop_nest_iv_t total_left = total - new_iv;
+    kmp_loop_nest_iv_t chunk_size = total_left / nth;
+    kmp_loop_nest_iv_t remainder = total_left % nth;
+
+    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
+
+    if (remainder > 0) {
+      ++curr_chunk_size;
+      --remainder;
+    }
+
+#if defined(KMP_DEBUG)
+    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
+#endif
+
+    if (curr_chunk_size > 1) {
+      new_iv += curr_chunk_size - 1;
+    }
+
+    CollapseAllocator<kmp_uint64> original_ivs_end(n);
+    if ((nth == 1) || (new_iv >= total - 1)) {
+      // Do this one till the end - just in case we miscalculated and either
+      // too much is left to process or new_iv is a bit too big:
+      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
+                                    /*out*/ original_ivs_end);
+
+      last_iter = true;
+    } else {
+      // Note: here we make sure it's past (or equal to) the previous point.
+      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
+                                               updated_bounds_nest,
+                                               original_ivs_start, new_iv,
+                                               /*out*/ original_ivs_end)) {
+        // We could not find the ending point, use the original upper bounds:
+        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
+                                      /*out*/ original_ivs_end);
+
+        last_iter = true;
+      }
+    }
+
+#if defined(KMP_DEBUG)
+    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
+        updated_bounds_nest, original_ivs_end, n);
+    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
+#endif
+
+    if (last_iter && (tid != 0)) {
+      // We are done, this was the last chunk, but no chunk for the current
+      // thread was found:
+      return FALSE;
+    }
+
+    if (tid == 0) {
+      // We found the chunk for this thread, now we need to check if it's the
+      // last chunk or not:
+
+      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
+      if (last_iter ||
+          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
+                                      /*out*/ original_ivs_next_start)) {
+        // no more loop iterations left to process,
+        // which means the currently found chunk is the last chunk:
+        if (plastiter != NULL) {
+          *plastiter = TRUE;
+        }
+      }
+
+      // Fill in the chunk bounds:
+      for (kmp_index_t i = 0; i < n; ++i) {
+        chunk_bounds_nest[i] =
+            original_bounds_nest[i]; // To fill in types, etc. - optional
+        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
+        chunk_bounds_nest[i].lb1_u64 = 0;
+
+        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
+        chunk_bounds_nest[i].ub1_u64 = 0;
+      }
+
+      return TRUE;
+    }
+
+    --tid;
+    --nth;
+
+    bool next_chunk = kmp_calc_next_original_ivs(
+        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
+    if (!next_chunk) {
+      // no more loop iterations to process,
+      // the previous chunk was the last chunk
+      break;
+    }
+
+    // original_ivs_start now points just past the previous chunk's
+    // original_ivs_end; the new chunk starts here so that chunks follow one
+    // after another without any gap or overlap:
+    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
+                                               original_ivs_start, n);
+  }
+
+  return FALSE;
+}
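
The triangular split in kmp_handle_upper_triangle_matrix can be exercised as a small standalone program. The sketch below is only an illustration of the arithmetic shown in the diff, under stated assumptions: a canonical two-loop nest with outer_iters = N, std::sqrt standing in for the runtime's sqrt_newton_approx, and plain uint64_t standing in for kmp_uint64. It prints each thread's chunk after mirroring back from triangle coordinates into the original upper triangle.

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t N = 5;   // outer_iters: i runs 0..4, j runs i..4
  const uint64_t nth = 3; // number of threads
  const uint64_t total = N * (N + 1) / 2; // 15 iterations in the triangle

  for (uint64_t tid = 0; tid < nth; ++tid) {
    // per-thread share; the remainder is spread over the first threads
    uint64_t current = total / nth + (tid < total % nth ? 1 : 0);
    uint64_t before = tid * current + (tid < total % nth ? 0 : total % nth);
    uint64_t with_current = before + current;
    // largest k such that k*(k+1)/2 <= before (resp. with_current);
    // std::sqrt stands in for the runtime's sqrt_newton_approx
    uint64_t lbo = ((uint64_t)std::sqrt((double)(1 + 8 * before)) + 1) / 2 - 1;
    uint64_t lbi = before - lbo * (lbo + 1) / 2;
    uint64_t ubo =
        ((uint64_t)std::sqrt((double)(1 + 8 * with_current)) + 1) / 2 - 1;
    uint64_t ubi = with_current - ubo * (ubo + 1) / 2;
    // step the exclusive end back to the last iteration of this thread
    if (ubi == 0) {
      ubo -= 1;
      ubi = ubo;
    } else {
      ubi -= 1;
    }
    // mirror from triangle coordinates back into the original upper triangle
    printf("tid %llu: %llu iters, lb {%llu,%llu} ub {%llu,%llu}\n",
           (unsigned long long)tid, (unsigned long long)current,
           (unsigned long long)(N - 1 - ubo), (unsigned long long)(N - 1 - ubi),
           (unsigned long long)(N - 1 - lbo), (unsigned long long)(N - 1 - lbi));
  }
  return 0;
}

With N = 5 and nth = 3 each thread receives exactly 5 of the 15 iterations: {2,3}..{4,4}, {1,1}..{2,2} and {0,0}..{0,4}, chunks that touch without gap or overlap.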
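For the general (non-triangular) path, the chunk walk at the end of __kmpc_for_collapsed_init can be pictured in one dimension. The sketch below is a simplified rendering under stated assumptions: the nest is already collapsed to a single new IV in [0, total), and the translation back to the original IVs (kmp_calc_original_ivs_for_chunk_end and related helpers) is omitted. chunk_for_thread is a hypothetical helper written for this illustration, not part of the runtime.

#include <cstdint>
#include <cstdio>

// Hypothetical helper (not part of the runtime): give thread `tid` of `nth`
// its half-open range [*lo, *hi) of a collapsed space with `total` iterations.
// Assumes tid < nth.
static void chunk_for_thread(uint64_t total, uint32_t nth, uint32_t tid,
                             uint64_t *lo, uint64_t *hi) {
  uint64_t new_iv = 0; // current position in the collapsed iteration space
  for (uint32_t t = tid;; --t, --nth) {
    // recompute the chunk size from what is left, as the runtime does, so
    // the remainder is spread deterministically over the leading threads
    uint64_t total_left = total - new_iv;
    uint64_t curr_chunk_size = total_left / nth + (total_left % nth ? 1 : 0);
    if (t == 0) { // reached the chunk belonging to the requested thread
      *lo = new_iv;
      *hi = new_iv + curr_chunk_size;
      return;
    }
    new_iv += curr_chunk_size; // skip one chunk per preceding thread
  }
}

int main() {
  const uint64_t total = 10;
  const uint32_t nth = 4;
  for (uint32_t tid = 0; tid < nth; ++tid) {
    uint64_t lo, hi;
    chunk_for_thread(total, nth, tid, &lo, &hi);
    printf("tid %u: [%llu, %llu)\n", tid, (unsigned long long)lo,
           (unsigned long long)hi);
  }
  return 0;
}

With total = 10 and nth = 4 the walk yields [0,3), [3,6), [6,8) and [8,10): the remainder is absorbed by the earlier threads, and re-running the walk reproduces the same chunks, matching the determinism described in the comment above __kmpc_for_collapsed_init.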
