Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_collapse.cpp
35258 views
/*1* kmp_collapse.cpp -- loop collapse feature2*/34//===----------------------------------------------------------------------===//5//6// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.7// See https://llvm.org/LICENSE.txt for license information.8// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception9//10//===----------------------------------------------------------------------===//1112#include "kmp.h"13#include "kmp_error.h"14#include "kmp_i18n.h"15#include "kmp_itt.h"16#include "kmp_stats.h"17#include "kmp_str.h"18#include "kmp_collapse.h"1920#if OMPT_SUPPORT21#include "ompt-specific.h"22#endif2324// OMPTODO: different style of comments (see kmp_sched)25// OMPTODO: OMPT/OMPD2627// avoid inadevertently using a library based abs28template <typename T> T __kmp_abs(const T val) {29return (val < 0) ? -val : val;30}31kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }32kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }3334//----------------------------------------------------------------------------35// Common functions for working with rectangular and non-rectangular loops36//----------------------------------------------------------------------------3738template <typename T> int __kmp_sign(T val) {39return (T(0) < val) - (val < T(0));40}4142template <typename T> class CollapseAllocator {43typedef T *pT;4445private:46static const size_t allocaSize = 32; // size limit for stack allocations47// (8 bytes x 4 nested loops)48char stackAlloc[allocaSize];49static constexpr size_t maxElemCount = allocaSize / sizeof(T);50pT pTAlloc;5152public:53CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {54if (n > maxElemCount) {55pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));56}57}58~CollapseAllocator() {59if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {60__kmp_free(pTAlloc);61}62}63T &operator[](int index) { return pTAlloc[index]; }64operator const pT() { return pTAlloc; }65};6667//----------Loop 
canonicalization---------------------------------------------6869// For loop nest (any shape):70// convert != to < or >;71// switch from using < or > to <= or >=.72// "bounds" array has to be allocated per thread.73// All other internal functions will work only with canonicalized loops.74template <typename T>75void kmp_canonicalize_one_loop_XX(76ident_t *loc,77/*in/out*/ bounds_infoXX_template<T> *bounds) {7879if (__kmp_env_consistency_check) {80if (bounds->step == 0) {81__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,82loc);83}84}8586if (bounds->comparison == comparison_t::comp_not_eq) {87// We can convert this to < or >, depends on the sign of the step:88if (bounds->step > 0) {89bounds->comparison = comparison_t::comp_less;90} else {91bounds->comparison = comparison_t::comp_greater;92}93}9495if (bounds->comparison == comparison_t::comp_less) {96// Note: ub0 can be unsigned. Should be Ok to hit overflow here,97// because ub0 + ub1*j should be still positive (otherwise loop was not98// well formed)99bounds->ub0 -= 1;100bounds->comparison = comparison_t::comp_less_or_eq;101} else if (bounds->comparison == comparison_t::comp_greater) {102bounds->ub0 += 1;103bounds->comparison = comparison_t::comp_greater_or_eq;104}105}106107// Canonicalize loop nest. 
original_bounds_nest is an array of length n.108void kmp_canonicalize_loop_nest(ident_t *loc,109/*in/out*/ bounds_info_t *original_bounds_nest,110kmp_index_t n) {111112for (kmp_index_t ind = 0; ind < n; ++ind) {113auto bounds = &(original_bounds_nest[ind]);114115switch (bounds->loop_type) {116case loop_type_t::loop_type_int32:117kmp_canonicalize_one_loop_XX<kmp_int32>(118loc,119/*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));120break;121case loop_type_t::loop_type_uint32:122kmp_canonicalize_one_loop_XX<kmp_uint32>(123loc,124/*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));125break;126case loop_type_t::loop_type_int64:127kmp_canonicalize_one_loop_XX<kmp_int64>(128loc,129/*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));130break;131case loop_type_t::loop_type_uint64:132kmp_canonicalize_one_loop_XX<kmp_uint64>(133loc,134/*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));135break;136default:137KMP_ASSERT(false);138}139}140}141142//----------Calculating trip count on one level-------------------------------143144// Calculate trip count on this loop level.145// We do this either for a rectangular loop nest,146// or after an adjustment bringing the loops to a parallelepiped shape.147// This number should not depend on the value of outer IV148// even if the formular has lb1 and ub1.149// Note: for non-rectangular loops don't use span for this, it's too big.150151template <typename T>152kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(153/*in/out*/ bounds_infoXX_template<T> *bounds) {154155if (bounds->comparison == comparison_t::comp_less_or_eq) {156if (bounds->ub0 < bounds->lb0) {157// Note: after this we don't need to calculate inner loops,158// but that should be an edge case:159bounds->trip_count = 0;160} else {161// ub - lb may exceed signed type range; we need to cast to162// kmp_loop_nest_iv_t anyway163bounds->trip_count =164static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /165__kmp_abs(bounds->step) +1661;167}168} else if 
(bounds->comparison == comparison_t::comp_greater_or_eq) {169if (bounds->lb0 < bounds->ub0) {170// Note: after this we don't need to calculate inner loops,171// but that should be an edge case:172bounds->trip_count = 0;173} else {174// lb - ub may exceed signed type range; we need to cast to175// kmp_loop_nest_iv_t anyway176bounds->trip_count =177static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /178__kmp_abs(bounds->step) +1791;180}181} else {182KMP_ASSERT(false);183}184return bounds->trip_count;185}186187// Calculate trip count on this loop level.188kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {189190kmp_loop_nest_iv_t trip_count = 0;191192switch (bounds->loop_type) {193case loop_type_t::loop_type_int32:194trip_count = kmp_calculate_trip_count_XX<kmp_int32>(195/*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));196break;197case loop_type_t::loop_type_uint32:198trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(199/*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));200break;201case loop_type_t::loop_type_int64:202trip_count = kmp_calculate_trip_count_XX<kmp_int64>(203/*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));204break;205case loop_type_t::loop_type_uint64:206trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(207/*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));208break;209default:210KMP_ASSERT(false);211}212213return trip_count;214}215216//----------Trim original iv according to its type----------------------------217218// Trim original iv according to its type.219// Return kmp_uint64 value which can be easily used in all internal calculations220// And can be statically cast back to original type in user code.221kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {222kmp_uint64 res = 0;223224switch (loop_iv_type) {225case loop_type_t::loop_type_int8:226res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));227break;228case 
loop_type_t::loop_type_uint8:229res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));230break;231case loop_type_t::loop_type_int16:232res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));233break;234case loop_type_t::loop_type_uint16:235res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));236break;237case loop_type_t::loop_type_int32:238res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));239break;240case loop_type_t::loop_type_uint32:241res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));242break;243case loop_type_t::loop_type_int64:244res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));245break;246case loop_type_t::loop_type_uint64:247res = static_cast<kmp_uint64>(original_iv);248break;249default:250KMP_ASSERT(false);251}252253return res;254}255256//----------Compare two IVs (remember they have a type)-----------------------257258bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,259kmp_uint64 original_iv2) {260bool res = false;261262switch (loop_iv_type) {263case loop_type_t::loop_type_int8:264res = static_cast<kmp_int8>(original_iv1) ==265static_cast<kmp_int8>(original_iv2);266break;267case loop_type_t::loop_type_uint8:268res = static_cast<kmp_uint8>(original_iv1) ==269static_cast<kmp_uint8>(original_iv2);270break;271case loop_type_t::loop_type_int16:272res = static_cast<kmp_int16>(original_iv1) ==273static_cast<kmp_int16>(original_iv2);274break;275case loop_type_t::loop_type_uint16:276res = static_cast<kmp_uint16>(original_iv1) ==277static_cast<kmp_uint16>(original_iv2);278break;279case loop_type_t::loop_type_int32:280res = static_cast<kmp_int32>(original_iv1) ==281static_cast<kmp_int32>(original_iv2);282break;283case loop_type_t::loop_type_uint32:284res = static_cast<kmp_uint32>(original_iv1) ==285static_cast<kmp_uint32>(original_iv2);286break;287case loop_type_t::loop_type_int64:288res = static_cast<kmp_int64>(original_iv1) 
==289static_cast<kmp_int64>(original_iv2);290break;291case loop_type_t::loop_type_uint64:292res = static_cast<kmp_uint64>(original_iv1) ==293static_cast<kmp_uint64>(original_iv2);294break;295default:296KMP_ASSERT(false);297}298299return res;300}301302//----------Calculate original iv on one level--------------------------------303304// Return true if the point fits into upper bounds on this level,305// false otherwise306template <typename T>307bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,308const kmp_point_t original_ivs,309kmp_index_t ind) {310311T iv = static_cast<T>(original_ivs[ind]);312T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);313314if (((bounds->comparison == comparison_t::comp_less_or_eq) &&315(iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||316((bounds->comparison == comparison_t::comp_greater_or_eq) &&317(iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {318// The calculated point is outside of loop upper boundary:319return false;320}321322return true;323}324325// Calculate one iv corresponding to iteration on the level ind.326// Return true if it fits into lower-upper bounds on this level327// (if not, we need to re-calculate)328template <typename T>329bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,330/*in/out*/ kmp_point_t original_ivs,331const kmp_iterations_t iterations, kmp_index_t ind,332bool start_with_lower_bound, bool checkBounds) {333334kmp_uint64 temp = 0;335T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);336337if (start_with_lower_bound) {338// we moved to the next iteration on one of outer loops, should start339// with the lower bound here:340temp = bounds->lb0 + bounds->lb1 * outer_iv;341} else {342auto iteration = iterations[ind];343temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;344}345346// Now trim original iv according to its type:347original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);348349if (checkBounds) {350return 
kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);351} else {352return true;353}354}355356bool kmp_calc_one_iv(const bounds_info_t *bounds,357/*in/out*/ kmp_point_t original_ivs,358const kmp_iterations_t iterations, kmp_index_t ind,359bool start_with_lower_bound, bool checkBounds) {360361switch (bounds->loop_type) {362case loop_type_t::loop_type_int32:363return kmp_calc_one_iv_XX<kmp_int32>(364(bounds_infoXX_template<kmp_int32> *)(bounds),365/*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,366checkBounds);367break;368case loop_type_t::loop_type_uint32:369return kmp_calc_one_iv_XX<kmp_uint32>(370(bounds_infoXX_template<kmp_uint32> *)(bounds),371/*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,372checkBounds);373break;374case loop_type_t::loop_type_int64:375return kmp_calc_one_iv_XX<kmp_int64>(376(bounds_infoXX_template<kmp_int64> *)(bounds),377/*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,378checkBounds);379break;380case loop_type_t::loop_type_uint64:381return kmp_calc_one_iv_XX<kmp_uint64>(382(bounds_infoXX_template<kmp_uint64> *)(bounds),383/*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,384checkBounds);385break;386default:387KMP_ASSERT(false);388return false;389}390}391392//----------Calculate original iv on one level for rectangular loop nest------393394// Calculate one iv corresponding to iteration on the level ind.395// Return true if it fits into lower-upper bounds on this level396// (if not, we need to re-calculate)397template <typename T>398void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,399/*in/out*/ kmp_uint64 *original_ivs,400const kmp_iterations_t iterations,401kmp_index_t ind) {402403auto iteration = iterations[ind];404405kmp_uint64 temp =406bounds->lb0 +407bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +408iteration * bounds->step;409410// Now trim original iv according to its type:411original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, 
temp);412}413414void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,415/*in/out*/ kmp_uint64 *original_ivs,416const kmp_iterations_t iterations,417kmp_index_t ind) {418419switch (bounds->loop_type) {420case loop_type_t::loop_type_int32:421kmp_calc_one_iv_rectang_XX<kmp_int32>(422(bounds_infoXX_template<kmp_int32> *)(bounds),423/*in/out*/ original_ivs, iterations, ind);424break;425case loop_type_t::loop_type_uint32:426kmp_calc_one_iv_rectang_XX<kmp_uint32>(427(bounds_infoXX_template<kmp_uint32> *)(bounds),428/*in/out*/ original_ivs, iterations, ind);429break;430case loop_type_t::loop_type_int64:431kmp_calc_one_iv_rectang_XX<kmp_int64>(432(bounds_infoXX_template<kmp_int64> *)(bounds),433/*in/out*/ original_ivs, iterations, ind);434break;435case loop_type_t::loop_type_uint64:436kmp_calc_one_iv_rectang_XX<kmp_uint64>(437(bounds_infoXX_template<kmp_uint64> *)(bounds),438/*in/out*/ original_ivs, iterations, ind);439break;440default:441KMP_ASSERT(false);442}443}444445//----------------------------------------------------------------------------446// Rectangular loop nest447//----------------------------------------------------------------------------448449//----------Canonicalize loop nest and calculate trip count-------------------450451// Canonicalize loop nest and calculate overall trip count.452// "bounds_nest" has to be allocated per thread.453// API will modify original bounds_nest array to bring it to a canonical form454// (only <= and >=, no !=, <, >). 
If the original loop nest was already in a455// canonical form there will be no changes to bounds in bounds_nest array456// (only trip counts will be calculated).457// Returns trip count of overall space.458extern "C" kmp_loop_nest_iv_t459__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,460/*in/out*/ bounds_info_t *original_bounds_nest,461kmp_index_t n) {462463kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);464465kmp_loop_nest_iv_t total = 1;466467for (kmp_index_t ind = 0; ind < n; ++ind) {468auto bounds = &(original_bounds_nest[ind]);469470kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);471total *= trip_count;472}473474return total;475}476477//----------Calculate old induction variables---------------------------------478479// Calculate old induction variables corresponding to overall new_iv.480// Note: original IV will be returned as if it had kmp_uint64 type,481// will have to be converted to original type in user code.482// Note: trip counts should be already calculated by483// __kmpc_process_loop_nest_rectang.484// OMPTODO: special case 2, 3 nested loops: either do different485// interface without array or possibly template this over n486extern "C" void487__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,488const bounds_info_t *original_bounds_nest,489/*out*/ kmp_uint64 *original_ivs,490kmp_index_t n) {491492CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);493494// First, calc corresponding iteration in every original loop:495for (kmp_index_t ind = n; ind > 0;) {496--ind;497auto bounds = &(original_bounds_nest[ind]);498499// should be optimized to OPDIVREM:500auto temp = new_iv / bounds->trip_count;501auto iteration = new_iv % bounds->trip_count;502new_iv = temp;503504iterations[ind] = iteration;505}506KMP_ASSERT(new_iv == 0);507508for (kmp_index_t ind = 0; ind < n; ++ind) {509auto bounds = &(original_bounds_nest[ind]);510511kmp_calc_one_iv_rectang(bounds, /*in/out*/ 
original_ivs, iterations, ind);512}513}514515//----------------------------------------------------------------------------516// Non-rectangular loop nest517//----------------------------------------------------------------------------518519//----------Calculate maximum possible span of iv values on one level---------520521// Calculate span for IV on this loop level for "<=" case.522// Note: it's for <= on this loop nest level, so lower bound should be smallest523// value, upper bound should be the biggest value. If the loop won't execute,524// 'smallest' may be bigger than 'biggest', but we'd better not switch them525// around.526template <typename T>527void kmp_calc_span_lessoreq_XX(528/* in/out*/ bounds_info_internalXX_template<T> *bounds,529/* in/out*/ bounds_info_internal_t *bounds_nest) {530531typedef typename traits_t<T>::unsigned_t UT;532// typedef typename traits_t<T>::signed_t ST;533534// typedef typename big_span_t span_t;535typedef T span_t;536537auto &bbounds = bounds->b;538539if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {540// This dimention depends on one of previous ones; can't be the outermost541// one.542bounds_info_internalXX_template<T> *previous =543reinterpret_cast<bounds_info_internalXX_template<T> *>(544&(bounds_nest[bbounds.outer_iv]));545546// OMPTODO: assert that T is compatible with loop variable type on547// 'previous' loop548549{550span_t bound_candidate1 =551bbounds.lb0 + bbounds.lb1 * previous->span_smallest;552span_t bound_candidate2 =553bbounds.lb0 + bbounds.lb1 * previous->span_biggest;554if (bound_candidate1 < bound_candidate2) {555bounds->span_smallest = bound_candidate1;556} else {557bounds->span_smallest = bound_candidate2;558}559}560561{562// We can't adjust the upper bound with respect to step, because563// lower bound might be off after adjustments564565span_t bound_candidate1 =566bbounds.ub0 + bbounds.ub1 * previous->span_smallest;567span_t bound_candidate2 =568bbounds.ub0 + bbounds.ub1 * previous->span_biggest;569if 
(bound_candidate1 < bound_candidate2) {570bounds->span_biggest = bound_candidate2;571} else {572bounds->span_biggest = bound_candidate1;573}574}575} else {576// Rectangular:577bounds->span_smallest = bbounds.lb0;578bounds->span_biggest = bbounds.ub0;579}580if (!bounds->loop_bounds_adjusted) {581// Here it's safe to reduce the space to the multiply of step.582// OMPTODO: check if the formular is correct.583// Also check if it would be safe to do this if we didn't adjust left side.584bounds->span_biggest -=585(static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?586}587}588589// Calculate span for IV on this loop level for ">=" case.590template <typename T>591void kmp_calc_span_greateroreq_XX(592/* in/out*/ bounds_info_internalXX_template<T> *bounds,593/* in/out*/ bounds_info_internal_t *bounds_nest) {594595typedef typename traits_t<T>::unsigned_t UT;596// typedef typename traits_t<T>::signed_t ST;597598// typedef typename big_span_t span_t;599typedef T span_t;600601auto &bbounds = bounds->b;602603if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {604// This dimention depends on one of previous ones; can't be the outermost605// one.606bounds_info_internalXX_template<T> *previous =607reinterpret_cast<bounds_info_internalXX_template<T> *>(608&(bounds_nest[bbounds.outer_iv]));609610// OMPTODO: assert that T is compatible with loop variable type on611// 'previous' loop612613{614span_t bound_candidate1 =615bbounds.lb0 + bbounds.lb1 * previous->span_smallest;616span_t bound_candidate2 =617bbounds.lb0 + bbounds.lb1 * previous->span_biggest;618if (bound_candidate1 >= bound_candidate2) {619bounds->span_smallest = bound_candidate1;620} else {621bounds->span_smallest = bound_candidate2;622}623}624625{626// We can't adjust the upper bound with respect to step, because627// lower bound might be off after adjustments628629span_t bound_candidate1 =630bbounds.ub0 + bbounds.ub1 * previous->span_smallest;631span_t bound_candidate2 =632bbounds.ub0 + bbounds.ub1 * 
previous->span_biggest;633if (bound_candidate1 >= bound_candidate2) {634bounds->span_biggest = bound_candidate2;635} else {636bounds->span_biggest = bound_candidate1;637}638}639640} else {641// Rectangular:642bounds->span_biggest = bbounds.lb0;643bounds->span_smallest = bbounds.ub0;644}645if (!bounds->loop_bounds_adjusted) {646// Here it's safe to reduce the space to the multiply of step.647// OMPTODO: check if the formular is correct.648// Also check if it would be safe to do this if we didn't adjust left side.649bounds->span_biggest -=650(static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?651}652}653654// Calculate maximum possible span for IV on this loop level.655template <typename T>656void kmp_calc_span_XX(657/* in/out*/ bounds_info_internalXX_template<T> *bounds,658/* in/out*/ bounds_info_internal_t *bounds_nest) {659660if (bounds->b.comparison == comparison_t::comp_less_or_eq) {661kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);662} else {663KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);664kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);665}666}667668//----------All initial processing of the loop nest---------------------------669670// Calculate new bounds for this loop level.671// To be able to work with the nest we need to get it to a parallelepiped shape.672// We need to stay in the original range of values, so that there will be no673// overflow, for that we'll adjust both upper and lower bounds as needed.674template <typename T>675void kmp_calc_new_bounds_XX(676/* in/out*/ bounds_info_internalXX_template<T> *bounds,677/* in/out*/ bounds_info_internal_t *bounds_nest) {678679auto &bbounds = bounds->b;680681if (bbounds.lb1 == bbounds.ub1) {682// Already parallel, no need to adjust:683bounds->loop_bounds_adjusted = false;684} else {685bounds->loop_bounds_adjusted = true;686687T old_lb1 = bbounds.lb1;688T old_ub1 = bbounds.ub1;689690if (__kmp_sign(old_lb1) != 
__kmp_sign(old_ub1)) {691// With this shape we can adjust to a rectangle:692bbounds.lb1 = 0;693bbounds.ub1 = 0;694} else {695// get upper and lower bounds to be parallel696// with values in the old range.697// Note: abs didn't work here.698if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||699((old_lb1 > 0) && (old_lb1 > old_ub1))) {700bbounds.lb1 = old_ub1;701} else {702bbounds.ub1 = old_lb1;703}704}705706// Now need to adjust lb0, ub0, otherwise in some cases space will shrink.707// The idea here that for this IV we are now getting the same span708// irrespective of the previous IV value.709bounds_info_internalXX_template<T> *previous =710reinterpret_cast<bounds_info_internalXX_template<T> *>(711&bounds_nest[bbounds.outer_iv]);712713if (bbounds.comparison == comparison_t::comp_less_or_eq) {714if (old_lb1 < bbounds.lb1) {715KMP_ASSERT(old_lb1 < 0);716// The length is good on outer_iv biggest number,717// can use it to find where to move the lower bound:718719T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;720bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?721// e.g. it was 0?? 
(same below)722} else if (old_lb1 > bbounds.lb1) {723// still need to move lower bound:724T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;725bbounds.lb0 += add;726}727728if (old_ub1 > bbounds.ub1) {729KMP_ASSERT(old_ub1 > 0);730// The length is good on outer_iv biggest number,731// can use it to find where to move upper bound:732733T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;734bbounds.ub0 += add;735} else if (old_ub1 < bbounds.ub1) {736// still need to move upper bound:737T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;738bbounds.ub0 -= sub;739}740} else {741KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);742if (old_lb1 < bbounds.lb1) {743KMP_ASSERT(old_lb1 < 0);744T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;745bbounds.lb0 -= sub;746} else if (old_lb1 > bbounds.lb1) {747T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;748bbounds.lb0 += add;749}750751if (old_ub1 > bbounds.ub1) {752KMP_ASSERT(old_ub1 > 0);753T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;754bbounds.ub0 += add;755} else if (old_ub1 < bbounds.ub1) {756T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;757bbounds.ub0 -= sub;758}759}760}761}762763// Do all processing for one canonicalized loop in the nest764// (assuming that outer loops already were processed):765template <typename T>766kmp_loop_nest_iv_t kmp_process_one_loop_XX(767/* in/out*/ bounds_info_internalXX_template<T> *bounds,768/*in/out*/ bounds_info_internal_t *bounds_nest) {769770kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);771kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);772return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));773}774775// Non-rectangular loop nest, canonicalized to use <= or >=.776// Process loop nest to have a parallelepiped shape,777// calculate biggest spans for IV's on all levels and calculate overall trip778// count. 
"bounds_nest" has to be allocated per thread.779// Returns overall trip count (for adjusted space).780kmp_loop_nest_iv_t kmp_process_loop_nest(781/*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {782783kmp_loop_nest_iv_t total = 1;784785for (kmp_index_t ind = 0; ind < n; ++ind) {786auto bounds = &(bounds_nest[ind]);787kmp_loop_nest_iv_t trip_count = 0;788789switch (bounds->b.loop_type) {790case loop_type_t::loop_type_int32:791trip_count = kmp_process_one_loop_XX<kmp_int32>(792/*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),793/*in/out*/ bounds_nest);794break;795case loop_type_t::loop_type_uint32:796trip_count = kmp_process_one_loop_XX<kmp_uint32>(797/*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),798/*in/out*/ bounds_nest);799break;800case loop_type_t::loop_type_int64:801trip_count = kmp_process_one_loop_XX<kmp_int64>(802/*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),803/*in/out*/ bounds_nest);804break;805case loop_type_t::loop_type_uint64:806trip_count = kmp_process_one_loop_XX<kmp_uint64>(807/*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),808/*in/out*/ bounds_nest);809break;810default:811KMP_ASSERT(false);812}813total *= trip_count;814}815816return total;817}818819//----------Calculate iterations (in the original or updated space)-----------820821// Calculate number of iterations in original or updated space resulting in822// original_ivs[ind] (only on this level, non-negative)823// (not counting initial iteration)824template <typename T>825kmp_loop_nest_iv_t826kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,827const kmp_point_t original_ivs,828kmp_index_t ind) {829830kmp_loop_nest_iv_t iterations = 0;831832if (bounds->comparison == comparison_t::comp_less_or_eq) {833iterations =834(static_cast<T>(original_ivs[ind]) - bounds->lb0 -835bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /836__kmp_abs(bounds->step);837} else 
{838KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);839iterations = (bounds->lb0 +840bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -841static_cast<T>(original_ivs[ind])) /842__kmp_abs(bounds->step);843}844845return iterations;846}847848// Calculate number of iterations in the original or updated space resulting in849// original_ivs[ind] (only on this level, non-negative)850kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,851const kmp_point_t original_ivs,852kmp_index_t ind) {853854switch (bounds->loop_type) {855case loop_type_t::loop_type_int32:856return kmp_calc_number_of_iterations_XX<kmp_int32>(857(bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);858break;859case loop_type_t::loop_type_uint32:860return kmp_calc_number_of_iterations_XX<kmp_uint32>(861(bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);862break;863case loop_type_t::loop_type_int64:864return kmp_calc_number_of_iterations_XX<kmp_int64>(865(bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);866break;867case loop_type_t::loop_type_uint64:868return kmp_calc_number_of_iterations_XX<kmp_uint64>(869(bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);870break;871default:872KMP_ASSERT(false);873return 0;874}875}876877//----------Calculate new iv corresponding to original ivs--------------------878879// We got a point in the original loop nest.880// Take updated bounds and calculate what new_iv will correspond to this point.881// When we are getting original IVs from new_iv, we have to adjust to fit into882// original loops bounds. 
// Getting new_iv for the adjusted original IVs will help
// with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    // Mixed-radix (Horner-style) linearization: each level contributes its
    // iteration number scaled by the product of the inner trip counts.
    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
// Returns false if the requested point does not exist in the original space
// (i.e. the space is empty after exhausting all outer-level adjustments).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  // Index of the outermost level whose iteration count was just bumped;
  // levels inside it restart from their lower bound ((lengthened_ind < ind)
  // below).
  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // Space is empty:
        return false;
      } else {
        // Go to next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        // All levels inside the bumped one restart from zero iterations:
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: lower bounds of all loops may not work -
// if on some of the iterations of the outer loops inner loops are empty.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From current set of original IVs calculate next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  // Start from the current point (outer levels may stay unchanged):
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the inner-most level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Return true if it fits into upper bound on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  //     big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, may as well use
    // the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in expanded space, but:
    // - we need to hit original space lower bound, so need to account for
    // that
    // - we have to go into original space, even if that means adding more
    // iterations than was planned
    // - we have to go past (or equal to) previous point (which is the chunk
    // starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use
      // heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      // Normalize to the loop's IV type before comparing:
      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels start of the chunk is same as the end, need to
      // be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // End of the chunk can't be smaller (for >= bigger) than it's start.
        // Use heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Type-dispatch wrapper over kmp_calc_one_iv_for_chunk_end_XX.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate old induction variables corresponding to overall new_iv for the
// chunk end. If due to space extension we are getting old IVs outside of the
// boundaries, bring them into the boundaries. Need to do this in the runtime,
// esp. on the lower bounds side. When getting result need to make sure that the
// new chunk starts at next position to old chunk, not overlaps with it (this is
// done elsewhere), and need to make sure end of the chunk is further than the
// beginning of the chunk. We don't need an exact ending point here, just
// something more-or-less close to the desired chunk length, bigger is fine
// (smaller would be fine, but we risk going into infinite loop, so do smaller
// only at the very end of the space). result: false if could not find the
// ending point in the original loop space. In this case the caller can use
// original upper bounds as the end of the chunk.
// Chunk won't be empty, because
// it'll have at least the starting point, which is by construction in the
// original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every modified loop
  // (decompose new_iv digit-by-digit, innermost level first):
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  // After consuming all levels nothing may remain of new_iv:
  KMP_DEBUG_ASSERT(new_iv == 0);

  // lengthened_ind: outermost level whose iteration count was bumped while
  // backtracking; equal_ind: deepest level (so far) on which the calculated
  // point still coincides with original_ivs_start.
  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // can't be same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    // Track how deep the calculated point still equals the chunk start:
    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  // ub0 + ub1 * (IV of the loop this one depends on):
  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

// Type-dispatch wrapper over kmp_calc_one_iv_end_XX.
void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use original upper
// bounds (adjusted when canonicalized to use <= / >=).
No need to check that1264// this point is in the original space (it's likely not)1265void kmp_calc_original_ivs_for_end(1266const bounds_info_t *const original_bounds_nest, kmp_index_t n,1267/*out*/ kmp_point_t original_ivs) {1268for (kmp_index_t ind = 0; ind < n; ++ind) {1269auto bounds = &(original_bounds_nest[ind]);1270kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);1271}1272}12731274/**************************************************************************1275* Identify nested loop structure - loops come in the canonical form1276* Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}1277* j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}1278* Upper Triangle matrix1279* i = 0; i <= N; i++ {0,0}:{N,0}1280* j = 0+1*i; j <= N; j++ {0,1}:{N,0}1281* ************************************************************************/1282nested_loop_type_t1283kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,1284/*in*/ kmp_index_t n) {1285// only 2-level nested loops are supported1286if (n != 2) {1287return nested_loop_type_unkown;1288}1289// loops must be canonical1290KMP_ASSERT(1291(original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&1292(original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));1293// check outer loop bounds: for triangular need to be {0,0}:{N,0}1294kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1295original_bounds_nest[0].lb0_u64);1296kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1297original_bounds_nest[0].ub0_u64);1298kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1299original_bounds_nest[0].lb1_u64);1300kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1301original_bounds_nest[0].ub1_u64);1302if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {1303return nested_loop_type_unkown;1304}1305// check inner bounds to determine triangle type1306kmp_uint64 inner_lb0_u64 = 
kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1307original_bounds_nest[1].lb0_u64);1308kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1309original_bounds_nest[1].ub0_u64);1310kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1311original_bounds_nest[1].lb1_u64);1312kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1313original_bounds_nest[1].ub1_u64);1314// lower triangle loop inner bounds need to be {0,0}:{0/-1,1}1315if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&1316(inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {1317return nested_loop_type_lower_triangular_matrix;1318}1319// upper triangle loop inner bounds need to be {0,1}:{N,0}1320if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&1321inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {1322return nested_loop_type_upper_triangular_matrix;1323}1324return nested_loop_type_unkown;1325}13261327/**************************************************************************1328* SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf1329* Start point is x so the result is always > sqrt(x)1330* The method has uniform convergence, PRECISION is set to 0.11331* ************************************************************************/1332#define level_of_precision 0.11333double sqrt_newton_approx(/*in*/ kmp_uint64 x) {1334double sqrt_old = 0.;1335double sqrt_new = (double)x;1336do {1337sqrt_old = sqrt_new;1338sqrt_new = (sqrt_old + x / sqrt_old) / 2;1339} while ((sqrt_old - sqrt_new) > level_of_precision);1340return sqrt_new;1341}13421343/**************************************************************************1344* Handle lower triangle matrix in the canonical form1345* i = 0; i <= N; i++ {0,0}:{N,0}1346* j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}1347* ************************************************************************/1348void kmp_handle_lower_triangle_matrix(1349/*in*/ kmp_uint32 nth,1350/*in*/ 
kmp_uint32 tid,1351/*in */ kmp_index_t n,1352/*in/out*/ bounds_info_t *original_bounds_nest,1353/*out*/ bounds_info_t *chunk_bounds_nest) {13541355// transfer loop types from the original loop to the chunks1356for (kmp_index_t i = 0; i < n; ++i) {1357chunk_bounds_nest[i] = original_bounds_nest[i];1358}1359// cleanup iv variables1360kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1361original_bounds_nest[0].ub0_u64);1362kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1363original_bounds_nest[0].lb0_u64);1364kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1365original_bounds_nest[1].ub0_u64);1366// calculate the chunk's lower and upper bounds1367// the total number of iterations in the loop is the sum of the arithmetic1368// progression from the outer lower to outer upper bound (inclusive since the1369// loop is canonical) note that less_than inner loops (inner_ub0 = -1)1370// effectively make the progression 1-based making N = (outer_ub0 - inner_lb01371// + 1) -> N - 11372kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;1373kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;1374// the current thread's number of iterations:1375// each thread gets an equal number of iterations: total number of iterations1376// divided by the number of threads plus, if there's a remainder,1377// the first threads with the number up to the remainder get an additional1378// iteration each to cover it1379kmp_uint64 iter_current =1380iter_total / nth + ((tid < (iter_total % nth)) ? 
1 : 0);1381// cumulative number of iterations executed by all the previous threads:1382// threads with the tid below the remainder will have (iter_total/nth+1)1383// elements, and so will all threads before them so the cumulative number of1384// iterations executed by the all previous will be the current thread's number1385// of iterations multiplied by the number of previous threads which is equal1386// to the current thread's tid; threads with the number equal or above the1387// remainder will have (iter_total/nth) elements so the cumulative number of1388// iterations previously executed is its number of iterations multipled by the1389// number of previous threads which is again equal to the current thread's tid1390// PLUS all the remainder iterations that will have been executed by the1391// previous threads1392kmp_uint64 iter_before_current =1393tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));1394// cumulative number of iterations executed with the current thread is1395// the cumulative number executed before it plus its own1396kmp_uint64 iter_with_current = iter_before_current + iter_current;1397// calculate the outer loop lower bound (lbo) which is the max outer iv value1398// that gives the number of iterations that is equal or just below the total1399// number of iterations executed by the previous threads, for less_than1400// (1-based) inner loops (inner_ub0 == -1) it will be i.e.1401// lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=01402// for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:1403// i.e. 
lbo*(lbo+1)/2<=iter_before_current =>1404// lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily1405// using a parameter to control the equation sign1406kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;1407kmp_uint64 lower_bound_outer =1408(kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +14098 * iter_before_current) +1410inner_adjustment) /14112 -1412inner_adjustment;1413// calculate the inner loop lower bound which is the remaining number of1414// iterations required to hit the total number of iterations executed by the1415// previous threads giving the starting point of this thread1416kmp_uint64 lower_bound_inner =1417iter_before_current -1418((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;1419// calculate the outer loop upper bound using the same approach as for the1420// inner bound except using the total number of iterations executed with the1421// current thread1422kmp_uint64 upper_bound_outer =1423(kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +14248 * iter_with_current) +1425inner_adjustment) /14262 -1427inner_adjustment;1428// calculate the inner loop upper bound which is the remaining number of1429// iterations required to hit the total number of iterations executed after1430// the current thread giving the starting point of the next thread1431kmp_uint64 upper_bound_inner =1432iter_with_current -1433((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;1434// adjust the upper bounds down by 1 element to point at the last iteration of1435// the current thread the first iteration of the next thread1436if (upper_bound_inner == 0) {1437// {n,0} => {n-1,n-1}1438upper_bound_outer -= 1;1439upper_bound_inner = upper_bound_outer;1440} else {1441// {n,m} => {n,m-1} (m!=0)1442upper_bound_inner -= 1;1443}14441445// assign the values, zeroing out lb1 and ub1 values since the iteration space1446// is now one-dimensional1447chunk_bounds_nest[0].lb0_u64 = 
lower_bound_outer;1448chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;1449chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;1450chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;1451chunk_bounds_nest[0].lb1_u64 = 0;1452chunk_bounds_nest[0].ub1_u64 = 0;1453chunk_bounds_nest[1].lb1_u64 = 0;1454chunk_bounds_nest[1].ub1_u64 = 0;14551456#if 01457printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",1458tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,1459chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);1460#endif1461}14621463/**************************************************************************1464* Handle upper triangle matrix in the canonical form1465* i = 0; i <= N; i++ {0,0}:{N,0}1466* j = 0+1*i; j <= N; j++ {0,1}:{N,0}1467* ************************************************************************/1468void kmp_handle_upper_triangle_matrix(1469/*in*/ kmp_uint32 nth,1470/*in*/ kmp_uint32 tid,1471/*in */ kmp_index_t n,1472/*in/out*/ bounds_info_t *original_bounds_nest,1473/*out*/ bounds_info_t *chunk_bounds_nest) {14741475// transfer loop types from the original loop to the chunks1476for (kmp_index_t i = 0; i < n; ++i) {1477chunk_bounds_nest[i] = original_bounds_nest[i];1478}1479// cleanup iv variables1480kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1481original_bounds_nest[0].ub0_u64);1482kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1483original_bounds_nest[0].lb0_u64);1484[[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(1485original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);1486// calculate the chunk's lower and upper bounds1487// the total number of iterations in the loop is the sum of the arithmetic1488// progression from the outer lower to outer upper bound (inclusive since the1489// loop is canonical) note that less_than inner loops (inner_ub0 = -1)1490// effectively make the progression 1-based making N 
= (outer_ub0 - inner_lb01491// + 1) -> N - 11492kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);1493kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;1494// the current thread's number of iterations:1495// each thread gets an equal number of iterations: total number of iterations1496// divided by the number of threads plus, if there's a remainder,1497// the first threads with the number up to the remainder get an additional1498// iteration each to cover it1499kmp_uint64 iter_current =1500iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);1501// cumulative number of iterations executed by all the previous threads:1502// threads with the tid below the remainder will have (iter_total/nth+1)1503// elements, and so will all threads before them so the cumulative number of1504// iterations executed by the all previous will be the current thread's number1505// of iterations multiplied by the number of previous threads which is equal1506// to the current thread's tid; threads with the number equal or above the1507// remainder will have (iter_total/nth) elements so the cumulative number of1508// iterations previously executed is its number of iterations multipled by the1509// number of previous threads which is again equal to the current thread's tid1510// PLUS all the remainder iterations that will have been executed by the1511// previous threads1512kmp_uint64 iter_before_current =1513tid * iter_current + ((tid < iter_total % nth) ? 
0 : (iter_total % nth));1514// cumulative number of iterations executed with the current thread is1515// the cumulative number executed before it plus its own1516kmp_uint64 iter_with_current = iter_before_current + iter_current;1517// calculate the outer loop lower bound (lbo) which is the max outer iv value1518// that gives the number of iterations that is equal or just below the total1519// number of iterations executed by the previous threads:1520// lbo*(lbo+1)/2<=iter_before_current =>1521// lbo^2+lbo-2*iter_before_current<=01522kmp_uint64 lower_bound_outer =1523(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;1524// calculate the inner loop lower bound which is the remaining number of1525// iterations required to hit the total number of iterations executed by the1526// previous threads giving the starting point of this thread1527kmp_uint64 lower_bound_inner =1528iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;1529// calculate the outer loop upper bound using the same approach as for the1530// inner bound except using the total number of iterations executed with the1531// current thread1532kmp_uint64 upper_bound_outer =1533(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;1534// calculate the inner loop upper bound which is the remaining number of1535// iterations required to hit the total number of iterations executed after1536// the current thread giving the starting point of the next thread1537kmp_uint64 upper_bound_inner =1538iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;1539// adjust the upper bounds down by 1 element to point at the last iteration of1540// the current thread the first iteration of the next thread1541if (upper_bound_inner == 0) {1542// {n,0} => {n-1,n-1}1543upper_bound_outer -= 1;1544upper_bound_inner = upper_bound_outer;1545} else {1546// {n,m} => {n,m-1} (m!=0)1547upper_bound_inner -= 1;1548}15491550// assign the values, zeroing out lb1 and ub1 
values since the iteration space1551// is now one-dimensional1552chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;1553chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;1554chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;1555chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;1556chunk_bounds_nest[0].lb1_u64 = 0;1557chunk_bounds_nest[0].ub1_u64 = 0;1558chunk_bounds_nest[1].lb1_u64 = 0;1559chunk_bounds_nest[1].ub1_u64 = 0;15601561#if 01562printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",1563tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,1564chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);1565#endif1566}1567//----------Init API for non-rectangular loops--------------------------------15681569// Init API for collapsed loops (static, no chunks defined).1570// "bounds_nest" has to be allocated per thread.1571// API will modify original bounds_nest array to bring it to a canonical form1572// (only <= and >=, no !=, <, >). If the original loop nest was already in a1573// canonical form there will be no changes to bounds in bounds_nest array1574// (only trip counts will be calculated). Internally API will expand the space1575// to parallelogram/parallelepiped, calculate total, calculate bounds for the1576// chunks in terms of the new IV, re-calc them in terms of old IVs (especially1577// important on the left side, to hit the lower bounds and not step over), and1578// pick the correct chunk for this thread (so it will calculate chunks up to the1579// needed one). It could be optimized to calculate just this chunk, potentially1580// a bit less well distributed among threads. 
// It is designed to make sure that
// threads will receive predictable chunks, deterministically (so that next nest
// of loops with similar characteristics will get exactly same chunks on same
// threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0,
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  // Bring the nest to canonical form (<= / >=) and compute trip counts:
  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  // Expand the space to a parallelepiped and get the overall trip count:
  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // Loop won't execute:
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  // Handle special cases (2-level triangular nests get a closed-form split):
  nested_loop_type_t loop_type =
      kmp_identify_nested_loop_structure(original_bounds_nest, n);
  if (loop_type == nested_loop_type_lower_triangular_matrix) {
    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  }

  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // Loop won't execute:
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test
  // (2) without it current contract that chunk_bounds_nest has only lb0 and
  // ub0, lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //   // One thread:
  //   // Copy all info from original_bounds_nest, it'll be good enough.

  //   for (kmp_index_t i = 0; i < n; ++i) {
  //     chunk_bounds_nest[i] = original_bounds_nest[i];
  //   }

  //   if (plastiter != NULL) {
  //     *plastiter = TRUE;
  //   }
  //   return TRUE;
  //}

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  // Walk the chunks from thread 0 up to this thread's tid; each pass
  // computes one chunk and either returns it (tid == 0 after decrements)
  // or advances past it:
  for (; nth > 0;) {
    // We could calculate chunk size once, but this is to compensate that the
    // original space is not parallelepiped and some threads can be left
    // without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was last chunk, but no chunk for current thread was
      // found:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // no more loop iterations left to process,
        // this means that currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // no more loop iterations to process,
      // the previous chunk was the last chunk
      break;
    }

    // original_ivs_start is next to previous chunk original_ivs_end,
    // we need to start new chunk here, so chunks will be one after another
    // without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  return FALSE;
}