Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_collapse.cpp
35258 views
/*1* kmp_collapse.cpp -- loop collapse feature2*/34//===----------------------------------------------------------------------===//5//6// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.7// See https://llvm.org/LICENSE.txt for license information.8// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception9//10//===----------------------------------------------------------------------===//1112#include "kmp.h"13#include "kmp_error.h"14#include "kmp_i18n.h"15#include "kmp_itt.h"16#include "kmp_stats.h"17#include "kmp_str.h"18#include "kmp_collapse.h"1920#if OMPT_SUPPORT21#include "ompt-specific.h"22#endif2324// OMPTODO: different style of comments (see kmp_sched)25// OMPTODO: OMPT/OMPD2627// avoid inadevertently using a library based abs28template <typename T> T __kmp_abs(const T val) {29return (val < 0) ? -val : val;30}31kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }32kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }3334//----------------------------------------------------------------------------35// Common functions for working with rectangular and non-rectangular loops36//----------------------------------------------------------------------------3738template <typename T> int __kmp_sign(T val) {39return (T(0) < val) - (val < T(0));40}4142template <typename T> class CollapseAllocator {43typedef T *pT;4445private:46static const size_t allocaSize = 32; // size limit for stack allocations47// (8 bytes x 4 nested loops)48char stackAlloc[allocaSize];49static constexpr size_t maxElemCount = allocaSize / sizeof(T);50pT pTAlloc;5152public:53CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {54if (n > maxElemCount) {55pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));56}57}58~CollapseAllocator() {59if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {60__kmp_free(pTAlloc);61}62}63T &operator[](int index) { return pTAlloc[index]; }64operator const pT() { return pTAlloc; }65};6667//----------Loop 
canonicalization---------------------------------------------6869// For loop nest (any shape):70// convert != to < or >;71// switch from using < or > to <= or >=.72// "bounds" array has to be allocated per thread.73// All other internal functions will work only with canonicalized loops.74template <typename T>75void kmp_canonicalize_one_loop_XX(76ident_t *loc,77/*in/out*/ bounds_infoXX_template<T> *bounds) {7879if (__kmp_env_consistency_check) {80if (bounds->step == 0) {81__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,82loc);83}84}8586if (bounds->comparison == comparison_t::comp_not_eq) {87// We can convert this to < or >, depends on the sign of the step:88if (bounds->step > 0) {89bounds->comparison = comparison_t::comp_less;90} else {91bounds->comparison = comparison_t::comp_greater;92}93}9495if (bounds->comparison == comparison_t::comp_less) {96// Note: ub0 can be unsigned. Should be Ok to hit overflow here,97// because ub0 + ub1*j should be still positive (otherwise loop was not98// well formed)99bounds->ub0 -= 1;100bounds->comparison = comparison_t::comp_less_or_eq;101} else if (bounds->comparison == comparison_t::comp_greater) {102bounds->ub0 += 1;103bounds->comparison = comparison_t::comp_greater_or_eq;104}105}106107// Canonicalize loop nest. 
original_bounds_nest is an array of length n.108void kmp_canonicalize_loop_nest(ident_t *loc,109/*in/out*/ bounds_info_t *original_bounds_nest,110kmp_index_t n) {111112for (kmp_index_t ind = 0; ind < n; ++ind) {113auto bounds = &(original_bounds_nest[ind]);114115switch (bounds->loop_type) {116case loop_type_t::loop_type_int32:117kmp_canonicalize_one_loop_XX<kmp_int32>(118loc,119/*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));120break;121case loop_type_t::loop_type_uint32:122kmp_canonicalize_one_loop_XX<kmp_uint32>(123loc,124/*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));125break;126case loop_type_t::loop_type_int64:127kmp_canonicalize_one_loop_XX<kmp_int64>(128loc,129/*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));130break;131case loop_type_t::loop_type_uint64:132kmp_canonicalize_one_loop_XX<kmp_uint64>(133loc,134/*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));135break;136default:137KMP_ASSERT(false);138}139}140}141142//----------Calculating trip count on one level-------------------------------143144// Calculate trip count on this loop level.145// We do this either for a rectangular loop nest,146// or after an adjustment bringing the loops to a parallelepiped shape.147// This number should not depend on the value of outer IV148// even if the formular has lb1 and ub1.149// Note: for non-rectangular loops don't use span for this, it's too big.150151template <typename T>152kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(153/*in/out*/ bounds_infoXX_template<T> *bounds) {154155if (bounds->comparison == comparison_t::comp_less_or_eq) {156if (bounds->ub0 < bounds->lb0) {157// Note: after this we don't need to calculate inner loops,158// but that should be an edge case:159bounds->trip_count = 0;160} else {161// ub - lb may exceed signed type range; we need to cast to162// kmp_loop_nest_iv_t anyway163bounds->trip_count =164static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /165__kmp_abs(bounds->step) +1661;167}168} else if 
(bounds->comparison == comparison_t::comp_greater_or_eq) {169if (bounds->lb0 < bounds->ub0) {170// Note: after this we don't need to calculate inner loops,171// but that should be an edge case:172bounds->trip_count = 0;173} else {174// lb - ub may exceed signed type range; we need to cast to175// kmp_loop_nest_iv_t anyway176bounds->trip_count =177static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /178__kmp_abs(bounds->step) +1791;180}181} else {182KMP_ASSERT(false);183}184return bounds->trip_count;185}186187// Calculate trip count on this loop level.188kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {189190kmp_loop_nest_iv_t trip_count = 0;191192switch (bounds->loop_type) {193case loop_type_t::loop_type_int32:194trip_count = kmp_calculate_trip_count_XX<kmp_int32>(195/*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));196break;197case loop_type_t::loop_type_uint32:198trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(199/*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));200break;201case loop_type_t::loop_type_int64:202trip_count = kmp_calculate_trip_count_XX<kmp_int64>(203/*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));204break;205case loop_type_t::loop_type_uint64:206trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(207/*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));208break;209default:210KMP_ASSERT(false);211}212213return trip_count;214}215216//----------Trim original iv according to its type----------------------------217218// Trim original iv according to its type.219// Return kmp_uint64 value which can be easily used in all internal calculations220// And can be statically cast back to original type in user code.221kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {222kmp_uint64 res = 0;223224switch (loop_iv_type) {225case loop_type_t::loop_type_int8:226res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));227break;228case 
loop_type_t::loop_type_uint8:229res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));230break;231case loop_type_t::loop_type_int16:232res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));233break;234case loop_type_t::loop_type_uint16:235res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));236break;237case loop_type_t::loop_type_int32:238res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));239break;240case loop_type_t::loop_type_uint32:241res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));242break;243case loop_type_t::loop_type_int64:244res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));245break;246case loop_type_t::loop_type_uint64:247res = static_cast<kmp_uint64>(original_iv);248break;249default:250KMP_ASSERT(false);251}252253return res;254}255256//----------Compare two IVs (remember they have a type)-----------------------257258bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,259kmp_uint64 original_iv2) {260bool res = false;261262switch (loop_iv_type) {263case loop_type_t::loop_type_int8:264res = static_cast<kmp_int8>(original_iv1) ==265static_cast<kmp_int8>(original_iv2);266break;267case loop_type_t::loop_type_uint8:268res = static_cast<kmp_uint8>(original_iv1) ==269static_cast<kmp_uint8>(original_iv2);270break;271case loop_type_t::loop_type_int16:272res = static_cast<kmp_int16>(original_iv1) ==273static_cast<kmp_int16>(original_iv2);274break;275case loop_type_t::loop_type_uint16:276res = static_cast<kmp_uint16>(original_iv1) ==277static_cast<kmp_uint16>(original_iv2);278break;279case loop_type_t::loop_type_int32:280res = static_cast<kmp_int32>(original_iv1) ==281static_cast<kmp_int32>(original_iv2);282break;283case loop_type_t::loop_type_uint32:284res = static_cast<kmp_uint32>(original_iv1) ==285static_cast<kmp_uint32>(original_iv2);286break;287case loop_type_t::loop_type_int64:288res = static_cast<kmp_int64>(original_iv1) 
==289static_cast<kmp_int64>(original_iv2);290break;291case loop_type_t::loop_type_uint64:292res = static_cast<kmp_uint64>(original_iv1) ==293static_cast<kmp_uint64>(original_iv2);294break;295default:296KMP_ASSERT(false);297}298299return res;300}301302//----------Calculate original iv on one level--------------------------------303304// Return true if the point fits into upper bounds on this level,305// false otherwise306template <typename T>307bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,308const kmp_point_t original_ivs,309kmp_index_t ind) {310311T iv = static_cast<T>(original_ivs[ind]);312T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);313314if (((bounds->comparison == comparison_t::comp_less_or_eq) &&315(iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||316((bounds->comparison == comparison_t::comp_greater_or_eq) &&317(iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {318// The calculated point is outside of loop upper boundary:319return false;320}321322return true;323}324325// Calculate one iv corresponding to iteration on the level ind.326// Return true if it fits into lower-upper bounds on this level327// (if not, we need to re-calculate)328template <typename T>329bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,330/*in/out*/ kmp_point_t original_ivs,331const kmp_iterations_t iterations, kmp_index_t ind,332bool start_with_lower_bound, bool checkBounds) {333334kmp_uint64 temp = 0;335T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);336337if (start_with_lower_bound) {338// we moved to the next iteration on one of outer loops, should start339// with the lower bound here:340temp = bounds->lb0 + bounds->lb1 * outer_iv;341} else {342auto iteration = iterations[ind];343temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;344}345346// Now trim original iv according to its type:347original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);348349if (checkBounds) {350return 
kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);351} else {352return true;353}354}355356bool kmp_calc_one_iv(const bounds_info_t *bounds,357/*in/out*/ kmp_point_t original_ivs,358const kmp_iterations_t iterations, kmp_index_t ind,359bool start_with_lower_bound, bool checkBounds) {360361switch (bounds->loop_type) {362case loop_type_t::loop_type_int32:363return kmp_calc_one_iv_XX<kmp_int32>(364(bounds_infoXX_template<kmp_int32> *)(bounds),365/*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,366checkBounds);367break;368case loop_type_t::loop_type_uint32:369return kmp_calc_one_iv_XX<kmp_uint32>(370(bounds_infoXX_template<kmp_uint32> *)(bounds),371/*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,372checkBounds);373break;374case loop_type_t::loop_type_int64:375return kmp_calc_one_iv_XX<kmp_int64>(376(bounds_infoXX_template<kmp_int64> *)(bounds),377/*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,378checkBounds);379break;380case loop_type_t::loop_type_uint64:381return kmp_calc_one_iv_XX<kmp_uint64>(382(bounds_infoXX_template<kmp_uint64> *)(bounds),383/*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,384checkBounds);385break;386default:387KMP_ASSERT(false);388return false;389}390}391392//----------Calculate original iv on one level for rectangular loop nest------393394// Calculate one iv corresponding to iteration on the level ind.395// Return true if it fits into lower-upper bounds on this level396// (if not, we need to re-calculate)397template <typename T>398void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,399/*in/out*/ kmp_uint64 *original_ivs,400const kmp_iterations_t iterations,401kmp_index_t ind) {402403auto iteration = iterations[ind];404405kmp_uint64 temp =406bounds->lb0 +407bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +408iteration * bounds->step;409410// Now trim original iv according to its type:411original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, 
temp);412}413414void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,415/*in/out*/ kmp_uint64 *original_ivs,416const kmp_iterations_t iterations,417kmp_index_t ind) {418419switch (bounds->loop_type) {420case loop_type_t::loop_type_int32:421kmp_calc_one_iv_rectang_XX<kmp_int32>(422(bounds_infoXX_template<kmp_int32> *)(bounds),423/*in/out*/ original_ivs, iterations, ind);424break;425case loop_type_t::loop_type_uint32:426kmp_calc_one_iv_rectang_XX<kmp_uint32>(427(bounds_infoXX_template<kmp_uint32> *)(bounds),428/*in/out*/ original_ivs, iterations, ind);429break;430case loop_type_t::loop_type_int64:431kmp_calc_one_iv_rectang_XX<kmp_int64>(432(bounds_infoXX_template<kmp_int64> *)(bounds),433/*in/out*/ original_ivs, iterations, ind);434break;435case loop_type_t::loop_type_uint64:436kmp_calc_one_iv_rectang_XX<kmp_uint64>(437(bounds_infoXX_template<kmp_uint64> *)(bounds),438/*in/out*/ original_ivs, iterations, ind);439break;440default:441KMP_ASSERT(false);442}443}444445//----------------------------------------------------------------------------446// Rectangular loop nest447//----------------------------------------------------------------------------448449//----------Canonicalize loop nest and calculate trip count-------------------450451// Canonicalize loop nest and calculate overall trip count.452// "bounds_nest" has to be allocated per thread.453// API will modify original bounds_nest array to bring it to a canonical form454// (only <= and >=, no !=, <, >). 
If the original loop nest was already in a455// canonical form there will be no changes to bounds in bounds_nest array456// (only trip counts will be calculated).457// Returns trip count of overall space.458extern "C" kmp_loop_nest_iv_t459__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,460/*in/out*/ bounds_info_t *original_bounds_nest,461kmp_index_t n) {462463kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);464465kmp_loop_nest_iv_t total = 1;466467for (kmp_index_t ind = 0; ind < n; ++ind) {468auto bounds = &(original_bounds_nest[ind]);469470kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);471total *= trip_count;472}473474return total;475}476477//----------Calculate old induction variables---------------------------------478479// Calculate old induction variables corresponding to overall new_iv.480// Note: original IV will be returned as if it had kmp_uint64 type,481// will have to be converted to original type in user code.482// Note: trip counts should be already calculated by483// __kmpc_process_loop_nest_rectang.484// OMPTODO: special case 2, 3 nested loops: either do different485// interface without array or possibly template this over n486extern "C" void487__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,488const bounds_info_t *original_bounds_nest,489/*out*/ kmp_uint64 *original_ivs,490kmp_index_t n) {491492CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);493494// First, calc corresponding iteration in every original loop:495for (kmp_index_t ind = n; ind > 0;) {496--ind;497auto bounds = &(original_bounds_nest[ind]);498499// should be optimized to OPDIVREM:500auto temp = new_iv / bounds->trip_count;501auto iteration = new_iv % bounds->trip_count;502new_iv = temp;503504iterations[ind] = iteration;505}506KMP_ASSERT(new_iv == 0);507508for (kmp_index_t ind = 0; ind < n; ++ind) {509auto bounds = &(original_bounds_nest[ind]);510511kmp_calc_one_iv_rectang(bounds, /*in/out*/ 
original_ivs, iterations, ind);512}513}514515//----------------------------------------------------------------------------516// Non-rectangular loop nest517//----------------------------------------------------------------------------518519//----------Calculate maximum possible span of iv values on one level---------520521// Calculate span for IV on this loop level for "<=" case.522// Note: it's for <= on this loop nest level, so lower bound should be smallest523// value, upper bound should be the biggest value. If the loop won't execute,524// 'smallest' may be bigger than 'biggest', but we'd better not switch them525// around.526template <typename T>527void kmp_calc_span_lessoreq_XX(528/* in/out*/ bounds_info_internalXX_template<T> *bounds,529/* in/out*/ bounds_info_internal_t *bounds_nest) {530531typedef typename traits_t<T>::unsigned_t UT;532// typedef typename traits_t<T>::signed_t ST;533534// typedef typename big_span_t span_t;535typedef T span_t;536537auto &bbounds = bounds->b;538539if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {540// This dimention depends on one of previous ones; can't be the outermost541// one.542bounds_info_internalXX_template<T> *previous =543reinterpret_cast<bounds_info_internalXX_template<T> *>(544&(bounds_nest[bbounds.outer_iv]));545546// OMPTODO: assert that T is compatible with loop variable type on547// 'previous' loop548549{550span_t bound_candidate1 =551bbounds.lb0 + bbounds.lb1 * previous->span_smallest;552span_t bound_candidate2 =553bbounds.lb0 + bbounds.lb1 * previous->span_biggest;554if (bound_candidate1 < bound_candidate2) {555bounds->span_smallest = bound_candidate1;556} else {557bounds->span_smallest = bound_candidate2;558}559}560561{562// We can't adjust the upper bound with respect to step, because563// lower bound might be off after adjustments564565span_t bound_candidate1 =566bbounds.ub0 + bbounds.ub1 * previous->span_smallest;567span_t bound_candidate2 =568bbounds.ub0 + bbounds.ub1 * previous->span_biggest;569if 
(bound_candidate1 < bound_candidate2) {570bounds->span_biggest = bound_candidate2;571} else {572bounds->span_biggest = bound_candidate1;573}574}575} else {576// Rectangular:577bounds->span_smallest = bbounds.lb0;578bounds->span_biggest = bbounds.ub0;579}580if (!bounds->loop_bounds_adjusted) {581// Here it's safe to reduce the space to the multiply of step.582// OMPTODO: check if the formular is correct.583// Also check if it would be safe to do this if we didn't adjust left side.584bounds->span_biggest -=585(static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?586}587}588589// Calculate span for IV on this loop level for ">=" case.590template <typename T>591void kmp_calc_span_greateroreq_XX(592/* in/out*/ bounds_info_internalXX_template<T> *bounds,593/* in/out*/ bounds_info_internal_t *bounds_nest) {594595typedef typename traits_t<T>::unsigned_t UT;596// typedef typename traits_t<T>::signed_t ST;597598// typedef typename big_span_t span_t;599typedef T span_t;600601auto &bbounds = bounds->b;602603if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {604// This dimention depends on one of previous ones; can't be the outermost605// one.606bounds_info_internalXX_template<T> *previous =607reinterpret_cast<bounds_info_internalXX_template<T> *>(608&(bounds_nest[bbounds.outer_iv]));609610// OMPTODO: assert that T is compatible with loop variable type on611// 'previous' loop612613{614span_t bound_candidate1 =615bbounds.lb0 + bbounds.lb1 * previous->span_smallest;616span_t bound_candidate2 =617bbounds.lb0 + bbounds.lb1 * previous->span_biggest;618if (bound_candidate1 >= bound_candidate2) {619bounds->span_smallest = bound_candidate1;620} else {621bounds->span_smallest = bound_candidate2;622}623}624625{626// We can't adjust the upper bound with respect to step, because627// lower bound might be off after adjustments628629span_t bound_candidate1 =630bbounds.ub0 + bbounds.ub1 * previous->span_smallest;631span_t bound_candidate2 =632bbounds.ub0 + bbounds.ub1 * 
previous->span_biggest;633if (bound_candidate1 >= bound_candidate2) {634bounds->span_biggest = bound_candidate2;635} else {636bounds->span_biggest = bound_candidate1;637}638}639640} else {641// Rectangular:642bounds->span_biggest = bbounds.lb0;643bounds->span_smallest = bbounds.ub0;644}645if (!bounds->loop_bounds_adjusted) {646// Here it's safe to reduce the space to the multiply of step.647// OMPTODO: check if the formular is correct.648// Also check if it would be safe to do this if we didn't adjust left side.649bounds->span_biggest -=650(static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?651}652}653654// Calculate maximum possible span for IV on this loop level.655template <typename T>656void kmp_calc_span_XX(657/* in/out*/ bounds_info_internalXX_template<T> *bounds,658/* in/out*/ bounds_info_internal_t *bounds_nest) {659660if (bounds->b.comparison == comparison_t::comp_less_or_eq) {661kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);662} else {663KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);664kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);665}666}667668//----------All initial processing of the loop nest---------------------------669670// Calculate new bounds for this loop level.671// To be able to work with the nest we need to get it to a parallelepiped shape.672// We need to stay in the original range of values, so that there will be no673// overflow, for that we'll adjust both upper and lower bounds as needed.674template <typename T>675void kmp_calc_new_bounds_XX(676/* in/out*/ bounds_info_internalXX_template<T> *bounds,677/* in/out*/ bounds_info_internal_t *bounds_nest) {678679auto &bbounds = bounds->b;680681if (bbounds.lb1 == bbounds.ub1) {682// Already parallel, no need to adjust:683bounds->loop_bounds_adjusted = false;684} else {685bounds->loop_bounds_adjusted = true;686687T old_lb1 = bbounds.lb1;688T old_ub1 = bbounds.ub1;689690if (__kmp_sign(old_lb1) != 
__kmp_sign(old_ub1)) {691// With this shape we can adjust to a rectangle:692bbounds.lb1 = 0;693bbounds.ub1 = 0;694} else {695// get upper and lower bounds to be parallel696// with values in the old range.697// Note: abs didn't work here.698if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||699((old_lb1 > 0) && (old_lb1 > old_ub1))) {700bbounds.lb1 = old_ub1;701} else {702bbounds.ub1 = old_lb1;703}704}705706// Now need to adjust lb0, ub0, otherwise in some cases space will shrink.707// The idea here that for this IV we are now getting the same span708// irrespective of the previous IV value.709bounds_info_internalXX_template<T> *previous =710reinterpret_cast<bounds_info_internalXX_template<T> *>(711&bounds_nest[bbounds.outer_iv]);712713if (bbounds.comparison == comparison_t::comp_less_or_eq) {714if (old_lb1 < bbounds.lb1) {715KMP_ASSERT(old_lb1 < 0);716// The length is good on outer_iv biggest number,717// can use it to find where to move the lower bound:718719T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;720bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?721// e.g. it was 0?? 
(same below)722} else if (old_lb1 > bbounds.lb1) {723// still need to move lower bound:724T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;725bbounds.lb0 += add;726}727728if (old_ub1 > bbounds.ub1) {729KMP_ASSERT(old_ub1 > 0);730// The length is good on outer_iv biggest number,731// can use it to find where to move upper bound:732733T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;734bbounds.ub0 += add;735} else if (old_ub1 < bbounds.ub1) {736// still need to move upper bound:737T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;738bbounds.ub0 -= sub;739}740} else {741KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);742if (old_lb1 < bbounds.lb1) {743KMP_ASSERT(old_lb1 < 0);744T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;745bbounds.lb0 -= sub;746} else if (old_lb1 > bbounds.lb1) {747T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;748bbounds.lb0 += add;749}750751if (old_ub1 > bbounds.ub1) {752KMP_ASSERT(old_ub1 > 0);753T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;754bbounds.ub0 += add;755} else if (old_ub1 < bbounds.ub1) {756T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;757bbounds.ub0 -= sub;758}759}760}761}762763// Do all processing for one canonicalized loop in the nest764// (assuming that outer loops already were processed):765template <typename T>766kmp_loop_nest_iv_t kmp_process_one_loop_XX(767/* in/out*/ bounds_info_internalXX_template<T> *bounds,768/*in/out*/ bounds_info_internal_t *bounds_nest) {769770kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);771kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);772return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));773}774775// Non-rectangular loop nest, canonicalized to use <= or >=.776// Process loop nest to have a parallelepiped shape,777// calculate biggest spans for IV's on all levels and calculate overall trip778// count. 
"bounds_nest" has to be allocated per thread.779// Returns overall trip count (for adjusted space).780kmp_loop_nest_iv_t kmp_process_loop_nest(781/*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {782783kmp_loop_nest_iv_t total = 1;784785for (kmp_index_t ind = 0; ind < n; ++ind) {786auto bounds = &(bounds_nest[ind]);787kmp_loop_nest_iv_t trip_count = 0;788789switch (bounds->b.loop_type) {790case loop_type_t::loop_type_int32:791trip_count = kmp_process_one_loop_XX<kmp_int32>(792/*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),793/*in/out*/ bounds_nest);794break;795case loop_type_t::loop_type_uint32:796trip_count = kmp_process_one_loop_XX<kmp_uint32>(797/*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),798/*in/out*/ bounds_nest);799break;800case loop_type_t::loop_type_int64:801trip_count = kmp_process_one_loop_XX<kmp_int64>(802/*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),803/*in/out*/ bounds_nest);804break;805case loop_type_t::loop_type_uint64:806trip_count = kmp_process_one_loop_XX<kmp_uint64>(807/*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),808/*in/out*/ bounds_nest);809break;810default:811KMP_ASSERT(false);812}813total *= trip_count;814}815816return total;817}818819//----------Calculate iterations (in the original or updated space)-----------820821// Calculate number of iterations in original or updated space resulting in822// original_ivs[ind] (only on this level, non-negative)823// (not counting initial iteration)824template <typename T>825kmp_loop_nest_iv_t826kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,827const kmp_point_t original_ivs,828kmp_index_t ind) {829830kmp_loop_nest_iv_t iterations = 0;831832if (bounds->comparison == comparison_t::comp_less_or_eq) {833iterations =834(static_cast<T>(original_ivs[ind]) - bounds->lb0 -835bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /836__kmp_abs(bounds->step);837} else 
{838KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);839iterations = (bounds->lb0 +840bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -841static_cast<T>(original_ivs[ind])) /842__kmp_abs(bounds->step);843}844845return iterations;846}847848// Calculate number of iterations in the original or updated space resulting in849// original_ivs[ind] (only on this level, non-negative)850kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,851const kmp_point_t original_ivs,852kmp_index_t ind) {853854switch (bounds->loop_type) {855case loop_type_t::loop_type_int32:856return kmp_calc_number_of_iterations_XX<kmp_int32>(857(bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);858break;859case loop_type_t::loop_type_uint32:860return kmp_calc_number_of_iterations_XX<kmp_uint32>(861(bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);862break;863case loop_type_t::loop_type_int64:864return kmp_calc_number_of_iterations_XX<kmp_int64>(865(bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);866break;867case loop_type_t::loop_type_uint64:868return kmp_calc_number_of_iterations_XX<kmp_uint64>(869(bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);870break;871default:872KMP_ASSERT(false);873return 0;874}875}876877//----------Calculate new iv corresponding to original ivs--------------------878879// We got a point in the original loop nest.880// Take updated bounds and calculate what new_iv will correspond to this point.881// When we are getting original IVs from new_iv, we have to adjust to fit into882// original loops bounds. 
// Getting new_iv for the adjusted original IVs will help
// with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    // Mixed-radix (Horner-style) linearization: each level contributes its
    // iteration number scaled by the product of the inner trip counts.
    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
// Returns false if the requested point does not exist in the original space
// (i.e. the space is empty after exhausting all outer-level adjustments).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  // Index of the outermost level whose iteration count was just bumped;
  // levels inside it restart from their lower bound ((lengthened_ind < ind)
  // below).
  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // Space is empty:
        return false;
      } else {
        // Go to next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        // All levels inside the bumped one restart from zero iterations:
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: lower bounds of all loops may not work -
// if on some of the iterations of the outer loops inner loops are empty.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From current set of original IVs calculate next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  // Start from the current point (outer levels may stay unchanged):
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the inner-most level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Return true if it fits into upper bound on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  //     big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, may as well use
    // the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in expanded space, but:
    // - we need to hit original space lower bound, so need to account for
    // that
    // - we have to go into original space, even if that means adding more
    // iterations than was planned
    // - we have to go past (or equal to) previous point (which is the chunk
    // starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use
      // heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      // Normalize to the loop's IV type before comparing:
      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels start of the chunk is same as the end, need to
      // be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // End of the chunk can't be smaller (for >= bigger) than it's start.
        // Use heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Type-dispatch wrapper over kmp_calc_one_iv_for_chunk_end_XX.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate old induction variables corresponding to overall new_iv for the
// chunk end. If due to space extension we are getting old IVs outside of the
// boundaries, bring them into the boundaries. Need to do this in the runtime,
// esp. on the lower bounds side. When getting result need to make sure that the
// new chunk starts at next position to old chunk, not overlaps with it (this is
// done elsewhere), and need to make sure end of the chunk is further than the
// beginning of the chunk. We don't need an exact ending point here, just
// something more-or-less close to the desired chunk length, bigger is fine
// (smaller would be fine, but we risk going into infinite loop, so do smaller
// only at the very end of the space). result: false if could not find the
// ending point in the original loop space. In this case the caller can use
// original upper bounds as the end of the chunk.
// Chunk won't be empty, because
// it'll have at least the starting point, which is by construction in the
// original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every modified loop
  // (decompose new_iv digit-by-digit, innermost level first):
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  // After consuming all levels nothing may remain of new_iv:
  KMP_DEBUG_ASSERT(new_iv == 0);

  // lengthened_ind: outermost level whose iteration count was bumped while
  // backtracking; equal_ind: deepest level (so far) on which the calculated
  // point still coincides with original_ivs_start.
  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // can't be same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    // Track how deep the calculated point still equals the chunk start:
    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  // ub0 + ub1 * (IV of the loop this one depends on):
  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

// Type-dispatch wrapper over kmp_calc_one_iv_end_XX.
void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use original upper
// bounds (adjusted when canonicalized to use <= / >=).
No need to check that1264// this point is in the original space (it's likely not)1265void kmp_calc_original_ivs_for_end(1266const bounds_info_t *const original_bounds_nest, kmp_index_t n,1267/*out*/ kmp_point_t original_ivs) {1268for (kmp_index_t ind = 0; ind < n; ++ind) {1269auto bounds = &(original_bounds_nest[ind]);1270kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);1271}1272}12731274/**************************************************************************1275* Identify nested loop structure - loops come in the canonical form1276* Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}1277* j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}1278* Upper Triangle matrix1279* i = 0; i <= N; i++ {0,0}:{N,0}1280* j = 0+1*i; j <= N; j++ {0,1}:{N,0}1281* ************************************************************************/1282nested_loop_type_t1283kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,1284/*in*/ kmp_index_t n) {1285// only 2-level nested loops are supported1286if (n != 2) {1287return nested_loop_type_unkown;1288}1289// loops must be canonical1290KMP_ASSERT(1291(original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&1292(original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));1293// check outer loop bounds: for triangular need to be {0,0}:{N,0}1294kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1295original_bounds_nest[0].lb0_u64);1296kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1297original_bounds_nest[0].ub0_u64);1298kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1299original_bounds_nest[0].lb1_u64);1300kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1301original_bounds_nest[0].ub1_u64);1302if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {1303return nested_loop_type_unkown;1304}1305// check inner bounds to determine triangle type1306kmp_uint64 inner_lb0_u64 = 
kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1307original_bounds_nest[1].lb0_u64);1308kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1309original_bounds_nest[1].ub0_u64);1310kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1311original_bounds_nest[1].lb1_u64);1312kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1313original_bounds_nest[1].ub1_u64);1314// lower triangle loop inner bounds need to be {0,0}:{0/-1,1}1315if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&1316(inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {1317return nested_loop_type_lower_triangular_matrix;1318}1319// upper triangle loop inner bounds need to be {0,1}:{N,0}1320if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&1321inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {1322return nested_loop_type_upper_triangular_matrix;1323}1324return nested_loop_type_unkown;1325}13261327/**************************************************************************1328* SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf1329* Start point is x so the result is always > sqrt(x)1330* The method has uniform convergence, PRECISION is set to 0.11331* ************************************************************************/1332#define level_of_precision 0.11333double sqrt_newton_approx(/*in*/ kmp_uint64 x) {1334double sqrt_old = 0.;1335double sqrt_new = (double)x;1336do {1337sqrt_old = sqrt_new;1338sqrt_new = (sqrt_old + x / sqrt_old) / 2;1339} while ((sqrt_old - sqrt_new) > level_of_precision);1340return sqrt_new;1341}13421343/**************************************************************************1344* Handle lower triangle matrix in the canonical form1345* i = 0; i <= N; i++ {0,0}:{N,0}1346* j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}1347* ************************************************************************/1348void kmp_handle_lower_triangle_matrix(1349/*in*/ kmp_uint32 nth,1350/*in*/ 
kmp_uint32 tid,1351/*in */ kmp_index_t n,1352/*in/out*/ bounds_info_t *original_bounds_nest,1353/*out*/ bounds_info_t *chunk_bounds_nest) {13541355// transfer loop types from the original loop to the chunks1356for (kmp_index_t i = 0; i < n; ++i) {1357chunk_bounds_nest[i] = original_bounds_nest[i];1358}1359// cleanup iv variables1360kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1361original_bounds_nest[0].ub0_u64);1362kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1363original_bounds_nest[0].lb0_u64);1364kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,1365original_bounds_nest[1].ub0_u64);1366// calculate the chunk's lower and upper bounds1367// the total number of iterations in the loop is the sum of the arithmetic1368// progression from the outer lower to outer upper bound (inclusive since the1369// loop is canonical) note that less_than inner loops (inner_ub0 = -1)1370// effectively make the progression 1-based making N = (outer_ub0 - inner_lb01371// + 1) -> N - 11372kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;1373kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;1374// the current thread's number of iterations:1375// each thread gets an equal number of iterations: total number of iterations1376// divided by the number of threads plus, if there's a remainder,1377// the first threads with the number up to the remainder get an additional1378// iteration each to cover it1379kmp_uint64 iter_current =1380iter_total / nth + ((tid < (iter_total % nth)) ? 
1 : 0);1381// cumulative number of iterations executed by all the previous threads:1382// threads with the tid below the remainder will have (iter_total/nth+1)1383// elements, and so will all threads before them so the cumulative number of1384// iterations executed by the all previous will be the current thread's number1385// of iterations multiplied by the number of previous threads which is equal1386// to the current thread's tid; threads with the number equal or above the1387// remainder will have (iter_total/nth) elements so the cumulative number of1388// iterations previously executed is its number of iterations multipled by the1389// number of previous threads which is again equal to the current thread's tid1390// PLUS all the remainder iterations that will have been executed by the1391// previous threads1392kmp_uint64 iter_before_current =1393tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));1394// cumulative number of iterations executed with the current thread is1395// the cumulative number executed before it plus its own1396kmp_uint64 iter_with_current = iter_before_current + iter_current;1397// calculate the outer loop lower bound (lbo) which is the max outer iv value1398// that gives the number of iterations that is equal or just below the total1399// number of iterations executed by the previous threads, for less_than1400// (1-based) inner loops (inner_ub0 == -1) it will be i.e.1401// lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=01402// for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:1403// i.e. 
lbo*(lbo+1)/2<=iter_before_current =>1404// lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily1405// using a parameter to control the equation sign1406kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;1407kmp_uint64 lower_bound_outer =1408(kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +14098 * iter_before_current) +1410inner_adjustment) /14112 -1412inner_adjustment;1413// calculate the inner loop lower bound which is the remaining number of1414// iterations required to hit the total number of iterations executed by the1415// previous threads giving the starting point of this thread1416kmp_uint64 lower_bound_inner =1417iter_before_current -1418((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;1419// calculate the outer loop upper bound using the same approach as for the1420// inner bound except using the total number of iterations executed with the1421// current thread1422kmp_uint64 upper_bound_outer =1423(kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +14248 * iter_with_current) +1425inner_adjustment) /14262 -1427inner_adjustment;1428// calculate the inner loop upper bound which is the remaining number of1429// iterations required to hit the total number of iterations executed after1430// the current thread giving the starting point of the next thread1431kmp_uint64 upper_bound_inner =1432iter_with_current -1433((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;1434// adjust the upper bounds down by 1 element to point at the last iteration of1435// the current thread the first iteration of the next thread1436if (upper_bound_inner == 0) {1437// {n,0} => {n-1,n-1}1438upper_bound_outer -= 1;1439upper_bound_inner = upper_bound_outer;1440} else {1441// {n,m} => {n,m-1} (m!=0)1442upper_bound_inner -= 1;1443}14441445// assign the values, zeroing out lb1 and ub1 values since the iteration space1446// is now one-dimensional1447chunk_bounds_nest[0].lb0_u64 = 
lower_bound_outer;1448chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;1449chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;1450chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;1451chunk_bounds_nest[0].lb1_u64 = 0;1452chunk_bounds_nest[0].ub1_u64 = 0;1453chunk_bounds_nest[1].lb1_u64 = 0;1454chunk_bounds_nest[1].ub1_u64 = 0;14551456#if 01457printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",1458tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,1459chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);1460#endif1461}14621463/**************************************************************************1464* Handle upper triangle matrix in the canonical form1465* i = 0; i <= N; i++ {0,0}:{N,0}1466* j = 0+1*i; j <= N; j++ {0,1}:{N,0}1467* ************************************************************************/1468void kmp_handle_upper_triangle_matrix(1469/*in*/ kmp_uint32 nth,1470/*in*/ kmp_uint32 tid,1471/*in */ kmp_index_t n,1472/*in/out*/ bounds_info_t *original_bounds_nest,1473/*out*/ bounds_info_t *chunk_bounds_nest) {14741475// transfer loop types from the original loop to the chunks1476for (kmp_index_t i = 0; i < n; ++i) {1477chunk_bounds_nest[i] = original_bounds_nest[i];1478}1479// cleanup iv variables1480kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1481original_bounds_nest[0].ub0_u64);1482kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,1483original_bounds_nest[0].lb0_u64);1484[[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(1485original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);1486// calculate the chunk's lower and upper bounds1487// the total number of iterations in the loop is the sum of the arithmetic1488// progression from the outer lower to outer upper bound (inclusive since the1489// loop is canonical) note that less_than inner loops (inner_ub0 = -1)1490// effectively make the progression 1-based making N 
= (outer_ub0 - inner_lb01491// + 1) -> N - 11492kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);1493kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;1494// the current thread's number of iterations:1495// each thread gets an equal number of iterations: total number of iterations1496// divided by the number of threads plus, if there's a remainder,1497// the first threads with the number up to the remainder get an additional1498// iteration each to cover it1499kmp_uint64 iter_current =1500iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);1501// cumulative number of iterations executed by all the previous threads:1502// threads with the tid below the remainder will have (iter_total/nth+1)1503// elements, and so will all threads before them so the cumulative number of1504// iterations executed by the all previous will be the current thread's number1505// of iterations multiplied by the number of previous threads which is equal1506// to the current thread's tid; threads with the number equal or above the1507// remainder will have (iter_total/nth) elements so the cumulative number of1508// iterations previously executed is its number of iterations multipled by the1509// number of previous threads which is again equal to the current thread's tid1510// PLUS all the remainder iterations that will have been executed by the1511// previous threads1512kmp_uint64 iter_before_current =1513tid * iter_current + ((tid < iter_total % nth) ? 
0 : (iter_total % nth));1514// cumulative number of iterations executed with the current thread is1515// the cumulative number executed before it plus its own1516kmp_uint64 iter_with_current = iter_before_current + iter_current;1517// calculate the outer loop lower bound (lbo) which is the max outer iv value1518// that gives the number of iterations that is equal or just below the total1519// number of iterations executed by the previous threads:1520// lbo*(lbo+1)/2<=iter_before_current =>1521// lbo^2+lbo-2*iter_before_current<=01522kmp_uint64 lower_bound_outer =1523(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;1524// calculate the inner loop lower bound which is the remaining number of1525// iterations required to hit the total number of iterations executed by the1526// previous threads giving the starting point of this thread1527kmp_uint64 lower_bound_inner =1528iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;1529// calculate the outer loop upper bound using the same approach as for the1530// inner bound except using the total number of iterations executed with the1531// current thread1532kmp_uint64 upper_bound_outer =1533(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;1534// calculate the inner loop upper bound which is the remaining number of1535// iterations required to hit the total number of iterations executed after1536// the current thread giving the starting point of the next thread1537kmp_uint64 upper_bound_inner =1538iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;1539// adjust the upper bounds down by 1 element to point at the last iteration of1540// the current thread the first iteration of the next thread1541if (upper_bound_inner == 0) {1542// {n,0} => {n-1,n-1}1543upper_bound_outer -= 1;1544upper_bound_inner = upper_bound_outer;1545} else {1546// {n,m} => {n,m-1} (m!=0)1547upper_bound_inner -= 1;1548}15491550// assign the values, zeroing out lb1 and ub1 
values since the iteration space1551// is now one-dimensional1552chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;1553chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;1554chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;1555chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;1556chunk_bounds_nest[0].lb1_u64 = 0;1557chunk_bounds_nest[0].ub1_u64 = 0;1558chunk_bounds_nest[1].lb1_u64 = 0;1559chunk_bounds_nest[1].ub1_u64 = 0;15601561#if 01562printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",1563tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,1564chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);1565#endif1566}1567//----------Init API for non-rectangular loops--------------------------------15681569// Init API for collapsed loops (static, no chunks defined).1570// "bounds_nest" has to be allocated per thread.1571// API will modify original bounds_nest array to bring it to a canonical form1572// (only <= and >=, no !=, <, >). If the original loop nest was already in a1573// canonical form there will be no changes to bounds in bounds_nest array1574// (only trip counts will be calculated). Internally API will expand the space1575// to parallelogram/parallelepiped, calculate total, calculate bounds for the1576// chunks in terms of the new IV, re-calc them in terms of old IVs (especially1577// important on the left side, to hit the lower bounds and not step over), and1578// pick the correct chunk for this thread (so it will calculate chunks up to the1579// needed one). It could be optimized to calculate just this chunk, potentially1580// a bit less well distributed among threads. 
// It is designed to make sure that
// threads will receive predictable chunks, deterministically (so that next nest
// of loops with similar characteristics will get exactly same chunks on same
// threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0,
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  // Bring the nest to canonical form (<= / >=) and compute trip counts:
  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  // Expand the space to a parallelepiped and get the overall trip count:
  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // Loop won't execute:
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  // Handle special cases (2-level triangular nests get a closed-form split):
  nested_loop_type_t loop_type =
      kmp_identify_nested_loop_structure(original_bounds_nest, n);
  if (loop_type == nested_loop_type_lower_triangular_matrix) {
    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  }

  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // Loop won't execute:
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test
  // (2) without it current contract that chunk_bounds_nest has only lb0 and
  // ub0, lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //   // One thread:
  //   // Copy all info from original_bounds_nest, it'll be good enough.

  //   for (kmp_index_t i = 0; i < n; ++i) {
  //     chunk_bounds_nest[i] = original_bounds_nest[i];
  //   }

  //   if (plastiter != NULL) {
  //     *plastiter = TRUE;
  //   }
  //   return TRUE;
  //}

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  // Walk the chunks from thread 0 up to this thread's tid; each pass
  // computes one chunk and either returns it (tid == 0 after decrements)
  // or advances past it:
  for (; nth > 0;) {
    // We could calculate chunk size once, but this is to compensate that the
    // original space is not parallelepiped and some threads can be left
    // without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was last chunk, but no chunk for current thread was
      // found:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // no more loop iterations left to process,
        // this means that currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // no more loop iterations to process,
      // the previous chunk was the last chunk
      break;
    }

    // original_ivs_start is next to previous chunk original_ivs_end,
    // we need to start new chunk here, so chunks will be one after another
    // without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  return FALSE;
}