GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_collapse.cpp
/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// avoid inadvertently using a library-based abs
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};
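
// Note (illustrative, not part of the upstream source): CollapseAllocator is a
// small-buffer-optimized RAII array. For n <= 32 / sizeof(T) elements it hands
// out its internal stack buffer; otherwise it falls back to __kmp_allocate /
// __kmp_free. A minimal usage sketch, assuming a 3-deep nest:
//
//   CollapseAllocator<kmp_loop_nest_iv_t> iterations(3); // stack-backed here,
//   iterations[0] = 0;           // since 3 * sizeof(kmp_loop_nest_iv_t) <= 32
//   // the heap path is taken only when n * sizeof(T) exceeds allocaSize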

//----------Loop canonicalization---------------------------------------------

// For loop nest (any shape):
// convert != to < or >;
// switch from using < or > to <= or >=.
// "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. Should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed).
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}
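
// Illustrative example (not part of the upstream source): a user loop
//   for (int i = 0; i != 10; i += 2)   // comparison !=, step 2
// is canonicalized in two steps: != with a positive step becomes <, and then
// < becomes <= with ub0 decremented, yielding
//   i = 0; i <= 9; i += 2              // comp_less_or_eq
// after which all internal functions can assume only <= / >= comparisons.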

// Canonicalize loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use span for this, it's too big.

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}
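
// Illustrative example (not part of the upstream source): for the
// canonicalized loop i = 0; i <= 9; i += 2 the formula above gives
//   trip_count = (9 - 0) / |2| + 1 = 5,
// i.e. the iterations i = 0, 2, 4, 6, 8.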

// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim original iv according to its type.
// Returns a kmp_uint64 value which can be used easily in all internal
// calculations, and can be statically cast back to the original type in user
// code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
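
// Illustrative example (not part of the upstream source): for loop_type_int8
// the raw value 0xFF is first narrowed to kmp_int8 (-1) and then widened back
// to kmp_uint64, so kmp_fix_iv(loop_type_t::loop_type_int8, 0xFF) yields
// 0xFFFFFFFFFFFFFFFF. This sign extension is what makes a later static_cast
// back to the original IV type in user code round-trip correctly.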

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into the upper bounds on this level,
// false otherwise.
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of the loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into the lower-upper bounds on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we should
    // start with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim the original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind.
// (No bound checking is needed here; for a rectangular nest the iteration
// numbers are always within bounds.)
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim the original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate overall trip count.
// "bounds_nest" has to be allocated per thread.
// The API modifies the original bounds_nest array to bring it to a canonical
// form (only <= and >=; no !=, <, >). If the original loop nest was already
// in canonical form, the bounds in the bounds_nest array are left unchanged
// (only trip counts are calculated).
// Returns the trip count of the overall iteration space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}

//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to overall new_iv.
// Note: original IV will be returned as if it had kmp_uint64 type,
// will have to be converted to original type in user code.
// Note: trip counts should be already calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do different
// interface without array or possibly template this over n
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);

  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
}
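
// Illustrative example (not part of the upstream source): for a collapsed
// 2-loop nest with trip counts {4, 3}, the flat index new_iv = 7 is
// decomposed innermost-first, exactly like the digits of a mixed-radix
// number:
//   iterations[1] = 7 % 3 = 1;  new_iv = 7 / 3 = 2;
//   iterations[0] = 2 % 4 = 2;  new_iv = 2 / 4 = 0;
// so new_iv == 7 corresponds to iteration 2 of the outer loop and
// iteration 1 of the inner loop.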

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate span for IV on this loop level for "<=" case.
// Note: it's for <= on this loop nest level, so the lower bound should be the
// smallest value and the upper bound should be the biggest value. If the loop
// won't execute, 'smallest' may be bigger than 'biggest', but we'd better not
// switch them around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because the
      // lower bound might be off after adjustments.

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate span for IV on this loop level for ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because the
      // lower bound might be off after adjustments.

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate maximum possible span for IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}
668
669
//----------All initial processing of the loop nest---------------------------
670
671
// Calculate new bounds for this loop level.
672
// To be able to work with the nest we need to get it to a parallelepiped shape.
673
// We need to stay in the original range of values, so that there will be no
674
// overflow, for that we'll adjust both upper and lower bounds as needed.
675
template <typename T>
676
void kmp_calc_new_bounds_XX(
677
/* in/out*/ bounds_info_internalXX_template<T> *bounds,
678
/* in/out*/ bounds_info_internal_t *bounds_nest) {
679
680
auto &bbounds = bounds->b;
681
682
if (bbounds.lb1 == bbounds.ub1) {
683
// Already parallel, no need to adjust:
684
bounds->loop_bounds_adjusted = false;
685
} else {
686
bounds->loop_bounds_adjusted = true;
687
688
T old_lb1 = bbounds.lb1;
689
T old_ub1 = bbounds.ub1;
690
691
if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
692
// With this shape we can adjust to a rectangle:
693
bbounds.lb1 = 0;
694
bbounds.ub1 = 0;
695
} else {
696
// get upper and lower bounds to be parallel
697
// with values in the old range.
698
// Note: abs didn't work here.
699
if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
700
((old_lb1 > 0) && (old_lb1 > old_ub1))) {
701
bbounds.lb1 = old_ub1;
702
} else {
703
bbounds.ub1 = old_lb1;
704
}
705
}
706
707
// Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
708
// The idea here that for this IV we are now getting the same span
709
// irrespective of the previous IV value.
710
bounds_info_internalXX_template<T> *previous =
711
reinterpret_cast<bounds_info_internalXX_template<T> *>(
712
&bounds_nest[bbounds.outer_iv]);
713
714
if (bbounds.comparison == comparison_t::comp_less_or_eq) {
715
if (old_lb1 < bbounds.lb1) {
716
KMP_ASSERT(old_lb1 < 0);
717
// The length is good on outer_iv biggest number,
718
// can use it to find where to move the lower bound:
719
720
T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
721
bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
722
// e.g. it was 0?? (same below)
723
} else if (old_lb1 > bbounds.lb1) {
724
// still need to move lower bound:
725
T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
726
bbounds.lb0 += add;
727
}
728
729
if (old_ub1 > bbounds.ub1) {
730
KMP_ASSERT(old_ub1 > 0);
731
// The length is good on outer_iv biggest number,
732
// can use it to find where to move upper bound:
733
734
T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
735
bbounds.ub0 += add;
736
} else if (old_ub1 < bbounds.ub1) {
737
// still need to move upper bound:
738
T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
739
bbounds.ub0 -= sub;
740
}
741
} else {
742
KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
743
if (old_lb1 < bbounds.lb1) {
744
KMP_ASSERT(old_lb1 < 0);
745
T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
746
bbounds.lb0 -= sub;
747
} else if (old_lb1 > bbounds.lb1) {
748
T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
749
bbounds.lb0 += add;
750
}
751
752
if (old_ub1 > bbounds.ub1) {
753
KMP_ASSERT(old_ub1 > 0);
754
T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
755
bbounds.ub0 += add;
756
} else if (old_ub1 < bbounds.ub1) {
757
T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
758
bbounds.ub0 -= sub;
759
}
760
}
761
}
762
}
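
// Illustrative example (not part of the upstream source): for the lower
// triangular nest i = 0..N, j = 0..i (i.e. lb1 = 0, ub1 = 1 on the inner
// loop), __kmp_sign(lb1) != __kmp_sign(ub1), so the branch above sets
// lb1 = ub1 = 0 and then compensates by growing ub0 by
// (old_ub1 - 0) * previous->span_biggest = N, turning the triangle into the
// enclosing rectangle j = 0..N. Points that fall outside the original
// triangle are filtered out later, when IVs are mapped back into the
// original space.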

// Do all processing for one canonicalized loop in the nest
// (assuming that the outer loops have already been processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process loop nest to have a parallelepiped shape,
// calculate the biggest spans for IVs on all levels and calculate the overall
// trip count. "bounds_nest" has to be allocated per thread.
// Returns the overall trip count (for the adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative,
// not counting the initial iteration).
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}
848
849
// Calculate number of iterations in the original or updated space resulting in
850
// original_ivs[ind] (only on this level, non-negative)
851
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
852
const kmp_point_t original_ivs,
853
kmp_index_t ind) {
854
855
switch (bounds->loop_type) {
856
case loop_type_t::loop_type_int32:
857
return kmp_calc_number_of_iterations_XX<kmp_int32>(
858
(bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
859
break;
860
case loop_type_t::loop_type_uint32:
861
return kmp_calc_number_of_iterations_XX<kmp_uint32>(
862
(bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
863
break;
864
case loop_type_t::loop_type_int64:
865
return kmp_calc_number_of_iterations_XX<kmp_int64>(
866
(bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
867
break;
868
case loop_type_t::loop_type_uint64:
869
return kmp_calc_number_of_iterations_XX<kmp_uint64>(
870
(bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
871
break;
872
default:
873
KMP_ASSERT(false);
874
return 0;
875
}
876
}
877
878
//----------Calculate new iv corresponding to original ivs--------------------
879
880
// We got a point in the original loop nest.
881
// Take updated bounds and calculate what new_iv will correspond to this point.
882
// When we are getting original IVs from new_iv, we have to adjust to fit into
883
// original loops bounds. Getting new_iv for the adjusted original IVs will help
884
// with making more chunks non-empty.
885
kmp_loop_nest_iv_t
886
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
887
const kmp_point_t original_ivs,
888
kmp_index_t n) {
889
890
kmp_loop_nest_iv_t new_iv = 0;
891
892
for (kmp_index_t ind = 0; ind < n; ++ind) {
893
auto bounds = &(bounds_nest[ind].b);
894
895
new_iv = new_iv * bounds->trip_count +
896
kmp_calc_number_of_iterations(bounds, original_ivs, ind);
897
}
898
899
return new_iv;
900
}
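
// Illustrative example (not part of the upstream source): this is the inverse
// of the decomposition in __kmpc_calc_original_ivs_rectang. With trip counts
// {4, 3} and per-level iterations {2, 1}:
//   new_iv = (0 * 4 + 2) * 3 + 1 = 7,
// recovering the flat index from its mixed-radix digits.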

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // Space is empty:
        return false;
      } else {
        // Go to next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: simply taking the lower bounds of all loops may not work -
// inner loops may be empty on some iterations of the outer loops.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From the current set of original IVs, calculate the next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calculate the corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the innermost level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level, calculate the old induction variable corresponding to
// overall new_iv for the chunk end.
// Return true if it fits into the upper bound on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  //     big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops; may as well
    // use the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in the expanded space, but:
    // - we need to hit the original space lower bound, so need to account for
    //   that
    // - we have to go into the original space, even if that means adding more
    //   iterations than was planned
    // - we have to go past (or equal to) the previous point (which is the
    //   chunk starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use a
      // heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels the start of the chunk is the same as its end,
      // so we need to be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // The end of the chunk can't be smaller (for >=, bigger) than its
        // start. Use a heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level, calculate the old induction variable corresponding to
// overall new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate the old induction variables corresponding to overall new_iv for
// the chunk end. If due to the space extension we get old IVs outside of the
// boundaries, bring them back into the boundaries. This needs to be done in
// the runtime, especially on the lower-bounds side. When producing the result
// we need to make sure that the new chunk starts at the next position after
// the old chunk, not overlapping with it (this is done elsewhere), and that
// the end of the chunk is no earlier than its beginning. We don't need an
// exact ending point here, just something more-or-less close to the desired
// chunk length; bigger is fine (smaller would be fine too, but we'd risk
// going into an infinite loop, so go smaller only at the very end of the
// space). Result: false if we could not find the ending point in the original
// loop space. In this case the caller can use the original upper bounds as
// the end of the chunk. The chunk won't be empty, because it'll have at least
// the starting point, which is by construction in the original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calculate the corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in the original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // so it can't be the same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}
1218
1219
//----------Calculate upper bounds for the last chunk-------------------------
1220
1221
// Calculate one upper bound for the end.
1222
template <typename T>
1223
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
1224
/*in/out*/ kmp_point_t original_ivs,
1225
kmp_index_t ind) {
1226
1227
T temp = bounds->ub0 +
1228
bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
1229
1230
original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
1231
}
1232
1233
void kmp_calc_one_iv_end(const bounds_info_t *bounds,
1234
/*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {
1235
1236
switch (bounds->loop_type) {
1237
default:
1238
KMP_ASSERT(false);
1239
break;
1240
case loop_type_t::loop_type_int32:
1241
kmp_calc_one_iv_end_XX<kmp_int32>(
1242
(bounds_infoXX_template<kmp_int32> *)(bounds),
1243
/*in/out*/ original_ivs, ind);
1244
break;
1245
case loop_type_t::loop_type_uint32:
1246
kmp_calc_one_iv_end_XX<kmp_uint32>(
1247
(bounds_infoXX_template<kmp_uint32> *)(bounds),
1248
/*in/out*/ original_ivs, ind);
1249
break;
1250
case loop_type_t::loop_type_int64:
1251
kmp_calc_one_iv_end_XX<kmp_int64>(
1252
(bounds_infoXX_template<kmp_int64> *)(bounds),
1253
/*in/out*/ original_ivs, ind);
1254
break;
1255
case loop_type_t::loop_type_uint64:
1256
kmp_calc_one_iv_end_XX<kmp_uint64>(
1257
(bounds_infoXX_template<kmp_uint64> *)(bounds),
1258
/*in/out*/ original_ivs, ind);
1259
break;
1260
}
1261
}
1262
1263
// Calculate upper bounds for the last loop iteration. Just use original upper
1264
// bounds (adjusted when canonicalized to use <= / >=). No need to check that
1265
// this point is in the original space (it's likely not)
1266
void kmp_calc_original_ivs_for_end(
1267
const bounds_info_t *const original_bounds_nest, kmp_index_t n,
1268
/*out*/ kmp_point_t original_ivs) {
1269
for (kmp_index_t ind = 0; ind < n; ++ind) {
1270
auto bounds = &(original_bounds_nest[ind]);
1271
kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
1272
}
1273
}
1274
1275
/**************************************************************************
1276
* Identify nested loop structure - loops come in the canonical form
1277
* Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}
1278
* j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
1279
* Upper Triangle matrix
1280
* i = 0; i <= N; i++ {0,0}:{N,0}
1281
* j = 0+1*i; j <= N; j++ {0,1}:{N,0}
1282
* ************************************************************************/
1283
nested_loop_type_t
1284
kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
1285
/*in*/ kmp_index_t n) {
1286
// only 2-level nested loops are supported
1287
if (n != 2) {
1288
return nested_loop_type_unkown;
1289
}
1290
// loops must be canonical
1291
KMP_ASSERT(
1292
(original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
1293
(original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
1294
// check outer loop bounds: for triangular need to be {0,0}:{N,0}
1295
kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1296
original_bounds_nest[0].lb0_u64);
1297
kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1298
original_bounds_nest[0].ub0_u64);
1299
kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1300
original_bounds_nest[0].lb1_u64);
1301
kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1302
original_bounds_nest[0].ub1_u64);
1303
if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
1304
return nested_loop_type_unkown;
1305
}
1306
// check inner bounds to determine triangle type
1307
kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1308
original_bounds_nest[1].lb0_u64);
1309
kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1310
original_bounds_nest[1].ub0_u64);
1311
kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1312
original_bounds_nest[1].lb1_u64);
1313
kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1314
original_bounds_nest[1].ub1_u64);
1315
// lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
1316
if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
1317
(inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
1318
return nested_loop_type_lower_triangular_matrix;
1319
}
1320
// upper triangle loop inner bounds need to be {0,1}:{N,0}
1321
if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
1322
inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
1323
return nested_loop_type_upper_triangular_matrix;
1324
}
1325
return nested_loop_type_unkown;
1326
}
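
// Illustrative example (not part of the upstream source): the collapse(2)
// nest
//   for (i = 0; i <= N; i++)
//     for (j = 0; j <= i; j++)   // inner bounds {0,0}:{0,1}
// is classified as nested_loop_type_lower_triangular_matrix, while
//   for (i = 0; i <= N; i++)
//     for (j = i; j <= N; j++)   // inner bounds {0,1}:{N,0}
// is classified as nested_loop_type_upper_triangular_matrix.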
1327
1328
/**************************************************************************
1329
* SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
1330
* Start point is x so the result is always > sqrt(x)
1331
* The method has uniform convergence, PRECISION is set to 0.1
1332
* ************************************************************************/
1333
#define level_of_precision 0.1
1334
double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
1335
double sqrt_old = 0.;
1336
double sqrt_new = (double)x;
1337
do {
1338
sqrt_old = sqrt_new;
1339
sqrt_new = (sqrt_old + x / sqrt_old) / 2;
1340
} while ((sqrt_old - sqrt_new) > level_of_precision);
1341
return sqrt_new;
1342
}
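
// Illustrative note (not part of the upstream source): this is the classic
// Newton/Babylonian iteration x_{n+1} = (x_n + x/x_n) / 2. Starting from
// x_0 = x the sequence decreases monotonically towards sqrt(x), so the loop
// exits with sqrt_new slightly above the true root. For example, for x = 16:
//   16 -> 8.5 -> 5.19 -> 4.14 -> 4.002 -> 4.000 (stop: last delta < 0.1)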

/**************************************************************************
 * Handle lower triangle matrix in the canonical form
 * i = 0; i <= N; i++        {0,0}:{N,0}
 * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
 * ************************************************************************/
void kmp_handle_lower_triangle_matrix(
    /*in*/ kmp_uint32 nth,
    /*in*/ kmp_uint32 tid,
    /*in */ kmp_index_t n,
    /*in/out*/ bounds_info_t *original_bounds_nest,
    /*out*/ bounds_info_t *chunk_bounds_nest) {

  // transfer loop types from the original loop to the chunks
  for (kmp_index_t i = 0; i < n; ++i) {
    chunk_bounds_nest[i] = original_bounds_nest[i];
  }
  // cleanup iv variables
  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].lb0_u64);
  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                    original_bounds_nest[1].ub0_u64);
  // calculate the chunk's lower and upper bounds:
  // the total number of iterations in the loop is the sum of the arithmetic
  // progression from the outer lower to the outer upper bound (inclusive,
  // since the loop is canonical). Note that less_than inner loops
  // (inner_ub0 = -1) effectively make the progression 1-based, turning
  // N = (outer_ub0 - inner_lb0 + 1) into N - 1.
  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
  // the current thread's number of iterations:
  // each thread gets an equal number of iterations: the total number of
  // iterations divided by the number of threads; plus, if there's a
  // remainder, the first threads with tid up to the remainder get an
  // additional iteration each to cover it
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
  // cumulative number of iterations executed by all the previous threads:
  // threads with tid below the remainder will have (iter_total/nth + 1)
  // elements, and so will all threads before them, so the cumulative number
  // of iterations executed by all previous threads is the current thread's
  // number of iterations multiplied by the number of previous threads, which
  // is equal to the current thread's tid; threads with tid equal to or above
  // the remainder will have (iter_total/nth) elements, so the cumulative
  // number of iterations previously executed is their number of iterations
  // multiplied by the number of previous threads (again equal to the current
  // thread's tid) PLUS all the remainder iterations that will have been
  // executed by the previous threads
  kmp_uint64 iter_before_current =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
  // the cumulative number of iterations executed with the current thread is
  // the cumulative number executed before it plus its own
  kmp_uint64 iter_with_current = iter_before_current + iter_current;
  // calculate the outer loop lower bound (lbo), which is the max outer iv
  // value that gives a number of iterations equal to or just below the total
  // number of iterations executed by the previous threads. For less_than
  // (1-based) inner loops (inner_ub0 == -1) it is
  // lbo*(lbo-1)/2 <= iter_before_current =>
  // lbo^2 - lbo - 2*iter_before_current <= 0;
  // for less_than_equal (0-based) inner loops (inner_ub == 0) it is
  // lbo*(lbo+1)/2 <= iter_before_current =>
  // lbo^2 + lbo - 2*iter_before_current <= 0.
  // Both cases can be handled similarly, using a parameter to control the
  // equation sign.
  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_before_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop lower bound, which is the remaining number of
  // iterations required to hit the total number of iterations executed by the
  // previous threads, giving the starting point of this thread
  kmp_uint64 lower_bound_inner =
      iter_before_current -
      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
  // calculate the outer loop upper bound using the same approach as for the
  // inner bound, except using the total number of iterations executed with
  // the current thread
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_with_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop upper bound, which is the remaining number of
  // iterations required to hit the total number of iterations executed after
  // the current thread, giving the starting point of the next thread
  kmp_uint64 upper_bound_inner =
      iter_with_current -
      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
  // adjust the upper bounds down by 1 element to point at the last iteration
  // of the current thread, not the first iteration of the next thread
  if (upper_bound_inner == 0) {
    // {n,0} => {n-1,n-1}
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // {n,m} => {n,m-1} (m!=0)
    upper_bound_inner -= 1;
  }

  // assign the values, zeroing out lb1 and ub1 values since the iteration
  // space is now one-dimensional
  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
#endif
}
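
// Illustrative example (not part of the upstream source): for the nest
// i = 0..3, j = 0..i (iter_total = 4*5/2 = 10) split across nth = 2 threads,
// each thread gets iter_current = 5 iterations. Thread 0 computes the chunk
// {0,0}:{2,1}, i.e. (0,0) (1,0) (1,1) (2,0) (2,1), and thread 1 the chunk
// {2,2}:{3,3}, i.e. (2,2) (3,0) (3,1) (3,2) (3,3); thread 1's raw end point
// {4,0} is adjusted down to {3,3} by the upper-bound fixup above.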
1463
1464
/**************************************************************************
1465
* Handle upper triangle matrix in the canonical form
1466
* i = 0; i <= N; i++ {0,0}:{N,0}
1467
* j = 0+1*i; j <= N; j++ {0,1}:{N,0}
1468
* ************************************************************************/
1469
void kmp_handle_upper_triangle_matrix(
1470
/*in*/ kmp_uint32 nth,
1471
/*in*/ kmp_uint32 tid,
1472
/*in */ kmp_index_t n,
1473
/*in/out*/ bounds_info_t *original_bounds_nest,
1474
/*out*/ bounds_info_t *chunk_bounds_nest) {
1475
1476
// transfer loop types from the original loop to the chunks
1477
for (kmp_index_t i = 0; i < n; ++i) {
1478
chunk_bounds_nest[i] = original_bounds_nest[i];
1479
}
1480
// cleanup iv variables
1481
kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1482
original_bounds_nest[0].ub0_u64);
1483
kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1484
original_bounds_nest[0].lb0_u64);
1485
[[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
1486
original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
1487
// calculate the chunk's lower and upper bounds
1488
// the total number of iterations in the loop is the sum of the arithmetic
1489
// progression from the outer lower to outer upper bound (inclusive since the
1490
// loop is canonical) note that less_than inner loops (inner_ub0 = -1)
1491
// effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
1492
// + 1) -> N - 1
1493
kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
1494
kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
  // the current thread's number of iterations:
  // each thread gets an equal number of iterations: the total number of
  // iterations divided by the number of threads; if there's a remainder,
  // the first threads numbered below the remainder get an additional
  // iteration each to cover it
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
  // cumulative number of iterations executed by all the previous threads:
  // threads with the tid below the remainder will have (iter_total/nth+1)
  // elements, and so will all threads before them, so the cumulative number
  // of iterations executed by all the previous threads is the current
  // thread's number of iterations multiplied by the number of previous
  // threads, which is equal to the current thread's tid; threads with the
  // number equal to or above the remainder will have (iter_total/nth)
  // elements, so the cumulative number of iterations previously executed is
  // its number of iterations multiplied by the number of previous threads
  // (again equal to the current thread's tid) PLUS all the remainder
  // iterations that will have been executed by the previous threads
  kmp_uint64 iter_before_current =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
  // cumulative number of iterations executed with the current thread is
  // the cumulative number executed before it plus its own
  kmp_uint64 iter_with_current = iter_before_current + iter_current;
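  // e.g. with iter_total = 10 and nth = 3: iter_current is 4, 3, 3 for
  // tid 0, 1, 2 respectively, iter_before_current is 0, 4, 7 and
  // iter_with_current is 4, 7, 10 - consecutive slices of the flattened
  // triangle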
  // calculate the outer loop lower bound (lbo) which is the max outer iv
  // value that gives the number of iterations equal to or just below the
  // total number of iterations executed by the previous threads:
  // lbo*(lbo+1)/2<=iter_before_current =>
  // lbo^2+lbo-2*iter_before_current<=0
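  // (the positive root gives lbo <= (-1 + sqrt(1 + 8*iter_before_current))/2;
  // the expression below computes it as (s + 1) / 2 - 1, which equals
  // (s - 1) / 2 in unsigned integer arithmetic since s >= 1)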
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 -
      1;
  // calculate the inner loop lower bound which is the remaining number of
  // iterations required to hit the total number of iterations executed by the
  // previous threads giving the starting point of this thread
  kmp_uint64 lower_bound_inner =
      iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
  // calculate the outer loop upper bound using the same approach as for the
  // inner bound except using the total number of iterations executed with the
  // current thread
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
  // calculate the inner loop upper bound which is the remaining number of
  // iterations required to hit the total number of iterations executed after
  // the current thread giving the starting point of the next thread
  kmp_uint64 upper_bound_inner =
      iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
  // adjust the upper bounds down by 1 element to point at the last iteration
  // of the current thread rather than the first iteration of the next thread
  if (upper_bound_inner == 0) {
    // {n,0} => {n-1,n-1}
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // {n,m} => {n,m-1} (m!=0)
    upper_bound_inner -= 1;
  }

  // assign the values, mirroring the bounds computed above (which index the
  // equivalent lower triangle) into the upper triangle via
  // (outer_iters - 1) - x, which also swaps the roles of the lower and upper
  // chunk ends; lb1 and ub1 are zeroed out since the iteration space is now
  // one-dimensional
  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
#endif
}
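
#if 0
// Standalone sketch (not part of the runtime build): mirrors the chunking
// arithmetic above so the partitioning can be checked in isolation. The
// plain-loop isqrt_demo below is an illustrative stand-in for
// sqrt_newton_approx, and the outer_iters/nth values are arbitrary; the
// printed {lb}..{ub} ranges should tile the triangle with no gaps or
// overlaps.
#include <cstdio>

static unsigned long long isqrt_demo(unsigned long long x) {
  unsigned long long r = 0;
  while ((r + 1) * (r + 1) <= x)
    ++r;
  return r;
}

int main() {
  const unsigned long long outer_iters = 5; // i = 0..4, j = i..4
  const unsigned long long nth = 3;
  unsigned long long iter_total = outer_iters * (outer_iters + 1) / 2; // 15
  for (unsigned long long tid = 0; tid < nth; ++tid) {
    unsigned long long iter_current =
        iter_total / nth + ((tid < iter_total % nth) ? 1 : 0);
    unsigned long long iter_before =
        tid * iter_current + ((tid < iter_total % nth) ? 0 : iter_total % nth);
    unsigned long long iter_with = iter_before + iter_current;
    // chunk ends in lower-triangle coordinates
    unsigned long long lbo = (isqrt_demo(1 + 8 * iter_before) + 1) / 2 - 1;
    unsigned long long lbi = iter_before - (lbo + 1) * lbo / 2;
    unsigned long long ubo = (isqrt_demo(1 + 8 * iter_with) + 1) / 2 - 1;
    unsigned long long ubi = iter_with - (ubo + 1) * ubo / 2;
    if (ubi == 0) {
      ubo -= 1;
      ubi = ubo;
    } else {
      ubi -= 1;
    }
    // mirror into the upper triangle, swapping the chunk ends
    printf("tid %llu: from {%llu,%llu} to {%llu,%llu}\n", tid,
           (outer_iters - 1) - ubo, (outer_iters - 1) - ubi,
           (outer_iters - 1) - lbo, (outer_iters - 1) - lbi);
  }
  return 0;
}
#endif
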
//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated). Internally API will expand the space
// to parallelogram/parallelepiped, calculate total, calculate bounds for the
// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
// important on the left side, to hit the lower bounds and not step over), and
// pick the correct chunk for this thread (so it will calculate chunks up to
// the needed one). It could be optimized to calculate just this chunk,
// potentially at the cost of a somewhat less even distribution among threads.
// It is designed to make sure that threads will receive predictable chunks,
// deterministically (so that the next nest of loops with similar
// characteristics will get exactly the same chunks on the same threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0,
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
// future).
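//
// A typical call sequence, as a sketch (the caller is normally
// compiler-generated code; "loc" and "gtid" are the usual ident/thread
// arguments and are not defined here):
//   1. Fill original_bounds_nest[0..n-1] from the source loop nest.
//   2. if (__kmpc_for_collapsed_init(loc, gtid, original_bounds_nest,
//                                    chunk_bounds_nest, n, &plastiter))
//   3.   execute the iterations from the start point given by the lb0_u64
//        values to the end point given by the ub0_u64 values, inclusive, in
//        the original iteration order (lb1/ub1 are 0 under the current
//        contract).
//   4. plastiter is set to TRUE for the thread that owns the last chunk
//      (e.g. for lastprivate handling).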
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // Loop won't execute:
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  // Handle special cases
  nested_loop_type_t loop_type =
      kmp_identify_nested_loop_structure(original_bounds_nest, n);
  if (loop_type == nested_loop_type_lower_triangular_matrix) {
    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  }

  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // Loop won't execute:
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test
  // (2) without it the current contract holds: chunk_bounds_nest has only
  // lb0 and ub0, lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //  // One thread:
  //  // Copy all info from original_bounds_nest, it'll be good enough.

  //  for (kmp_index_t i = 0; i < n; ++i) {
  //    chunk_bounds_nest[i] = original_bounds_nest[i];
  //  }

  //  if (plastiter != NULL) {
  //    *plastiter = TRUE;
  //  }
  //  return TRUE;
  //}

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  for (; nth > 0;) {
    // We could calculate the chunk size once, but this recalculation
    // compensates for the original space not being a parallelepiped, which
    // can leave some threads without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }
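
    // e.g. total_left = 10 and nth = 3 give chunk_size = 3 and
    // remainder = 1, so this thread takes curr_chunk_size = 4 and the
    // remaining threads split the other 6 iterations on later passes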

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was the last chunk, but no chunk for the current
      // thread was found:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // no more loop iterations left to process,
        // this means that the chunk just found is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

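    // This chunk belongs to a thread with a smaller tid; renumber so that
    // the next pass computes the next chunk for the next thread (the current
    // thread becomes relative tid - 1 among the remaining nth - 1 threads):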
    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // no more loop iterations to process,
      // the previous chunk was the last chunk
      break;
    }

    // original_ivs_start now immediately follows the previous chunk's
    // original_ivs_end, and the new chunk starts here, so chunks follow one
    // another without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  return FALSE;
}