GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.cpp
1
/*
2
* kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3
*/
4
5
//===----------------------------------------------------------------------===//
6
//
7
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8
// See https://llvm.org/LICENSE.txt for license information.
9
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10
//
11
//===----------------------------------------------------------------------===//
12
13
/* Dynamic scheduling initialization and dispatch.
14
*
15
* NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16
* it may change values between parallel regions. __kmp_max_nth
17
* is the largest value __kmp_nth may take, 1 is the smallest.
18
*/
19
20
#include "kmp.h"
21
#include "kmp_error.h"
22
#include "kmp_i18n.h"
23
#include "kmp_itt.h"
24
#include "kmp_stats.h"
25
#include "kmp_str.h"
26
#if KMP_USE_X87CONTROL
27
#include <float.h>
28
#endif
29
#include "kmp_lock.h"
30
#include "kmp_dispatch.h"
31
#if KMP_USE_HIER_SCHED
32
#include "kmp_dispatch_hier.h"
33
#endif
34
35
#if OMPT_SUPPORT
36
#include "ompt-specific.h"
37
#endif
38
39
/* ------------------------------------------------------------------------ */
40
/* ------------------------------------------------------------------------ */
41
42
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43
kmp_info_t *th;
44
45
KMP_DEBUG_ASSERT(gtid_ref);
46
47
if (__kmp_env_consistency_check) {
48
th = __kmp_threads[*gtid_ref];
49
if (th->th.th_root->r.r_active &&
50
(th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51
#if KMP_USE_DYNAMIC_LOCK
52
__kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53
#else
54
__kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55
#endif
56
}
57
}
58
}
59
60
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61
kmp_info_t *th;
62
63
if (__kmp_env_consistency_check) {
64
th = __kmp_threads[*gtid_ref];
65
if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66
__kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67
}
68
}
69
}
70
71
// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73
bool use_hier = false) {
74
// Pick up the nonmonotonic/monotonic bits from the scheduling type
75
// Nonmonotonic is the default for dynamic schedules when no modifier is specified
76
int monotonicity = SCHEDULE_NONMONOTONIC;
77
78
// Let default be monotonic for executables
79
// compiled with OpenMP* 4.5 or earlier compilers
80
if (loc != NULL && loc->get_openmp_version() < 50)
81
monotonicity = SCHEDULE_MONOTONIC;
82
83
if (use_hier || __kmp_force_monotonic)
84
monotonicity = SCHEDULE_MONOTONIC;
85
else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86
monotonicity = SCHEDULE_NONMONOTONIC;
87
else if (SCHEDULE_HAS_MONOTONIC(schedule))
88
monotonicity = SCHEDULE_MONOTONIC;
89
90
return monotonicity;
91
}
92
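// For example (illustrative, derived from the logic above): with OpenMP 5.0+
// code and no schedule modifier,
//   __kmp_get_monotonicity(loc, kmp_sch_dynamic_chunked)  -> SCHEDULE_NONMONOTONIC
// while an explicit monotonic modifier (kmp_sch_modifier_monotonic bit set),
// a hierarchical schedule (use_hier), __kmp_force_monotonic, or code compiled
// as OpenMP 4.5 or earlier yields SCHEDULE_MONOTONIC.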
93
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
94
// Return floating point number rounded to two decimal places
95
static inline float __kmp_round_2decimal_val(float num) {
96
return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
97
}
98
static inline int __kmp_get_round_val(float num) {
99
return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
100
}
101
#endif
102
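// Examples (illustrative): __kmp_get_round_val(2.5f) == 3,
// __kmp_get_round_val(-2.5f) == -3, and
// __kmp_round_2decimal_val(1.666f) == 1.67f (nearest hundredth).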
103
template <typename T>
104
inline void
105
__kmp_initialize_self_buffer(kmp_team_t *team, T id,
106
dispatch_private_info_template<T> *pr,
107
typename traits_t<T>::unsigned_t nchunks, T nproc,
108
typename traits_t<T>::unsigned_t &init,
109
T &small_chunk, T &extras, T &p_extra) {
110
111
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
112
if (pr->flags.use_hybrid) {
113
kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
114
kmp_hw_core_type_t type =
115
(kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
116
T pchunks = pr->u.p.pchunks;
117
T echunks = nchunks - pchunks;
118
T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
119
T num_procs_with_ecore = nproc - num_procs_with_pcore;
120
T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
121
T big_chunk =
122
pchunks / num_procs_with_pcore; // chunks per thread with p-core
123
small_chunk =
124
echunks / num_procs_with_ecore; // chunks per thread with e-core
125
126
extras =
127
(pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
128
129
p_extra = (big_chunk - small_chunk);
130
131
if (type == KMP_HW_CORE_TYPE_CORE) {
132
if (id < first_thread_with_ecore) {
133
init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
134
} else {
135
init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
136
(id < extras ? id : extras);
137
}
138
} else {
139
if (id == first_thread_with_ecore) {
140
init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
141
} else {
142
init = id * small_chunk + first_thread_with_ecore * p_extra +
143
(id < extras ? id : extras);
144
}
145
}
146
p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
147
return;
148
}
149
#endif
150
151
small_chunk = nchunks / nproc; // chunks per thread
152
extras = nchunks % nproc;
153
p_extra = 0;
154
init = id * small_chunk + (id < extras ? id : extras);
155
}
156
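// Worked example (illustrative) for the non-hybrid path: nchunks = 10 and
// nproc = 4 give small_chunk = 2 and extras = 2, so the per-thread starting
// chunk indices are
//   id 0: init = 0   (owns chunks 0-2, one extra)
//   id 1: init = 3   (owns chunks 3-5, one extra)
//   id 2: init = 6   (owns chunks 6-7)
//   id 3: init = 8   (owns chunks 8-9)
// matching init = id * small_chunk + min(id, extras).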
157
#if KMP_STATIC_STEAL_ENABLED
158
enum { // values for steal_flag (possible states of private per-loop buffer)
159
UNUSED = 0,
160
CLAIMED = 1, // owner thread started initialization
161
READY = 2, // available for stealing
162
THIEF = 3 // finished by owner, or claimed by thief
163
// possible state changes:
164
// 0 -> 1 owner only, sync
165
// 0 -> 3 thief only, sync
166
// 1 -> 2 owner only, async
167
// 2 -> 3 owner only, async
168
// 3 -> 2 owner only, async
169
// 3 -> 0 last thread finishing the loop, async
170
};
171
#endif
172
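// Sketch (illustrative, mirroring the uses below): the owner and any thief
// race for an UNUSED buffer with one compare-and-swap each, e.g.
//   kmp_uint32 old = UNUSED;
//   pr->steal_flag.compare_exchange_strong(old, CLAIMED); // owner claims buffer
//   // ... a thief instead tries compare_exchange_strong(old, THIEF) and, on
//   // success, takes the owner's whole initial range of chunks.
// The remaining (async) transitions are performed with plain release stores
// (KMP_ATOMIC_ST_REL) since each has a single writer.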
173
// Initialize a dispatch_private_info_template<T> buffer for a particular
174
// type of schedule,chunk. The loop description is found in lb (lower bound),
175
// ub (upper bound), and st (stride). nproc is the number of threads relevant
176
// to the scheduling (often the number of threads in a team, but not always if
177
// hierarchical scheduling is used). tid is the id of the thread calling
178
// the function within the group of nproc threads. It will have a value
179
// between 0 and nproc - 1. This is often just the thread id within a team, but
180
// is not necessarily the case when using hierarchical scheduling.
181
// loc is the source file location of the corresponding loop
182
// gtid is the global thread id
183
template <typename T>
184
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
185
dispatch_private_info_template<T> *pr,
186
enum sched_type schedule, T lb, T ub,
187
typename traits_t<T>::signed_t st,
188
#if USE_ITT_BUILD
189
kmp_uint64 *cur_chunk,
190
#endif
191
typename traits_t<T>::signed_t chunk,
192
T nproc, T tid) {
193
typedef typename traits_t<T>::unsigned_t UT;
194
typedef typename traits_t<T>::floating_t DBL;
195
196
int active;
197
T tc;
198
kmp_info_t *th;
199
kmp_team_t *team;
200
int monotonicity;
201
bool use_hier;
202
203
#ifdef KMP_DEBUG
204
typedef typename traits_t<T>::signed_t ST;
205
{
206
char *buff;
207
// create format specifiers before the debug output
208
buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
209
"pr:%%p lb:%%%s ub:%%%s st:%%%s "
210
"schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
211
traits_t<T>::spec, traits_t<T>::spec,
212
traits_t<ST>::spec, traits_t<ST>::spec,
213
traits_t<T>::spec, traits_t<T>::spec);
214
KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
215
__kmp_str_free(&buff);
216
}
217
#endif
218
/* setup data */
219
th = __kmp_threads[gtid];
220
team = th->th.th_team;
221
active = !team->t.t_serialized;
222
223
#if USE_ITT_BUILD
224
int itt_need_metadata_reporting =
225
__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
226
KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
227
team->t.t_active_level == 1;
228
#endif
229
230
#if KMP_USE_HIER_SCHED
231
use_hier = pr->flags.use_hier;
232
#else
233
use_hier = false;
234
#endif
235
236
/* Pick up the nonmonotonic/monotonic bits from the scheduling type */
237
monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
238
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
239
240
/* Pick up the nomerge/ordered bits from the scheduling type */
241
if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
242
pr->flags.nomerge = TRUE;
243
schedule =
244
(enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
245
} else {
246
pr->flags.nomerge = FALSE;
247
}
248
pr->type_size = traits_t<T>::type_size; // remember the size of variables
249
if (kmp_ord_lower & schedule) {
250
pr->flags.ordered = TRUE;
251
schedule =
252
(enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
253
} else {
254
pr->flags.ordered = FALSE;
255
}
256
// Ordered overrides nonmonotonic
257
if (pr->flags.ordered) {
258
monotonicity = SCHEDULE_MONOTONIC;
259
}
260
261
if (schedule == kmp_sch_static) {
262
schedule = __kmp_static;
263
} else {
264
if (schedule == kmp_sch_runtime) {
265
// Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
266
// not specified)
267
schedule = team->t.t_sched.r_sched_type;
268
monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
269
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
270
if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
271
monotonicity = SCHEDULE_MONOTONIC;
272
// Detail the schedule if needed (global controls are differentiated
273
// appropriately)
274
if (schedule == kmp_sch_guided_chunked) {
275
schedule = __kmp_guided;
276
} else if (schedule == kmp_sch_static) {
277
schedule = __kmp_static;
278
}
279
// Use the chunk size specified by OMP_SCHEDULE (or default if not
280
// specified)
281
chunk = team->t.t_sched.chunk;
282
#if USE_ITT_BUILD
283
if (cur_chunk)
284
*cur_chunk = chunk;
285
#endif
286
#ifdef KMP_DEBUG
287
{
288
char *buff;
289
// create format specifiers before the debug output
290
buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
291
"schedule:%%d chunk:%%%s\n",
292
traits_t<ST>::spec);
293
KD_TRACE(10, (buff, gtid, schedule, chunk));
294
__kmp_str_free(&buff);
295
}
296
#endif
297
} else {
298
if (schedule == kmp_sch_guided_chunked) {
299
schedule = __kmp_guided;
300
}
301
if (chunk <= 0) {
302
chunk = KMP_DEFAULT_CHUNK;
303
}
304
}
305
306
if (schedule == kmp_sch_auto) {
307
// mapping and differentiation: in the __kmp_do_serial_initialize()
308
schedule = __kmp_auto;
309
#ifdef KMP_DEBUG
310
{
311
char *buff;
312
// create format specifiers before the debug output
313
buff = __kmp_str_format(
314
"__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
315
"schedule:%%d chunk:%%%s\n",
316
traits_t<ST>::spec);
317
KD_TRACE(10, (buff, gtid, schedule, chunk));
318
__kmp_str_free(&buff);
319
}
320
#endif
321
}
322
#if KMP_STATIC_STEAL_ENABLED
323
// map nonmonotonic:dynamic to static steal
324
if (schedule == kmp_sch_dynamic_chunked) {
325
if (monotonicity == SCHEDULE_NONMONOTONIC)
326
schedule = kmp_sch_static_steal;
327
}
328
#endif
329
/* guided analytical not safe for too many threads */
330
if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
331
schedule = kmp_sch_guided_iterative_chunked;
332
KMP_WARNING(DispatchManyThreads);
333
}
334
if (schedule == kmp_sch_runtime_simd) {
335
// compiler provides simd_width in the chunk parameter
336
schedule = team->t.t_sched.r_sched_type;
337
monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
338
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
339
// Detail the schedule if needed (global controls are differentiated
340
// appropriately)
341
if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
342
schedule == __kmp_static) {
343
schedule = kmp_sch_static_balanced_chunked;
344
} else {
345
if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
346
schedule = kmp_sch_guided_simd;
347
}
348
chunk = team->t.t_sched.chunk * chunk;
349
}
350
#if USE_ITT_BUILD
351
if (cur_chunk)
352
*cur_chunk = chunk;
353
#endif
354
#ifdef KMP_DEBUG
355
{
356
char *buff;
357
// create format specifiers before the debug output
358
buff = __kmp_str_format(
359
"__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
360
" chunk:%%%s\n",
361
traits_t<ST>::spec);
362
KD_TRACE(10, (buff, gtid, schedule, chunk));
363
__kmp_str_free(&buff);
364
}
365
#endif
366
}
367
pr->u.p.parm1 = chunk;
368
}
369
KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
370
"unknown scheduling type");
371
372
pr->u.p.count = 0;
373
374
if (__kmp_env_consistency_check) {
375
if (st == 0) {
376
__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
377
(pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
378
}
379
}
380
// compute trip count
381
if (st == 1) { // most common case
382
if (ub >= lb) {
383
tc = ub - lb + 1;
384
} else { // ub < lb
385
tc = 0; // zero-trip
386
}
387
} else if (st < 0) {
388
if (lb >= ub) {
389
// AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
390
// where the division needs to be unsigned regardless of the result type
391
tc = (UT)(lb - ub) / (-st) + 1;
392
} else { // lb < ub
393
tc = 0; // zero-trip
394
}
395
} else { // st > 0
396
if (ub >= lb) {
397
// AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
398
// where the division needs to be unsigned regardless of the result type
399
tc = (UT)(ub - lb) / st + 1;
400
} else { // ub < lb
401
tc = 0; // zero-trip
402
}
403
}
404
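// Worked examples (illustrative) of the trip count computed above:
//   st = 1,  lb = 0,  ub = 9  ->  tc = 9 - 0 + 1        = 10
//   st = 3,  lb = 0,  ub = 9  ->  tc = (9 - 0) / 3 + 1  = 4   (0,3,6,9)
//   st = -2, lb = 10, ub = 1  ->  tc = (10 - 1) / 2 + 1 = 5   (10,8,6,4,2)
// The unsigned cast keeps the division correct even when ub - lb overflows
// the signed type.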
405
#if KMP_STATS_ENABLED
406
if (KMP_MASTER_GTID(gtid)) {
407
KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
408
}
409
#endif
410
411
pr->u.p.lb = lb;
412
pr->u.p.ub = ub;
413
pr->u.p.st = st;
414
pr->u.p.tc = tc;
415
416
#if KMP_OS_WINDOWS
417
pr->u.p.last_upper = ub + st;
418
#endif /* KMP_OS_WINDOWS */
419
420
/* NOTE: only active parallel regions have active ordered sections */
421
422
if (active) {
423
if (pr->flags.ordered) {
424
pr->ordered_bumped = 0;
425
pr->u.p.ordered_lower = 1;
426
pr->u.p.ordered_upper = 0;
427
}
428
}
429
430
switch (schedule) {
431
#if KMP_STATIC_STEAL_ENABLED
432
case kmp_sch_static_steal: {
433
T ntc, init = 0;
434
435
KD_TRACE(100,
436
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
437
gtid));
438
439
ntc = (tc % chunk ? 1 : 0) + tc / chunk;
440
if (nproc > 1 && ntc >= nproc) {
441
KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
442
T id = tid;
443
T small_chunk, extras, p_extra = 0;
444
kmp_uint32 old = UNUSED;
445
int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
446
if (traits_t<T>::type_size > 4) {
447
// AC: TODO: check if 16-byte CAS available and use it to
448
// improve performance (probably wait for explicit request
449
// before spending time on this).
450
// For now use dynamically allocated per-private-buffer lock,
451
// free memory in __kmp_dispatch_next when status==0.
452
pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
453
__kmp_init_lock(pr->u.p.steal_lock);
454
}
455
456
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
457
// Iterations are divided in a 60/40 skewed distribution between CORE and
458
// ATOM processors for hybrid systems
459
bool use_hybrid = false;
460
kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
461
T first_thread_with_ecore = 0;
462
T num_procs_with_pcore = 0;
463
T num_procs_with_ecore = 0;
464
T p_ntc = 0, e_ntc = 0;
465
if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
466
__kmp_affinity.type != affinity_explicit) {
467
use_hybrid = true;
468
core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
469
if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
470
__kmp_first_osid_with_ecore > -1) {
471
for (int i = 0; i < team->t.t_nproc; ++i) {
472
kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
473
->th.th_topology_attrs.core_type;
474
int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
475
if (id == __kmp_first_osid_with_ecore) {
476
first_thread_with_ecore =
477
team->t.t_threads[i]->th.th_info.ds.ds_tid;
478
}
479
if (type == KMP_HW_CORE_TYPE_CORE) {
480
num_procs_with_pcore++;
481
} else if (type == KMP_HW_CORE_TYPE_ATOM) {
482
num_procs_with_ecore++;
483
} else {
484
use_hybrid = false;
485
break;
486
}
487
}
488
}
489
if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
490
float multiplier = 60.0 / 40.0;
491
float p_ratio = (float)num_procs_with_pcore / nproc;
492
float e_ratio = (float)num_procs_with_ecore / nproc;
493
float e_multiplier =
494
(float)1 /
495
(((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
496
float p_multiplier = multiplier * e_multiplier;
497
p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
498
if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
499
e_ntc =
500
(int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
501
else
502
e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
503
KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
504
505
// Use regular static steal if not enough chunks for skewed
506
// distribution
507
use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
508
e_ntc >= num_procs_with_ecore)
509
? true
510
: false);
511
} else {
512
use_hybrid = false;
513
}
514
}
515
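// Worked example (illustrative): ntc = 100 chunks on 4 P-cores + 4 E-cores
// (nproc = 8) gives multiplier = 1.5, p_ratio = e_ratio = 0.5,
// e_multiplier = 1 / (1.5 * 4 / 8 + 0.5) = 0.8, p_multiplier = 1.2, so
// p_ntc = round(100 * 0.5 * 1.2) = 60 and e_ntc = 40, i.e. the 60/40 split.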
pr->flags.use_hybrid = use_hybrid;
516
pr->u.p.pchunks = p_ntc;
517
pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
518
pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
519
520
if (use_hybrid) {
521
KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
522
T big_chunk = p_ntc / num_procs_with_pcore;
523
small_chunk = e_ntc / num_procs_with_ecore;
524
525
extras =
526
(p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
527
528
p_extra = (big_chunk - small_chunk);
529
530
if (core_type == KMP_HW_CORE_TYPE_CORE) {
531
if (id < first_thread_with_ecore) {
532
init =
533
id * small_chunk + id * p_extra + (id < extras ? id : extras);
534
} else {
535
init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
536
(id < extras ? id : extras);
537
}
538
} else {
539
if (id == first_thread_with_ecore) {
540
init =
541
id * small_chunk + id * p_extra + (id < extras ? id : extras);
542
} else {
543
init = id * small_chunk + first_thread_with_ecore * p_extra +
544
(id < extras ? id : extras);
545
}
546
}
547
p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
548
} else
549
#endif
550
{
551
small_chunk = ntc / nproc;
552
extras = ntc % nproc;
553
init = id * small_chunk + (id < extras ? id : extras);
554
p_extra = 0;
555
}
556
pr->u.p.count = init;
557
if (claimed) { // did we succeed in claiming our own buffer?
558
pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
559
// Other threads will inspect steal_flag when searching for a victim.
560
// READY means other threads may steal from this thread from now on.
561
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
562
} else {
563
// another thread has stolen our whole range
564
KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
565
pr->u.p.ub = init; // mark that there are no iterations to work on
566
}
567
pr->u.p.parm2 = ntc; // save number of chunks
568
// parm3 is the number of times to attempt stealing which is
569
// nproc (just a heuristic; could be optimized later on).
570
pr->u.p.parm3 = nproc;
571
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
572
break;
573
} else {
574
/* too few chunks: switching to kmp_sch_dynamic_chunked */
575
schedule = kmp_sch_dynamic_chunked;
576
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
577
"kmp_sch_dynamic_chunked\n",
578
gtid));
579
goto dynamic_init;
580
break;
581
} // if
582
} // case
583
#endif
584
case kmp_sch_static_balanced: {
585
T init, limit;
586
587
KD_TRACE(
588
100,
589
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
590
gtid));
591
592
if (nproc > 1) {
593
T id = tid;
594
595
if (tc < nproc) {
596
if (id < tc) {
597
init = id;
598
limit = id;
599
pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
600
} else {
601
pr->u.p.count = 1; /* means no more chunks to execute */
602
pr->u.p.parm1 = FALSE;
603
break;
604
}
605
} else {
606
T small_chunk = tc / nproc;
607
T extras = tc % nproc;
608
init = id * small_chunk + (id < extras ? id : extras);
609
limit = init + small_chunk - (id < extras ? 0 : 1);
610
pr->u.p.parm1 = (id == nproc - 1);
611
}
612
} else {
613
if (tc > 0) {
614
init = 0;
615
limit = tc - 1;
616
pr->u.p.parm1 = TRUE;
617
} else {
618
// zero trip count
619
pr->u.p.count = 1; /* means no more chunks to execute */
620
pr->u.p.parm1 = FALSE;
621
break;
622
}
623
}
624
#if USE_ITT_BUILD
625
// Calculate chunk for metadata report
626
if (itt_need_metadata_reporting)
627
if (cur_chunk)
628
*cur_chunk = limit - init + 1;
629
#endif
630
if (st == 1) {
631
pr->u.p.lb = lb + init;
632
pr->u.p.ub = lb + limit;
633
} else {
634
// calculated upper bound, "ub" is user-defined upper bound
635
T ub_tmp = lb + limit * st;
636
pr->u.p.lb = lb + init * st;
637
// adjust upper bound to "ub" if needed, so that MS lastprivate will match
638
// it exactly
639
if (st > 0) {
640
pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
641
} else {
642
pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
643
}
644
}
645
if (pr->flags.ordered) {
646
pr->u.p.ordered_lower = init;
647
pr->u.p.ordered_upper = limit;
648
}
649
break;
650
} // case
651
case kmp_sch_static_balanced_chunked: {
652
// similar to balanced, but chunk adjusted to multiple of simd width
653
T nth = nproc;
654
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
655
" -> falling-through to static_greedy\n",
656
gtid));
657
schedule = kmp_sch_static_greedy;
658
if (nth > 1)
659
pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
660
else
661
pr->u.p.parm1 = tc;
662
break;
663
} // case
664
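// Worked example (illustrative) for the rounding in kmp_sch_static_balanced_chunked
// above: tc = 1000, nth = 8 and chunk (simd width) = 8 give (1000 + 7) / 8 = 125,
// rounded up to the next multiple of the power-of-two simd width:
// (125 + 7) & ~7 = 128 iterations per thread.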
case kmp_sch_guided_simd:
665
case kmp_sch_guided_iterative_chunked: {
666
KD_TRACE(
667
100,
668
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
669
" case\n",
670
gtid));
671
672
if (nproc > 1) {
673
if ((2L * chunk + 1) * nproc >= tc) {
674
/* chunk size too large, switch to dynamic */
675
schedule = kmp_sch_dynamic_chunked;
676
goto dynamic_init;
677
} else {
678
// when remaining iterations drop below parm2, switch to dynamic
679
pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
680
*(double *)&pr->u.p.parm3 =
681
guided_flt_param / (double)nproc; // may occupy parm3 and parm4
682
}
683
} else {
684
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
685
"kmp_sch_static_greedy\n",
686
gtid));
687
schedule = kmp_sch_static_greedy;
688
/* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
689
KD_TRACE(
690
100,
691
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
692
gtid));
693
pr->u.p.parm1 = tc;
694
} // if
695
} // case
696
break;
697
case kmp_sch_guided_analytical_chunked: {
698
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
699
"kmp_sch_guided_analytical_chunked case\n",
700
gtid));
701
702
if (nproc > 1) {
703
if ((2L * chunk + 1) * nproc >= tc) {
704
/* chunk size too large, switch to dynamic */
705
schedule = kmp_sch_dynamic_chunked;
706
goto dynamic_init;
707
} else {
708
/* commonly used term: (2 nproc - 1)/(2 nproc) */
709
DBL x;
710
711
#if KMP_USE_X87CONTROL
712
/* Linux* OS already has 64-bit computation by default for long double,
713
and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
714
Windows* OS on IA-32 architecture, we need to set precision to 64-bit
715
instead of the default 53-bit. Even though long double doesn't work
716
on Windows* OS on Intel(R) 64, the resulting lack of precision is not
717
expected to impact the correctness of the algorithm, but this has not
718
been mathematically proven. */
719
// save original FPCW and set precision to 64-bit, as
720
// Windows* OS on IA-32 architecture defaults to 53-bit
721
unsigned int oldFpcw = _control87(0, 0);
722
_control87(_PC_64, _MCW_PC); // 0,0x30000
723
#endif
724
/* value used for comparison in solver for cross-over point */
725
KMP_ASSERT(tc > 0);
726
long double target = ((long double)chunk * 2 + 1) * nproc / tc;
727
728
/* crossover point--chunk indexes equal to or greater than
729
this point switch to dynamic-style scheduling */
730
UT cross;
731
732
/* commonly used term: (2 nproc - 1)/(2 nproc) */
733
x = 1.0 - 0.5 / (double)nproc;
734
735
#ifdef KMP_DEBUG
736
{ // test natural alignment
737
struct _test_a {
738
char a;
739
union {
740
char b;
741
DBL d;
742
};
743
} t;
744
ptrdiff_t natural_alignment =
745
(ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
746
//__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
747
// long)natural_alignment );
748
KMP_DEBUG_ASSERT(
749
(((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
750
}
751
#endif // KMP_DEBUG
752
753
/* save the term in thread private dispatch structure */
754
*(DBL *)&pr->u.p.parm3 = x;
755
756
/* solve for the crossover point to the nearest integer i for which C_i
757
<= chunk */
758
{
759
UT left, right, mid;
760
long double p;
761
762
/* estimate initial upper and lower bound */
763
764
/* doesn't matter what value right is as long as it is positive, but
765
it affects performance of the solver */
766
right = 229;
767
p = __kmp_pow<UT>(x, right);
768
if (p > target) {
769
do {
770
p *= p;
771
right <<= 1;
772
} while (p > target && right < (1 << 27));
773
/* lower bound is previous (failed) estimate of upper bound */
774
left = right >> 1;
775
} else {
776
left = 0;
777
}
778
779
/* bisection root-finding method */
780
while (left + 1 < right) {
781
mid = (left + right) / 2;
782
if (__kmp_pow<UT>(x, mid) > target) {
783
left = mid;
784
} else {
785
right = mid;
786
}
787
} // while
788
cross = right;
789
}
790
/* assert sanity of computed crossover point */
791
KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
792
__kmp_pow<UT>(x, cross) <= target);
793
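/* Worked example (illustrative): nproc = 4, chunk = 10, tc = 10000 give
   x = 1 - 0.5/4 = 0.875 and target = (2*10 + 1) * 4 / 10000 = 0.0084; the
   bisection settles on cross = 36, since 0.875^35 ~ 0.0093 > 0.0084 while
   0.875^36 ~ 0.0082 <= 0.0084. */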
794
/* save the crossover point in thread private dispatch structure */
795
pr->u.p.parm2 = cross;
796
797
// C75803
798
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
799
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
800
#else
801
#define GUIDED_ANALYTICAL_WORKAROUND (x)
802
#endif
803
/* dynamic-style scheduling offset */
804
pr->u.p.count = tc -
805
__kmp_dispatch_guided_remaining(
806
tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
807
cross * chunk;
808
#if KMP_USE_X87CONTROL
809
// restore FPCW
810
_control87(oldFpcw, _MCW_PC);
811
#endif
812
} // if
813
} else {
814
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
815
"kmp_sch_static_greedy\n",
816
gtid));
817
schedule = kmp_sch_static_greedy;
818
/* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
819
pr->u.p.parm1 = tc;
820
} // if
821
} // case
822
break;
823
case kmp_sch_static_greedy:
824
KD_TRACE(
825
100,
826
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
827
gtid));
828
pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
829
break;
830
case kmp_sch_static_chunked:
831
case kmp_sch_dynamic_chunked:
832
dynamic_init:
833
if (tc == 0)
834
break;
835
if (pr->u.p.parm1 <= 0)
836
pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
837
else if (pr->u.p.parm1 > tc)
838
pr->u.p.parm1 = tc;
839
// Store the total number of chunks to prevent integer overflow during
840
// bounds calculations in the get next chunk routine.
841
pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
842
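// Worked example (illustrative): tc = 10 iterations with chunk parm1 = 3
// yield parm2 = 10 / 3 + 1 = 4 chunks (sizes 3, 3, 3, 1).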
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
843
"kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
844
gtid));
845
break;
846
case kmp_sch_trapezoidal: {
847
/* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
848
849
T parm1, parm2, parm3, parm4;
850
KD_TRACE(100,
851
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
852
gtid));
853
854
parm1 = chunk;
855
856
/* F : size of the first cycle */
857
parm2 = (tc / (2 * nproc));
858
859
if (parm2 < 1) {
860
parm2 = 1;
861
}
862
863
/* L : size of the last cycle. Make sure the last cycle is not larger
864
than the first cycle. */
865
if (parm1 < 1) {
866
parm1 = 1;
867
} else if (parm1 > parm2) {
868
parm1 = parm2;
869
}
870
871
/* N : number of cycles */
872
parm3 = (parm2 + parm1);
873
parm3 = (2 * tc + parm3 - 1) / parm3;
874
875
if (parm3 < 2) {
876
parm3 = 2;
877
}
878
879
/* sigma : decreasing incr of the trapezoid */
880
parm4 = (parm3 - 1);
881
parm4 = (parm2 - parm1) / parm4;
882
883
// pointless check, because parm4 >= 0 always
884
// if ( parm4 < 0 ) {
885
// parm4 = 0;
886
//}
887
888
pr->u.p.parm1 = parm1;
889
pr->u.p.parm2 = parm2;
890
pr->u.p.parm3 = parm3;
891
pr->u.p.parm4 = parm4;
892
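/* Worked example (illustrative): tc = 1000, nproc = 4, chunk = 1 give
   parm2 (first chunk) = 1000 / 8 = 125, parm1 = 1,
   parm3 (number of chunks) = (2*1000 + 125) / 126 = 16, and
   parm4 (decrement) = (125 - 1) / 15 = 8, i.e. chunk sizes
   125, 117, 109, ... covering at least the 1000 iterations. */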
} // case
893
break;
894
895
default: {
896
__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
897
KMP_HNT(GetNewerLibrary), // Hint
898
__kmp_msg_null // Variadic argument list terminator
899
);
900
} break;
901
} // switch
902
pr->schedule = schedule;
903
}
904
905
#if KMP_USE_HIER_SCHED
906
template <typename T>
907
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
908
typename traits_t<T>::signed_t st);
909
template <>
910
inline void
911
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
912
kmp_int32 ub, kmp_int32 st) {
913
__kmp_dispatch_init_hierarchy<kmp_int32>(
914
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
915
__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
916
}
917
template <>
918
inline void
919
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
920
kmp_uint32 ub, kmp_int32 st) {
921
__kmp_dispatch_init_hierarchy<kmp_uint32>(
922
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
923
__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
924
}
925
template <>
926
inline void
927
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
928
kmp_int64 ub, kmp_int64 st) {
929
__kmp_dispatch_init_hierarchy<kmp_int64>(
930
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
931
__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
932
}
933
template <>
934
inline void
935
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
936
kmp_uint64 ub, kmp_int64 st) {
937
__kmp_dispatch_init_hierarchy<kmp_uint64>(
938
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
939
__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
940
}
941
942
// free all the hierarchy scheduling memory associated with the team
943
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
944
int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
945
for (int i = 0; i < num_disp_buff; ++i) {
946
// type does not matter here so use kmp_int32
947
auto sh =
948
reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
949
&team->t.t_disp_buffer[i]);
950
if (sh->hier) {
951
sh->hier->deallocate();
952
__kmp_free(sh->hier);
953
}
954
}
955
}
956
#endif
957
958
// UT - unsigned flavor of T, ST - signed flavor of T,
959
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
960
template <typename T>
961
static void
962
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
963
T ub, typename traits_t<T>::signed_t st,
964
typename traits_t<T>::signed_t chunk, int push_ws) {
965
typedef typename traits_t<T>::unsigned_t UT;
966
967
int active;
968
kmp_info_t *th;
969
kmp_team_t *team;
970
kmp_uint32 my_buffer_index;
971
dispatch_private_info_template<T> *pr;
972
dispatch_shared_info_template<T> volatile *sh;
973
974
KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
975
sizeof(dispatch_private_info));
976
KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
977
sizeof(dispatch_shared_info));
978
__kmp_assert_valid_gtid(gtid);
979
980
if (!TCR_4(__kmp_init_parallel))
981
__kmp_parallel_initialize();
982
983
__kmp_resume_if_soft_paused();
984
985
#if INCLUDE_SSC_MARKS
986
SSC_MARK_DISPATCH_INIT();
987
#endif
988
#ifdef KMP_DEBUG
989
typedef typename traits_t<T>::signed_t ST;
990
{
991
char *buff;
992
// create format specifiers before the debug output
993
buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
994
"chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
995
traits_t<ST>::spec, traits_t<T>::spec,
996
traits_t<T>::spec, traits_t<ST>::spec);
997
KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
998
__kmp_str_free(&buff);
999
}
1000
#endif
1001
/* setup data */
1002
th = __kmp_threads[gtid];
1003
team = th->th.th_team;
1004
active = !team->t.t_serialized;
1005
th->th.th_ident = loc;
1006
1007
// Any half-decent optimizer will remove this test when the blocks are empty
1008
// since the macros expand to nothing
1009
// when statistics are disabled.
1010
if (schedule == __kmp_static) {
1011
KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
1012
} else {
1013
KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
1014
}
1015
1016
#if KMP_USE_HIER_SCHED
1017
// Initialize the scheduling hierarchy if requested via the OMP_SCHEDULE environment variable
1018
// Hierarchical scheduling does not work with ordered, so if ordered is
1019
// detected, then revert back to threaded scheduling.
1020
bool ordered;
1021
enum sched_type my_sched = schedule;
1022
my_buffer_index = th->th.th_dispatch->th_disp_index;
1023
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1024
&th->th.th_dispatch
1025
->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1026
my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
1027
if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
1028
my_sched =
1029
(enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
1030
ordered = (kmp_ord_lower & my_sched);
1031
if (pr->flags.use_hier) {
1032
if (ordered) {
1033
KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
1034
"Disabling hierarchical scheduling.\n",
1035
gtid));
1036
pr->flags.use_hier = FALSE;
1037
}
1038
}
1039
if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
1040
// Don't use hierarchical for ordered parallel loops and don't
1041
// use the runtime hierarchy if one was specified in the program
1042
if (!ordered && !pr->flags.use_hier)
1043
__kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
1044
}
1045
#endif // KMP_USE_HIER_SCHED
1046
1047
#if USE_ITT_BUILD
1048
kmp_uint64 cur_chunk = chunk;
1049
int itt_need_metadata_reporting =
1050
__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
1051
KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
1052
team->t.t_active_level == 1;
1053
#endif
1054
if (!active) {
1055
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1056
th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1057
} else {
1058
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1059
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1060
1061
my_buffer_index = th->th.th_dispatch->th_disp_index++;
1062
1063
/* What happens when the number of threads changes? Does the buffer need resizing? */
1064
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1065
&th->th.th_dispatch
1066
->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1067
sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
1068
&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1069
KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
1070
my_buffer_index));
1071
if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
1072
KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
1073
" sh->buffer_index:%d\n",
1074
gtid, my_buffer_index, sh->buffer_index));
1075
__kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1076
__kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1077
// Note: KMP_WAIT() cannot be used here: buffer index and
1078
// my_buffer_index are *always* 32-bit integers.
1079
KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1080
"sh->buffer_index:%d\n",
1081
gtid, my_buffer_index, sh->buffer_index));
1082
}
1083
}
1084
1085
__kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
1086
#if USE_ITT_BUILD
1087
&cur_chunk,
1088
#endif
1089
chunk, (T)th->th.th_team_nproc,
1090
(T)th->th.th_info.ds.ds_tid);
1091
if (active) {
1092
if (pr->flags.ordered == 0) {
1093
th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
1094
th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
1095
} else {
1096
th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
1097
th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
1098
}
1099
th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1100
th->th.th_dispatch->th_dispatch_sh_current =
1101
CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1102
#if USE_ITT_BUILD
1103
if (pr->flags.ordered) {
1104
__kmp_itt_ordered_init(gtid);
1105
}
1106
// Report loop metadata
1107
if (itt_need_metadata_reporting) {
1108
// Only report metadata by primary thread of active team at level 1
1109
kmp_uint64 schedtype = 0;
1110
switch (schedule) {
1111
case kmp_sch_static_chunked:
1112
case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1113
break;
1114
case kmp_sch_static_greedy:
1115
cur_chunk = pr->u.p.parm1;
1116
break;
1117
case kmp_sch_dynamic_chunked:
1118
schedtype = 1;
1119
break;
1120
case kmp_sch_guided_iterative_chunked:
1121
case kmp_sch_guided_analytical_chunked:
1122
case kmp_sch_guided_simd:
1123
schedtype = 2;
1124
break;
1125
default:
1126
// Should we put this case under "static"?
1127
// case kmp_sch_static_steal:
1128
schedtype = 3;
1129
break;
1130
}
1131
__kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
1132
}
1133
#if KMP_USE_HIER_SCHED
1134
if (pr->flags.use_hier) {
1135
pr->u.p.count = 0;
1136
pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
1137
}
1138
#endif // KMP_USER_HIER_SCHED
1139
#endif /* USE_ITT_BUILD */
1140
}
1141
1142
#ifdef KMP_DEBUG
1143
{
1144
char *buff;
1145
// create format specifiers before the debug output
1146
buff = __kmp_str_format(
1147
"__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1148
"lb:%%%s ub:%%%s"
1149
" st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1150
" parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1151
traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1152
traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1153
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1154
traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1155
KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
1156
pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1157
pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1158
pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
1159
__kmp_str_free(&buff);
1160
}
1161
#endif
1162
#if OMPT_SUPPORT && OMPT_OPTIONAL
1163
if (ompt_enabled.ompt_callback_work) {
1164
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1165
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1166
ompt_callbacks.ompt_callback(ompt_callback_work)(
1167
ompt_get_work_schedule(pr->schedule), ompt_scope_begin,
1168
&(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,
1169
OMPT_LOAD_RETURN_ADDRESS(gtid));
1170
}
1171
#endif
1172
KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1173
}
1174
1175
/* For ordered loops, either __kmp_dispatch_finish() should be called after
1176
* every iteration, or __kmp_dispatch_finish_chunk() should be called after
1177
* every chunk of iterations. If the ordered section(s) were not executed
1178
* for this iteration (or every iteration in this chunk), we need to set the
1179
* ordered iteration counters so that the next thread can proceed. */
1180
template <typename UT>
1181
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1182
typedef typename traits_t<UT>::signed_t ST;
1183
__kmp_assert_valid_gtid(gtid);
1184
kmp_info_t *th = __kmp_threads[gtid];
1185
1186
KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1187
if (!th->th.th_team->t.t_serialized) {
1188
1189
dispatch_private_info_template<UT> *pr =
1190
reinterpret_cast<dispatch_private_info_template<UT> *>(
1191
th->th.th_dispatch->th_dispatch_pr_current);
1192
dispatch_shared_info_template<UT> volatile *sh =
1193
reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1194
th->th.th_dispatch->th_dispatch_sh_current);
1195
KMP_DEBUG_ASSERT(pr);
1196
KMP_DEBUG_ASSERT(sh);
1197
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1198
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1199
1200
if (pr->ordered_bumped) {
1201
KD_TRACE(
1202
1000,
1203
("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1204
gtid));
1205
pr->ordered_bumped = 0;
1206
} else {
1207
UT lower = pr->u.p.ordered_lower;
1208
1209
#ifdef KMP_DEBUG
1210
{
1211
char *buff;
1212
// create format specifiers before the debug output
1213
buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1214
"ordered_iteration:%%%s lower:%%%s\n",
1215
traits_t<UT>::spec, traits_t<UT>::spec);
1216
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1217
__kmp_str_free(&buff);
1218
}
1219
#endif
1220
1221
__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1222
__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1223
KMP_MB(); /* is this necessary? */
1224
#ifdef KMP_DEBUG
1225
{
1226
char *buff;
1227
// create format specifiers before the debug output
1228
buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1229
"ordered_iteration:%%%s lower:%%%s\n",
1230
traits_t<UT>::spec, traits_t<UT>::spec);
1231
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1232
__kmp_str_free(&buff);
1233
}
1234
#endif
1235
1236
test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1237
} // if
1238
} // if
1239
KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1240
}
1241
1242
#ifdef KMP_GOMP_COMPAT
1243
1244
template <typename UT>
1245
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1246
typedef typename traits_t<UT>::signed_t ST;
1247
__kmp_assert_valid_gtid(gtid);
1248
kmp_info_t *th = __kmp_threads[gtid];
1249
1250
KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1251
if (!th->th.th_team->t.t_serialized) {
1252
dispatch_private_info_template<UT> *pr =
1253
reinterpret_cast<dispatch_private_info_template<UT> *>(
1254
th->th.th_dispatch->th_dispatch_pr_current);
1255
dispatch_shared_info_template<UT> volatile *sh =
1256
reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1257
th->th.th_dispatch->th_dispatch_sh_current);
1258
KMP_DEBUG_ASSERT(pr);
1259
KMP_DEBUG_ASSERT(sh);
1260
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1261
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1262
1263
UT lower = pr->u.p.ordered_lower;
1264
UT upper = pr->u.p.ordered_upper;
1265
UT inc = upper - lower + 1;
1266
1267
if (pr->ordered_bumped == inc) {
1268
KD_TRACE(
1269
1000,
1270
("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1271
gtid));
1272
pr->ordered_bumped = 0;
1273
} else {
1274
inc -= pr->ordered_bumped;
1275
1276
#ifdef KMP_DEBUG
1277
{
1278
char *buff;
1279
// create format specifiers before the debug output
1280
buff = __kmp_str_format(
1281
"__kmp_dispatch_finish_chunk: T#%%d before wait: "
1282
"ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1283
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1284
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1285
__kmp_str_free(&buff);
1286
}
1287
#endif
1288
1289
__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1290
__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1291
1292
KMP_MB(); /* is this necessary? */
1293
KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1294
"ordered_bumped to zero\n",
1295
gtid));
1296
pr->ordered_bumped = 0;
1297
//!!!!! TODO check if the inc should be unsigned, or signed???
1298
#ifdef KMP_DEBUG
1299
{
1300
char *buff;
1301
// create format specifiers before the debug output
1302
buff = __kmp_str_format(
1303
"__kmp_dispatch_finish_chunk: T#%%d after wait: "
1304
"ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1305
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1306
traits_t<UT>::spec);
1307
KD_TRACE(1000,
1308
(buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1309
__kmp_str_free(&buff);
1310
}
1311
#endif
1312
1313
test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1314
}
1315
// }
1316
}
1317
KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1318
}
1319
1320
#endif /* KMP_GOMP_COMPAT */
1321
1322
template <typename T>
1323
int __kmp_dispatch_next_algorithm(int gtid,
1324
dispatch_private_info_template<T> *pr,
1325
dispatch_shared_info_template<T> volatile *sh,
1326
kmp_int32 *p_last, T *p_lb, T *p_ub,
1327
typename traits_t<T>::signed_t *p_st, T nproc,
1328
T tid) {
1329
typedef typename traits_t<T>::unsigned_t UT;
1330
typedef typename traits_t<T>::signed_t ST;
1331
typedef typename traits_t<T>::floating_t DBL;
1332
int status = 0;
1333
bool last = false;
1334
T start;
1335
ST incr;
1336
UT limit, trip, init;
1337
kmp_info_t *th = __kmp_threads[gtid];
1338
kmp_team_t *team = th->th.th_team;
1339
1340
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1341
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1342
KMP_DEBUG_ASSERT(pr);
1343
KMP_DEBUG_ASSERT(sh);
1344
KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1345
#ifdef KMP_DEBUG
1346
{
1347
char *buff;
1348
// create format specifiers before the debug output
1349
buff =
1350
__kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1351
"sh:%%p nproc:%%%s tid:%%%s\n",
1352
traits_t<T>::spec, traits_t<T>::spec);
1353
KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1354
__kmp_str_free(&buff);
1355
}
1356
#endif
1357
1358
// zero trip count
1359
if (pr->u.p.tc == 0) {
1360
KD_TRACE(10,
1361
("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1362
"zero status:%d\n",
1363
gtid, status));
1364
return 0;
1365
}
1366
1367
switch (pr->schedule) {
1368
#if KMP_STATIC_STEAL_ENABLED
1369
case kmp_sch_static_steal: {
1370
T chunk = pr->u.p.parm1;
1371
UT nchunks = pr->u.p.parm2;
1372
KD_TRACE(100,
1373
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1374
gtid));
1375
1376
trip = pr->u.p.tc - 1;
1377
1378
if (traits_t<T>::type_size > 4) {
1379
// use lock for 8-byte induction variable.
1380
// TODO (optional): check presence and use 16-byte CAS
1381
kmp_lock_t *lck = pr->u.p.steal_lock;
1382
KMP_DEBUG_ASSERT(lck != NULL);
1383
if (pr->u.p.count < (UT)pr->u.p.ub) {
1384
KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1385
__kmp_acquire_lock(lck, gtid);
1386
// try to get own chunk of iterations
1387
init = (pr->u.p.count)++;
1388
status = (init < (UT)pr->u.p.ub);
1389
__kmp_release_lock(lck, gtid);
1390
} else {
1391
status = 0; // no own chunks
1392
}
1393
if (!status) { // try to steal
1394
kmp_lock_t *lckv; // victim buffer's lock
1395
T while_limit = pr->u.p.parm3;
1396
T while_index = 0;
1397
int idx = (th->th.th_dispatch->th_disp_index - 1) %
1398
__kmp_dispatch_num_buffers; // current loop index
1399
// note: victim thread can potentially execute another loop
1400
KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1401
while ((!status) && (while_limit != ++while_index)) {
1402
dispatch_private_info_template<T> *v;
1403
T remaining;
1404
T victimId = pr->u.p.parm4;
1405
T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1406
v = reinterpret_cast<dispatch_private_info_template<T> *>(
1407
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1408
KMP_DEBUG_ASSERT(v);
1409
while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1410
oldVictimId != victimId) {
1411
victimId = (victimId + 1) % nproc;
1412
v = reinterpret_cast<dispatch_private_info_template<T> *>(
1413
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1414
KMP_DEBUG_ASSERT(v);
1415
}
1416
if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1417
continue; // try once more (nproc attempts in total)
1418
}
1419
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1420
kmp_uint32 old = UNUSED;
1421
// try to steal whole range from inactive victim
1422
status = v->steal_flag.compare_exchange_strong(old, THIEF);
1423
if (status) {
1424
// initialize self buffer with victim's whole range of chunks
1425
T id = victimId;
1426
T small_chunk = 0, extras = 0, p_extra = 0;
1427
__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1428
init, small_chunk, extras,
1429
p_extra);
1430
__kmp_acquire_lock(lck, gtid);
1431
pr->u.p.count = init + 1; // exclude one we execute immediately
1432
pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1433
__kmp_release_lock(lck, gtid);
1434
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1435
// no need to reinitialize other thread invariants: lb, st, etc.
1436
#ifdef KMP_DEBUG
1437
{
1438
char *buff;
1439
// create format specifiers before the debug output
1440
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1441
"stolen chunks from T#%%d, "
1442
"count:%%%s ub:%%%s\n",
1443
traits_t<UT>::spec, traits_t<T>::spec);
1444
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1445
__kmp_str_free(&buff);
1446
}
1447
#endif
1448
// activate non-empty buffer and let others steal from us
1449
if (pr->u.p.count < (UT)pr->u.p.ub)
1450
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1451
break;
1452
}
1453
}
1454
if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1455
v->u.p.count >= (UT)v->u.p.ub) {
1456
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1457
continue; // no chunks to steal, try next victim
1458
}
1459
lckv = v->u.p.steal_lock;
1460
KMP_ASSERT(lckv != NULL);
1461
__kmp_acquire_lock(lckv, gtid);
1462
limit = v->u.p.ub; // keep initial ub
1463
if (v->u.p.count >= limit) {
1464
__kmp_release_lock(lckv, gtid);
1465
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1466
continue; // no chunks to steal, try next victim
1467
}
1468
1469
// stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1470
// TODO: is this heuristic good enough?
1471
remaining = limit - v->u.p.count;
1472
if (remaining > 7) {
1473
// steal 1/4 of remaining
1474
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1475
init = (v->u.p.ub -= (remaining >> 2));
1476
} else {
1477
// steal 1 chunk of 1..7 remaining
1478
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1479
init = (v->u.p.ub -= 1);
1480
}
1481
__kmp_release_lock(lckv, gtid);
1482
#ifdef KMP_DEBUG
1483
{
1484
char *buff;
1485
// create format specifiers before the debug output
1486
buff = __kmp_str_format(
1487
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1488
"count:%%%s ub:%%%s\n",
1489
traits_t<UT>::spec, traits_t<UT>::spec);
1490
KD_TRACE(10, (buff, gtid, victimId, init, limit));
1491
__kmp_str_free(&buff);
1492
}
1493
#endif
1494
KMP_DEBUG_ASSERT(init + 1 <= limit);
1495
pr->u.p.parm4 = victimId; // remember victim to steal from
1496
status = 1;
1497
// now update own count and ub with stolen range excluding init chunk
1498
__kmp_acquire_lock(lck, gtid);
1499
pr->u.p.count = init + 1;
1500
pr->u.p.ub = limit;
1501
__kmp_release_lock(lck, gtid);
1502
// activate non-empty buffer and let others steal from us
1503
if (init + 1 < limit)
1504
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1505
} // while (search for victim)
1506
} // if (try to find victim and steal)
1507
} else {
1508
// 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1509
// as all operations on pair (count, ub) must be done atomically
1510
typedef union {
1511
struct {
1512
UT count;
1513
T ub;
1514
} p;
1515
kmp_int64 b;
1516
} union_i4;
1517
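// Illustrative walk-through: starting from {count = 5, ub = 10}, the owner
// copies the pair into vnew, bumps count to 6, and publishes it with a single
// 8-byte CAS below, so a concurrent thief shrinking ub can never observe a
// half-written (count, ub) pair.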
union_i4 vold, vnew;
1518
if (pr->u.p.count < (UT)pr->u.p.ub) {
1519
KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1520
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1521
vnew.b = vold.b;
1522
vnew.p.count++; // get chunk from head of self range
1523
while (!KMP_COMPARE_AND_STORE_REL64(
1524
(volatile kmp_int64 *)&pr->u.p.count,
1525
*VOLATILE_CAST(kmp_int64 *) & vold.b,
1526
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1527
KMP_CPU_PAUSE();
1528
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1529
vnew.b = vold.b;
1530
vnew.p.count++;
1531
}
1532
init = vold.p.count;
1533
status = (init < (UT)vold.p.ub);
1534
} else {
1535
status = 0; // no own chunks
1536
}
1537
if (!status) { // try to steal
1538
T while_limit = pr->u.p.parm3;
1539
T while_index = 0;
1540
int idx = (th->th.th_dispatch->th_disp_index - 1) %
1541
__kmp_dispatch_num_buffers; // current loop index
1542
// note: victim thread can potentially execute another loop
1543
KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1544
while ((!status) && (while_limit != ++while_index)) {
1545
dispatch_private_info_template<T> *v;
1546
T remaining;
1547
T victimId = pr->u.p.parm4;
1548
T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1549
v = reinterpret_cast<dispatch_private_info_template<T> *>(
1550
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1551
KMP_DEBUG_ASSERT(v);
1552
while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1553
oldVictimId != victimId) {
1554
victimId = (victimId + 1) % nproc;
1555
v = reinterpret_cast<dispatch_private_info_template<T> *>(
1556
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1557
KMP_DEBUG_ASSERT(v);
1558
}
1559
if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1560
continue; // try once more (nproc attempts in total)
1561
}
1562
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1563
kmp_uint32 old = UNUSED;
1564
// try to steal whole range from inactive victim
1565
status = v->steal_flag.compare_exchange_strong(old, THIEF);
1566
if (status) {
1567
// initialize self buffer with victim's whole range of chunks
1568
T id = victimId;
1569
T small_chunk = 0, extras = 0, p_extra = 0;
1570
__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1571
init, small_chunk, extras,
1572
p_extra);
1573
vnew.p.count = init + 1;
1574
vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1575
// write pair (count, ub) at once atomically
1576
#if KMP_ARCH_X86
1577
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1578
#else
1579
*(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1580
#endif
1581
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1582
// no need to initialize other thread invariants: lb, st, etc.
1583
#ifdef KMP_DEBUG
1584
{
1585
char *buff;
1586
// create format specifiers before the debug output
1587
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1588
"stolen chunks from T#%%d, "
1589
"count:%%%s ub:%%%s\n",
1590
traits_t<UT>::spec, traits_t<T>::spec);
1591
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1592
__kmp_str_free(&buff);
1593
}
1594
#endif
1595
// activate non-empty buffer and let others steal from us
1596
if (pr->u.p.count < (UT)pr->u.p.ub)
1597
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1598
break;
1599
}
1600
}
1601
while (1) { // CAS loop with check if victim still has enough chunks
1602
// many threads may be stealing concurrently from same victim
1603
vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1604
if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1605
vold.p.count >= (UT)vold.p.ub) {
1606
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1607
break; // no chunks to steal, try next victim
1608
}
1609
vnew.b = vold.b;
1610
remaining = vold.p.ub - vold.p.count;
1611
// try to steal 1/4 of remaining
1612
// TODO: is this heuristic good enough?
1613
if (remaining > 7) {
1614
vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1615
} else {
1616
vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1617
}
1618
KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1619
if (KMP_COMPARE_AND_STORE_REL64(
1620
(volatile kmp_int64 *)&v->u.p.count,
1621
*VOLATILE_CAST(kmp_int64 *) & vold.b,
1622
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1623
// stealing succeeded
1624
#ifdef KMP_DEBUG
1625
{
1626
char *buff;
1627
// create format specifiers before the debug output
1628
buff = __kmp_str_format(
1629
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1630
"count:%%%s ub:%%%s\n",
1631
traits_t<T>::spec, traits_t<T>::spec);
1632
KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1633
__kmp_str_free(&buff);
1634
}
1635
#endif
1636
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1637
vold.p.ub - vnew.p.ub);
1638
status = 1;
1639
pr->u.p.parm4 = victimId; // keep victim id
1640
// now update own count and ub
1641
init = vnew.p.ub;
1642
vold.p.count = init + 1;
1643
#if KMP_ARCH_X86
1644
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1645
#else
1646
*(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1647
#endif
1648
// activate non-empty buffer and let others steal from us
1649
if (vold.p.count < (UT)vold.p.ub)
1650
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1651
break;
1652
} // if (check CAS result)
1653
KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
} // while (try to steal from particular victim)
1655
} // while (search for victim)
1656
} // if (try to find victim and steal)
1657
} // if (4-byte induction variable)
1658
if (!status) {
1659
*p_lb = 0;
1660
*p_ub = 0;
1661
if (p_st != NULL)
1662
*p_st = 0;
1663
} else {
1664
start = pr->u.p.lb;
1665
init *= chunk;
1666
limit = chunk + init - 1;
1667
incr = pr->u.p.st;
1668
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1669
1670
KMP_DEBUG_ASSERT(init <= trip);
1671
// keep track of done chunks for possible early exit from stealing
1672
// TODO: count executed chunks locally with rare update of shared location
1673
// test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1674
if ((last = (limit >= trip)) != 0)
1675
limit = trip;
1676
if (p_st != NULL)
1677
*p_st = incr;
1678
1679
if (incr == 1) {
1680
*p_lb = start + init;
1681
*p_ub = start + limit;
1682
} else {
1683
*p_lb = start + init * incr;
1684
*p_ub = start + limit * incr;
1685
}
1686
} // if
1687
break;
1688
} // case
1689
#endif // KMP_STATIC_STEAL_ENABLED
1690
case kmp_sch_static_balanced: {
1691
KD_TRACE(
1692
10,
1693
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1694
gtid));
1695
/* check if thread has any iteration to do */
1696
if ((status = !pr->u.p.count) != 0) {
1697
pr->u.p.count = 1;
1698
*p_lb = pr->u.p.lb;
1699
*p_ub = pr->u.p.ub;
1700
last = (pr->u.p.parm1 != 0);
1701
if (p_st != NULL)
1702
*p_st = pr->u.p.st;
1703
} else { /* no iterations to do */
1704
pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1705
}
1706
} // case
1707
break;
1708
case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1709
merged here */
1710
case kmp_sch_static_chunked: {
1711
T parm1;
1712
1713
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1714
"kmp_sch_static_[affinity|chunked] case\n",
1715
gtid));
1716
parm1 = pr->u.p.parm1;
1717
1718
trip = pr->u.p.tc - 1;
1719
init = parm1 * (pr->u.p.count + tid);
1720
1721
if ((status = (init <= trip)) != 0) {
1722
start = pr->u.p.lb;
1723
incr = pr->u.p.st;
1724
limit = parm1 + init - 1;
1725
1726
if ((last = (limit >= trip)) != 0)
1727
limit = trip;
1728
1729
if (p_st != NULL)
1730
*p_st = incr;
1731
1732
pr->u.p.count += nproc;
1733
1734
if (incr == 1) {
1735
*p_lb = start + init;
1736
*p_ub = start + limit;
1737
} else {
1738
*p_lb = start + init * incr;
1739
*p_ub = start + limit * incr;
1740
}
1741
1742
if (pr->flags.ordered) {
1743
pr->u.p.ordered_lower = init;
1744
pr->u.p.ordered_upper = limit;
1745
} // if
1746
} // if
1747
} // case
1748
break;
1749
1750
case kmp_sch_dynamic_chunked: {
1751
UT chunk_number;
1752
UT chunk_size = pr->u.p.parm1;
1753
UT nchunks = pr->u.p.parm2;
1754
1755
KD_TRACE(
1756
100,
1757
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1758
gtid));
1759
1760
chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1761
status = (chunk_number < nchunks);
1762
if (!status) {
1763
*p_lb = 0;
1764
*p_ub = 0;
1765
if (p_st != NULL)
1766
*p_st = 0;
1767
} else {
1768
init = chunk_size * chunk_number;
1769
trip = pr->u.p.tc - 1;
1770
start = pr->u.p.lb;
1771
incr = pr->u.p.st;
1772
1773
if ((last = (trip - init < (UT)chunk_size)))
1774
limit = trip;
1775
else
1776
limit = chunk_size + init - 1;
1777
1778
if (p_st != NULL)
1779
*p_st = incr;
1780
1781
if (incr == 1) {
1782
*p_lb = start + init;
1783
*p_ub = start + limit;
1784
} else {
1785
*p_lb = start + init * incr;
1786
*p_ub = start + limit * incr;
1787
}
1788
1789
if (pr->flags.ordered) {
1790
pr->u.p.ordered_lower = init;
1791
pr->u.p.ordered_upper = limit;
1792
} // if
1793
} // if
1794
} // case
1795
break;
1796
1797
case kmp_sch_guided_iterative_chunked: {
1798
T chunkspec = pr->u.p.parm1;
1799
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1800
"iterative case\n",
1801
gtid));
1802
trip = pr->u.p.tc;
1803
// Start atomic part of calculations
1804
while (1) {
1805
ST remaining; // signed, because can be < 0
1806
init = sh->u.s.iteration; // shared value
1807
remaining = trip - init;
1808
if (remaining <= 0) { // AC: need to compare with 0 first
1809
// nothing to do, don't try atomic op
1810
status = 0;
1811
break;
1812
}
1813
if ((T)remaining <
1814
pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1815
// use dynamic-style schedule
1816
// atomically increment iterations, get old value
1817
init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1818
(ST)chunkspec);
1819
remaining = trip - init;
1820
if (remaining <= 0) {
1821
status = 0; // all iterations got by other threads
1822
} else {
1823
// got some iterations to work on
1824
status = 1;
1825
if ((T)remaining > chunkspec) {
1826
limit = init + chunkspec - 1;
1827
} else {
1828
last = true; // the last chunk
1829
limit = init + remaining - 1;
1830
} // if
1831
} // if
1832
break;
1833
} // if
1834
limit = init + (UT)((double)remaining *
1835
*(double *)&pr->u.p.parm3); // divide by K*nproc
1836
if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1837
(ST)init, (ST)limit)) {
1838
// CAS was successful, chunk obtained
1839
status = 1;
1840
--limit;
1841
break;
1842
} // if
1843
} // while
1844
if (status != 0) {
1845
start = pr->u.p.lb;
1846
incr = pr->u.p.st;
1847
if (p_st != NULL)
1848
*p_st = incr;
1849
*p_lb = start + init * incr;
1850
*p_ub = start + limit * incr;
1851
if (pr->flags.ordered) {
1852
pr->u.p.ordered_lower = init;
1853
pr->u.p.ordered_upper = limit;
1854
} // if
1855
} else {
1856
*p_lb = 0;
1857
*p_ub = 0;
1858
if (p_st != NULL)
1859
*p_st = 0;
1860
} // if
1861
} // case
1862
break;
1863
1864
case kmp_sch_guided_simd: {
1865
// same as the iterative case, but the current chunk is adjusted to be a
// multiple of the given chunk
T chunk = pr->u.p.parm1;
1868
KD_TRACE(100,
1869
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1870
gtid));
1871
trip = pr->u.p.tc;
1872
// Start atomic part of calculations
1873
while (1) {
1874
ST remaining; // signed, because can be < 0
1875
init = sh->u.s.iteration; // shared value
1876
remaining = trip - init;
1877
if (remaining <= 0) { // AC: need to compare with 0 first
1878
status = 0; // nothing to do, don't try atomic op
1879
break;
1880
}
1881
KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1882
// compare with K*nproc*(chunk+1), K=2 by default
1883
if ((T)remaining < pr->u.p.parm2) {
1884
// use dynamic-style schedule
1885
// atomically increment iterations, get old value
1886
init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1887
(ST)chunk);
1888
remaining = trip - init;
1889
if (remaining <= 0) {
1890
status = 0; // all iterations got by other threads
1891
} else {
1892
// got some iterations to work on
1893
status = 1;
1894
if ((T)remaining > chunk) {
1895
limit = init + chunk - 1;
1896
} else {
1897
last = true; // the last chunk
1898
limit = init + remaining - 1;
1899
} // if
1900
} // if
1901
break;
1902
} // if
1903
// divide by K*nproc
1904
UT span;
1905
__kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1906
&span);
1907
UT rem = span % chunk;
1908
if (rem) // adjust so that span%chunk == 0
1909
span += chunk - rem;
1910
limit = init + span;
1911
if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1912
(ST)init, (ST)limit)) {
1913
// CAS was successful, chunk obtained
1914
status = 1;
1915
--limit;
1916
break;
1917
} // if
1918
} // while
1919
if (status != 0) {
1920
start = pr->u.p.lb;
1921
incr = pr->u.p.st;
1922
if (p_st != NULL)
1923
*p_st = incr;
1924
*p_lb = start + init * incr;
1925
*p_ub = start + limit * incr;
1926
if (pr->flags.ordered) {
1927
pr->u.p.ordered_lower = init;
1928
pr->u.p.ordered_upper = limit;
1929
} // if
1930
} else {
1931
*p_lb = 0;
1932
*p_ub = 0;
1933
if (p_st != NULL)
1934
*p_st = 0;
1935
} // if
1936
} // case
1937
break;
1938
1939
case kmp_sch_guided_analytical_chunked: {
1940
T chunkspec = pr->u.p.parm1;
1941
UT chunkIdx;
1942
#if KMP_USE_X87CONTROL
1943
/* for storing original FPCW value for Windows* OS on
1944
IA-32 architecture 8-byte version */
1945
unsigned int oldFpcw;
1946
unsigned int fpcwSet = 0;
1947
#endif
1948
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1949
"kmp_sch_guided_analytical_chunked case\n",
1950
gtid));
1951
1952
trip = pr->u.p.tc;
1953
1954
KMP_DEBUG_ASSERT(nproc > 1);
1955
KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1956
1957
while (1) { /* this while loop is a safeguard against unexpected zero
1958
chunk sizes */
1959
chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1960
if (chunkIdx >= (UT)pr->u.p.parm2) {
1961
--trip;
1962
/* use dynamic-style scheduling */
1963
init = chunkIdx * chunkspec + pr->u.p.count;
1964
/* need to verify init > 0 in case of overflow in the above
1965
* calculation */
1966
if ((status = (init > 0 && init <= trip)) != 0) {
1967
limit = init + chunkspec - 1;
1968
1969
if ((last = (limit >= trip)) != 0)
1970
limit = trip;
1971
}
1972
break;
1973
} else {
1974
/* use exponential-style scheduling */
1975
/* The following check is to work around the lack of long double precision on
Windows* OS.
This check works around the possible effect that init != 0 for chunkIdx == 0.
*/
1979
#if KMP_USE_X87CONTROL
1980
/* If we haven't already done so, save original
1981
FPCW and set precision to 64-bit, as Windows* OS
1982
on IA-32 architecture defaults to 53-bit */
1983
if (!fpcwSet) {
1984
oldFpcw = _control87(0, 0);
1985
_control87(_PC_64, _MCW_PC);
1986
fpcwSet = 0x30000;
1987
}
1988
#endif
1989
if (chunkIdx) {
1990
init = __kmp_dispatch_guided_remaining<T>(
1991
trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1992
KMP_DEBUG_ASSERT(init);
1993
init = trip - init;
1994
} else
1995
init = 0;
1996
limit = trip - __kmp_dispatch_guided_remaining<T>(
1997
trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1998
KMP_ASSERT(init <= limit);
1999
if (init < limit) {
2000
KMP_DEBUG_ASSERT(limit <= trip);
2001
--limit;
2002
status = 1;
2003
break;
2004
} // if
2005
} // if
2006
} // while (1)
2007
#if KMP_USE_X87CONTROL
2008
/* restore FPCW if necessary
2009
AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2010
*/
2011
if (fpcwSet && (oldFpcw & fpcwSet))
2012
_control87(oldFpcw, _MCW_PC);
2013
#endif
2014
if (status != 0) {
2015
start = pr->u.p.lb;
2016
incr = pr->u.p.st;
2017
if (p_st != NULL)
2018
*p_st = incr;
2019
*p_lb = start + init * incr;
2020
*p_ub = start + limit * incr;
2021
if (pr->flags.ordered) {
2022
pr->u.p.ordered_lower = init;
2023
pr->u.p.ordered_upper = limit;
2024
}
2025
} else {
2026
*p_lb = 0;
2027
*p_ub = 0;
2028
if (p_st != NULL)
2029
*p_st = 0;
2030
}
2031
} // case
2032
break;
2033
2034
case kmp_sch_trapezoidal: {
2035
UT index;
2036
T parm2 = pr->u.p.parm2;
2037
T parm3 = pr->u.p.parm3;
2038
T parm4 = pr->u.p.parm4;
2039
KD_TRACE(100,
2040
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
2041
gtid));
2042
2043
index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2044
2045
init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2046
trip = pr->u.p.tc - 1;
2047
2048
if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2049
*p_lb = 0;
2050
*p_ub = 0;
2051
if (p_st != NULL)
2052
*p_st = 0;
2053
} else {
2054
start = pr->u.p.lb;
2055
limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2056
incr = pr->u.p.st;
2057
2058
if ((last = (limit >= trip)) != 0)
2059
limit = trip;
2060
2061
if (p_st != NULL)
2062
*p_st = incr;
2063
2064
if (incr == 1) {
2065
*p_lb = start + init;
2066
*p_ub = start + limit;
2067
} else {
2068
*p_lb = start + init * incr;
2069
*p_ub = start + limit * incr;
2070
}
2071
2072
if (pr->flags.ordered) {
2073
pr->u.p.ordered_lower = init;
2074
pr->u.p.ordered_upper = limit;
2075
} // if
2076
} // if
2077
} // case
2078
break;
2079
default: {
2080
status = 0; // to avoid complaints on uninitialized variable use
2081
__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2082
KMP_HNT(GetNewerLibrary), // Hint
2083
__kmp_msg_null // Variadic argument list terminator
2084
);
2085
} break;
2086
} // switch
2087
if (p_last)
2088
*p_last = last;
2089
#ifdef KMP_DEBUG
2090
if (pr->flags.ordered) {
2091
char *buff;
2092
// create format specifiers before the debug output
2093
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
2094
"ordered_lower:%%%s ordered_upper:%%%s\n",
2095
traits_t<UT>::spec, traits_t<UT>::spec);
2096
KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
2097
__kmp_str_free(&buff);
2098
}
2099
{
2100
char *buff;
2101
// create format specifiers before the debug output
2102
buff = __kmp_str_format(
2103
"__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
2104
"p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
2105
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2106
KMP_DEBUG_ASSERT(p_last);
2107
KMP_DEBUG_ASSERT(p_st);
2108
KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
2109
__kmp_str_free(&buff);
2110
}
2111
#endif
2112
return status;
2113
}
2114
2115
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
2116
work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
2117
is not called. */
2118
#if OMPT_SUPPORT && OMPT_OPTIONAL
2119
#define OMPT_LOOP_END \
2120
if (status == 0) { \
2121
if (ompt_enabled.ompt_callback_work) { \
2122
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
2123
ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
2124
ompt_callbacks.ompt_callback(ompt_callback_work)( \
2125
ompt_get_work_schedule(pr->schedule), ompt_scope_end, \
2126
&(team_info->parallel_data), &(task_info->task_data), 0, codeptr); \
2127
} \
2128
}
2129
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \
2130
if (ompt_enabled.ompt_callback_dispatch && status) { \
2131
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
2132
ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
2133
ompt_dispatch_chunk_t chunk; \
2134
ompt_data_t instance = ompt_data_none; \
2135
OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \
2136
instance.ptr = &chunk; \
2137
ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \
2138
&(team_info->parallel_data), &(task_info->task_data), \
2139
ompt_dispatch_ws_loop_chunk, instance); \
2140
}
2141
// TODO: implement count
2142
#else
2143
#define OMPT_LOOP_END // no-op
2144
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
2145
#endif
2146
2147
#if KMP_STATS_ENABLED
2148
#define KMP_STATS_LOOP_END \
2149
{ \
2150
kmp_int64 u, l, t, i; \
2151
l = (kmp_int64)(*p_lb); \
2152
u = (kmp_int64)(*p_ub); \
2153
i = (kmp_int64)(pr->u.p.st); \
2154
if (status == 0) { \
2155
t = 0; \
2156
KMP_POP_PARTITIONED_TIMER(); \
2157
} else if (i == 1) { \
2158
if (u >= l) \
2159
t = u - l + 1; \
2160
else \
2161
t = 0; \
2162
} else if (i < 0) { \
2163
if (l >= u) \
2164
t = (l - u) / (-i) + 1; \
2165
else \
2166
t = 0; \
2167
} else { \
2168
if (u >= l) \
2169
t = (u - l) / i + 1; \
2170
else \
2171
t = 0; \
2172
} \
2173
KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
2174
}
2175
#else
2176
#define KMP_STATS_LOOP_END /* Nothing */
2177
#endif
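/* Worked example of the chunk trip count computed above (illustrative only):
   for a chunk with *p_lb = 0, *p_ub = 9 and stride i = 3, the macro takes the
   i > 0 branch and records t = (u - l) / i + 1 = (9 - 0) / 3 + 1 = 4
   iterations, namely 0, 3, 6 and 9. */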
2178
2179
template <typename T>
2180
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2181
T *p_lb, T *p_ub,
2182
typename traits_t<T>::signed_t *p_st
2183
#if OMPT_SUPPORT && OMPT_OPTIONAL
2184
,
2185
void *codeptr
2186
#endif
2187
) {
2188
2189
typedef typename traits_t<T>::unsigned_t UT;
2190
typedef typename traits_t<T>::signed_t ST;
2191
// This is potentially slightly misleading: schedule(runtime) will appear here
// even if the actual runtime schedule is static. (Which points out a
2193
// disadvantage of schedule(runtime): even when static scheduling is used it
2194
// costs more than a compile time choice to use static scheduling would.)
2195
KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2196
2197
int status;
2198
dispatch_private_info_template<T> *pr;
2199
__kmp_assert_valid_gtid(gtid);
2200
kmp_info_t *th = __kmp_threads[gtid];
2201
kmp_team_t *team = th->th.th_team;
2202
2203
KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2204
KD_TRACE(
2205
1000,
2206
("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2207
gtid, p_lb, p_ub, p_st, p_last));
2208
2209
if (team->t.t_serialized) {
2210
/* NOTE: serialize this dispatch because we are not at the active level */
2211
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2212
th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2213
KMP_DEBUG_ASSERT(pr);
2214
2215
if ((status = (pr->u.p.tc != 0)) == 0) {
2216
*p_lb = 0;
2217
*p_ub = 0;
2218
// if ( p_last != NULL )
2219
// *p_last = 0;
2220
if (p_st != NULL)
2221
*p_st = 0;
2222
if (__kmp_env_consistency_check) {
2223
if (pr->pushed_ws != ct_none) {
2224
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2225
}
2226
}
2227
} else if (pr->flags.nomerge) {
2228
kmp_int32 last;
2229
T start;
2230
UT limit, trip, init;
2231
ST incr;
2232
T chunk = pr->u.p.parm1;
2233
2234
KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2235
gtid));
2236
2237
init = chunk * pr->u.p.count++;
2238
trip = pr->u.p.tc - 1;
2239
2240
if ((status = (init <= trip)) == 0) {
2241
*p_lb = 0;
2242
*p_ub = 0;
2243
// if ( p_last != NULL )
2244
// *p_last = 0;
2245
if (p_st != NULL)
2246
*p_st = 0;
2247
if (__kmp_env_consistency_check) {
2248
if (pr->pushed_ws != ct_none) {
2249
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2250
}
2251
}
2252
} else {
2253
start = pr->u.p.lb;
2254
limit = chunk + init - 1;
2255
incr = pr->u.p.st;
2256
2257
if ((last = (limit >= trip)) != 0) {
2258
limit = trip;
2259
#if KMP_OS_WINDOWS
2260
pr->u.p.last_upper = pr->u.p.ub;
2261
#endif /* KMP_OS_WINDOWS */
2262
}
2263
if (p_last != NULL)
2264
*p_last = last;
2265
if (p_st != NULL)
2266
*p_st = incr;
2267
if (incr == 1) {
2268
*p_lb = start + init;
2269
*p_ub = start + limit;
2270
} else {
2271
*p_lb = start + init * incr;
2272
*p_ub = start + limit * incr;
2273
}
2274
2275
if (pr->flags.ordered) {
2276
pr->u.p.ordered_lower = init;
2277
pr->u.p.ordered_upper = limit;
2278
#ifdef KMP_DEBUG
2279
{
2280
char *buff;
2281
// create format specifiers before the debug output
2282
buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2283
"ordered_lower:%%%s ordered_upper:%%%s\n",
2284
traits_t<UT>::spec, traits_t<UT>::spec);
2285
KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2286
pr->u.p.ordered_upper));
2287
__kmp_str_free(&buff);
2288
}
2289
#endif
2290
} // if
2291
} // if
2292
} else {
2293
pr->u.p.tc = 0;
2294
*p_lb = pr->u.p.lb;
2295
*p_ub = pr->u.p.ub;
2296
#if KMP_OS_WINDOWS
2297
pr->u.p.last_upper = *p_ub;
2298
#endif /* KMP_OS_WINDOWS */
2299
if (p_last != NULL)
2300
*p_last = TRUE;
2301
if (p_st != NULL)
2302
*p_st = pr->u.p.st;
2303
} // if
2304
#ifdef KMP_DEBUG
2305
{
2306
char *buff;
2307
// create format specifiers before the debug output
2308
buff = __kmp_str_format(
2309
"__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2310
"p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2311
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2312
KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2313
(p_last ? *p_last : 0), status));
2314
__kmp_str_free(&buff);
2315
}
2316
#endif
2317
#if INCLUDE_SSC_MARKS
2318
SSC_MARK_DISPATCH_NEXT();
2319
#endif
2320
OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2321
OMPT_LOOP_END;
2322
KMP_STATS_LOOP_END;
2323
return status;
2324
} else {
2325
kmp_int32 last = 0;
2326
dispatch_shared_info_template<T> volatile *sh;
2327
2328
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2329
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2330
2331
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2332
th->th.th_dispatch->th_dispatch_pr_current);
2333
KMP_DEBUG_ASSERT(pr);
2334
sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2335
th->th.th_dispatch->th_dispatch_sh_current);
2336
KMP_DEBUG_ASSERT(sh);
2337
2338
#if KMP_USE_HIER_SCHED
2339
if (pr->flags.use_hier)
2340
status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2341
else
2342
#endif // KMP_USE_HIER_SCHED
2343
status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2344
p_st, th->th.th_team_nproc,
2345
th->th.th_info.ds.ds_tid);
2346
// status == 0: no more iterations to execute
2347
if (status == 0) {
2348
ST num_done;
2349
num_done = test_then_inc<ST>(&sh->u.s.num_done);
2350
#ifdef KMP_DEBUG
2351
{
2352
char *buff;
2353
// create format specifiers before the debug output
2354
buff = __kmp_str_format(
2355
"__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2356
traits_t<ST>::spec);
2357
KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2358
__kmp_str_free(&buff);
2359
}
2360
#endif
2361
2362
#if KMP_USE_HIER_SCHED
2363
pr->flags.use_hier = FALSE;
2364
#endif
2365
if (num_done == th->th.th_team_nproc - 1) {
2366
#if KMP_STATIC_STEAL_ENABLED
2367
if (pr->schedule == kmp_sch_static_steal) {
2368
int i;
2369
int idx = (th->th.th_dispatch->th_disp_index - 1) %
2370
__kmp_dispatch_num_buffers; // current loop index
2371
// loop complete, safe to destroy locks used for stealing
2372
for (i = 0; i < th->th.th_team_nproc; ++i) {
2373
dispatch_private_info_template<T> *buf =
2374
reinterpret_cast<dispatch_private_info_template<T> *>(
2375
&team->t.t_dispatch[i].th_disp_buffer[idx]);
2376
KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2377
KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2378
if (traits_t<T>::type_size > 4) {
2379
// destroy locks used for stealing
2380
kmp_lock_t *lck = buf->u.p.steal_lock;
2381
KMP_ASSERT(lck != NULL);
2382
__kmp_destroy_lock(lck);
2383
__kmp_free(lck);
2384
buf->u.p.steal_lock = NULL;
2385
}
2386
}
2387
}
2388
#endif
2389
/* NOTE: release shared buffer to be reused */
2390
2391
KMP_MB(); /* Flush all pending memory write invalidates. */
2392
2393
sh->u.s.num_done = 0;
2394
sh->u.s.iteration = 0;
2395
2396
/* TODO replace with general release procedure? */
2397
if (pr->flags.ordered) {
2398
sh->u.s.ordered_iteration = 0;
2399
}
2400
2401
KMP_MB(); /* Flush all pending memory write invalidates. */
2402
2403
sh->buffer_index += __kmp_dispatch_num_buffers;
2404
KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2405
gtid, sh->buffer_index));
2406
2407
KMP_MB(); /* Flush all pending memory write invalidates. */
2408
2409
} // if
2410
if (__kmp_env_consistency_check) {
2411
if (pr->pushed_ws != ct_none) {
2412
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2413
}
2414
}
2415
2416
th->th.th_dispatch->th_deo_fcn = NULL;
2417
th->th.th_dispatch->th_dxo_fcn = NULL;
2418
th->th.th_dispatch->th_dispatch_sh_current = NULL;
2419
th->th.th_dispatch->th_dispatch_pr_current = NULL;
2420
} // if (status == 0)
2421
#if KMP_OS_WINDOWS
2422
else if (last) {
2423
pr->u.p.last_upper = pr->u.p.ub;
2424
}
2425
#endif /* KMP_OS_WINDOWS */
2426
if (p_last != NULL && status != 0)
2427
*p_last = last;
2428
} // if
2429
2430
#ifdef KMP_DEBUG
2431
{
2432
char *buff;
2433
// create format specifiers before the debug output
2434
buff = __kmp_str_format(
2435
"__kmp_dispatch_next: T#%%d normal case: "
2436
"p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2437
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2438
KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2439
(p_last ? *p_last : 0), status));
2440
__kmp_str_free(&buff);
2441
}
2442
#endif
2443
#if INCLUDE_SSC_MARKS
2444
SSC_MARK_DISPATCH_NEXT();
2445
#endif
2446
OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2447
OMPT_LOOP_END;
2448
KMP_STATS_LOOP_END;
2449
return status;
2450
}
2451
2452
/*!
2453
@ingroup WORK_SHARING
2454
@param loc source location information
2455
@param global_tid global thread number
2456
@return Zero if the parallel region is not active and this thread should execute
2457
all sections, non-zero otherwise.
2458
2459
Beginning of sections construct.
2460
There are no implicit barriers in the "sections" calls, rather the compiler
2461
should introduce an explicit barrier if it is required.
2462
2463
This implementation is based on __kmp_dispatch_init, using the same constructs
for shared data (sections cannot be nested directly inside an omp for loop;
there must be a parallel region in between).
*/
2467
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
2468
2469
int active;
2470
kmp_info_t *th;
2471
kmp_team_t *team;
2472
kmp_uint32 my_buffer_index;
2473
dispatch_shared_info_template<kmp_int32> volatile *sh;
2474
2475
KMP_DEBUG_ASSERT(__kmp_init_serial);
2476
2477
if (!TCR_4(__kmp_init_parallel))
2478
__kmp_parallel_initialize();
2479
__kmp_resume_if_soft_paused();
2480
2481
/* setup data */
2482
th = __kmp_threads[gtid];
2483
team = th->th.th_team;
2484
active = !team->t.t_serialized;
2485
th->th.th_ident = loc;
2486
2487
KMP_COUNT_BLOCK(OMP_SECTIONS);
2488
KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
2489
2490
if (active) {
2491
// Set up sections in the same way as dynamically scheduled loops.
// We need one piece of shared data: which section is to be executed next.
// (in case parallel is not active, all sections will be executed on the
2494
// same thread)
2495
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2496
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2497
2498
my_buffer_index = th->th.th_dispatch->th_disp_index++;
2499
2500
// reuse shared data structures from dynamic sched loops:
2501
sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2502
&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2503
KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2504
my_buffer_index));
2505
2506
th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2507
th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2508
2509
KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2510
"sh->buffer_index:%d\n",
2511
gtid, my_buffer_index, sh->buffer_index));
2512
__kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
2513
__kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2514
// Note: KMP_WAIT() cannot be used here: buffer index and
// my_buffer_index are *always* 32-bit integers.
KMP_MB();
2517
KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2518
"sh->buffer_index:%d\n",
2519
gtid, my_buffer_index, sh->buffer_index));
2520
2521
th->th.th_dispatch->th_dispatch_pr_current =
2522
nullptr; // sections construct doesn't need private data
2523
th->th.th_dispatch->th_dispatch_sh_current =
2524
CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
2525
}
2526
2527
#if OMPT_SUPPORT && OMPT_OPTIONAL
2528
if (ompt_enabled.ompt_callback_work) {
2529
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2530
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2531
ompt_callbacks.ompt_callback(ompt_callback_work)(
2532
ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2533
&(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2534
}
2535
#endif
2536
KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2537
2538
return active;
2539
}
2540
2541
/*!
2542
@ingroup WORK_SHARING
2543
@param loc source location information
2544
@param global_tid global thread number
2545
@param numberOfSections number of sections in the 'sections' construct
2546
@return unsigned in [0, n) - the id of the section to execute next on this
thread; n (or any other value outside that range) - nothing to execute on this
thread.
*/
2550
2551
kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2552
kmp_int32 numberOfSections) {
2553
2554
KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2555
2556
kmp_info_t *th = __kmp_threads[gtid];
2557
#ifdef KMP_DEBUG
2558
kmp_team_t *team = th->th.th_team;
2559
#endif
2560
2561
KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2562
numberOfSections));
2563
2564
// For serialized case we should not call this function:
2565
KMP_DEBUG_ASSERT(!team->t.t_serialized);
2566
2567
dispatch_shared_info_template<kmp_int32> volatile *sh;
2568
2569
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2570
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2571
2572
KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2573
sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2574
th->th.th_dispatch->th_dispatch_sh_current);
2575
KMP_DEBUG_ASSERT(sh);
2576
2577
kmp_int32 sectionIndex = 0;
2578
bool moreSectionsToExecute = true;
2579
2580
// Find section to execute:
2581
sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2582
if (sectionIndex >= numberOfSections) {
2583
moreSectionsToExecute = false;
2584
}
2585
2586
// status == 0: no more sections to execute;
2587
// OMPTODO: __kmpc_end_sections could be bypassed?
2588
if (!moreSectionsToExecute) {
2589
kmp_int32 num_done;
2590
2591
num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2592
2593
if (num_done == th->th.th_team_nproc - 1) {
2594
/* NOTE: release this buffer to be reused */
2595
2596
KMP_MB(); /* Flush all pending memory write invalidates. */
2597
2598
sh->u.s.num_done = 0;
2599
sh->u.s.iteration = 0;
2600
2601
KMP_MB(); /* Flush all pending memory write invalidates. */
2602
2603
sh->buffer_index += __kmp_dispatch_num_buffers;
2604
KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2605
sh->buffer_index));
2606
2607
KMP_MB(); /* Flush all pending memory write invalidates. */
2608
2609
} // if
2610
2611
th->th.th_dispatch->th_deo_fcn = NULL;
2612
th->th.th_dispatch->th_dxo_fcn = NULL;
2613
th->th.th_dispatch->th_dispatch_sh_current = NULL;
2614
th->th.th_dispatch->th_dispatch_pr_current = NULL;
2615
2616
#if OMPT_SUPPORT && OMPT_OPTIONAL
2617
if (ompt_enabled.ompt_callback_dispatch) {
2618
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2619
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2620
ompt_data_t instance = ompt_data_none;
2621
instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2622
ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2623
&(team_info->parallel_data), &(task_info->task_data),
2624
ompt_dispatch_section, instance);
2625
}
2626
#endif
2627
}
2628
2629
return sectionIndex;
2630
}
2631
2632
/*!
2633
@ingroup WORK_SHARING
2634
@param loc source location information
2635
@param global_tid global thread number
2636
2637
End of "sections" construct.
2638
Don't need to wait here: barrier is added separately when needed.
2639
*/
2640
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2641
2642
kmp_info_t *th = __kmp_threads[gtid];
2643
int active = !th->th.th_team->t.t_serialized;
2644
2645
KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2646
2647
if (!active) {
2648
// In the active case, finalization is done in __kmpc_next_section
#if OMPT_SUPPORT && OMPT_OPTIONAL
2650
if (ompt_enabled.ompt_callback_work) {
2651
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2652
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2653
ompt_callbacks.ompt_callback(ompt_callback_work)(
2654
ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2655
&(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2656
}
2657
#endif
2658
}
2659
2660
KMP_POP_PARTITIONED_TIMER();
2661
KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2662
}
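/* A rough sketch of how a compiler might drive these entry points for a
   "sections" construct with n sections; run_section() is a hypothetical
   helper standing in for the compiler-generated section bodies:

     if (__kmpc_sections_init(loc, gtid)) {
       kmp_int32 s;
       while ((s = __kmpc_next_section(loc, gtid, n)) < n)
         run_section(s); // execute section number s
     } else {
       // parallel region not active: this thread runs all n sections itself
     }
     __kmpc_end_sections(loc, gtid);
*/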
2663
2664
template <typename T>
2665
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2666
kmp_int32 *plastiter, T *plower, T *pupper,
2667
typename traits_t<T>::signed_t incr) {
2668
typedef typename traits_t<T>::unsigned_t UT;
2669
kmp_uint32 team_id;
2670
kmp_uint32 nteams;
2671
UT trip_count;
2672
kmp_team_t *team;
2673
kmp_info_t *th;
2674
2675
KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2676
KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2677
#ifdef KMP_DEBUG
2678
typedef typename traits_t<T>::signed_t ST;
2679
{
2680
char *buff;
2681
// create format specifiers before the debug output
2682
buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2683
"iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2684
traits_t<T>::spec, traits_t<T>::spec,
2685
traits_t<ST>::spec, traits_t<T>::spec);
2686
KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2687
__kmp_str_free(&buff);
2688
}
2689
#endif
2690
2691
if (__kmp_env_consistency_check) {
2692
if (incr == 0) {
2693
__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2694
loc);
2695
}
2696
if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2697
// The loop is illegal.
2698
// Some zero-trip loops maintained by compiler, e.g.:
2699
// for(i=10;i<0;++i) // lower >= upper - run-time check
2700
// for(i=0;i>10;--i) // lower <= upper - run-time check
2701
// for(i=0;i>10;++i) // incr > 0 - compile-time check
2702
// for(i=10;i<0;--i) // incr < 0 - compile-time check
2703
// Compiler does not check the following illegal loops:
2704
// for(i=0;i<10;i+=incr) // where incr<0
2705
// for(i=10;i>0;i-=incr) // where incr<0
2706
__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2707
}
2708
}
2709
__kmp_assert_valid_gtid(gtid);
2710
th = __kmp_threads[gtid];
2711
team = th->th.th_team;
2712
KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2713
nteams = th->th.th_teams_size.nteams;
2714
team_id = team->t.t_master_tid;
2715
KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2716
2717
// compute global trip count
2718
if (incr == 1) {
2719
trip_count = *pupper - *plower + 1;
2720
} else if (incr == -1) {
2721
trip_count = *plower - *pupper + 1;
2722
} else if (incr > 0) {
2723
// upper-lower can exceed the limit of signed type
2724
trip_count = (UT)(*pupper - *plower) / incr + 1;
2725
} else {
2726
trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2727
}
2728
2729
if (trip_count <= nteams) {
2730
KMP_DEBUG_ASSERT(
2731
__kmp_static == kmp_sch_static_greedy ||
2732
__kmp_static ==
2733
kmp_sch_static_balanced); // Unknown static scheduling type.
2734
// only some teams get single iteration, others get nothing
2735
if (team_id < trip_count) {
2736
*pupper = *plower = *plower + team_id * incr;
2737
} else {
2738
*plower = *pupper + incr; // zero-trip loop
2739
}
2740
if (plastiter != NULL)
2741
*plastiter = (team_id == trip_count - 1);
2742
} else {
2743
if (__kmp_static == kmp_sch_static_balanced) {
2744
UT chunk = trip_count / nteams;
2745
UT extras = trip_count % nteams;
2746
*plower +=
2747
incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2748
*pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2749
if (plastiter != NULL)
2750
*plastiter = (team_id == nteams - 1);
2751
} else {
2752
T chunk_inc_count =
2753
(trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2754
T upper = *pupper;
2755
KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2756
// Unknown static scheduling type.
2757
*plower += team_id * chunk_inc_count;
2758
*pupper = *plower + chunk_inc_count - incr;
2759
// Check/correct bounds if needed
2760
if (incr > 0) {
2761
if (*pupper < *plower)
2762
*pupper = traits_t<T>::max_value;
2763
if (plastiter != NULL)
2764
*plastiter = *plower <= upper && *pupper > upper - incr;
2765
if (*pupper > upper)
2766
*pupper = upper; // tracker C73258
2767
} else {
2768
if (*pupper > *plower)
2769
*pupper = traits_t<T>::min_value;
2770
if (plastiter != NULL)
2771
*plastiter = *plower >= upper && *pupper < upper - incr;
2772
if (*pupper < upper)
2773
*pupper = upper; // tracker C73258
2774
}
2775
}
2776
}
2777
}
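/* Worked example (assuming __kmp_static == kmp_sch_static_balanced):
   for incr = 1, *plower = 0, *pupper = 9 (trip_count = 10) and nteams = 4,
   chunk = 2 and extras = 2, so the teams receive [0,2], [3,5], [6,7] and
   [8,9] respectively, and only team 3 sets *plastiter. */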
2778
2779
//-----------------------------------------------------------------------------
2780
// Dispatch routines
2781
// Transfer call to template< type T >
2782
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2783
// T lb, T ub, ST st, ST chunk )
2784
extern "C" {
2785
2786
/*!
2787
@ingroup WORK_SHARING
2788
@{
2789
@param loc Source location
2790
@param gtid Global thread id
2791
@param schedule Schedule type
2792
@param lb Lower bound
2793
@param ub Upper bound
2794
@param st Step (or increment if you prefer)
2795
@param chunk The chunk size to block with
2796
2797
This function prepares the runtime to start a dynamically scheduled for loop,
2798
saving the loop arguments.
2799
These functions are all identical apart from the types of the arguments.
2800
*/
2801
2802
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2803
enum sched_type schedule, kmp_int32 lb,
2804
kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2805
KMP_DEBUG_ASSERT(__kmp_init_serial);
2806
#if OMPT_SUPPORT && OMPT_OPTIONAL
2807
OMPT_STORE_RETURN_ADDRESS(gtid);
2808
#endif
2809
__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2810
}
2811
/*!
2812
See @ref __kmpc_dispatch_init_4
2813
*/
2814
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2815
enum sched_type schedule, kmp_uint32 lb,
2816
kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2817
KMP_DEBUG_ASSERT(__kmp_init_serial);
2818
#if OMPT_SUPPORT && OMPT_OPTIONAL
2819
OMPT_STORE_RETURN_ADDRESS(gtid);
2820
#endif
2821
__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2822
}
2823
2824
/*!
2825
See @ref __kmpc_dispatch_init_4
2826
*/
2827
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2828
enum sched_type schedule, kmp_int64 lb,
2829
kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2830
KMP_DEBUG_ASSERT(__kmp_init_serial);
2831
#if OMPT_SUPPORT && OMPT_OPTIONAL
2832
OMPT_STORE_RETURN_ADDRESS(gtid);
2833
#endif
2834
__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2835
}
2836
2837
/*!
2838
See @ref __kmpc_dispatch_init_4
2839
*/
2840
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2841
enum sched_type schedule, kmp_uint64 lb,
2842
kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2843
KMP_DEBUG_ASSERT(__kmp_init_serial);
2844
#if OMPT_SUPPORT && OMPT_OPTIONAL
2845
OMPT_STORE_RETURN_ADDRESS(gtid);
2846
#endif
2847
__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2848
}
2849
2850
/*!
2851
See @ref __kmpc_dispatch_init_4
2852
2853
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, we need to compute the per-team iteration
space.
2857
These functions are all identical apart from the types of the arguments.
2858
*/
2859
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2860
enum sched_type schedule, kmp_int32 *p_last,
2861
kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2862
kmp_int32 chunk) {
2863
KMP_DEBUG_ASSERT(__kmp_init_serial);
2864
#if OMPT_SUPPORT && OMPT_OPTIONAL
2865
OMPT_STORE_RETURN_ADDRESS(gtid);
2866
#endif
2867
__kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2868
__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2869
}
2870
2871
void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2872
enum sched_type schedule, kmp_int32 *p_last,
2873
kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2874
kmp_int32 chunk) {
2875
KMP_DEBUG_ASSERT(__kmp_init_serial);
2876
#if OMPT_SUPPORT && OMPT_OPTIONAL
2877
OMPT_STORE_RETURN_ADDRESS(gtid);
2878
#endif
2879
__kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2880
__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2881
}
2882
2883
void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2884
enum sched_type schedule, kmp_int32 *p_last,
2885
kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2886
kmp_int64 chunk) {
2887
KMP_DEBUG_ASSERT(__kmp_init_serial);
2888
#if OMPT_SUPPORT && OMPT_OPTIONAL
2889
OMPT_STORE_RETURN_ADDRESS(gtid);
2890
#endif
2891
__kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2892
__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2893
}
2894
2895
void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2896
enum sched_type schedule, kmp_int32 *p_last,
2897
kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2898
kmp_int64 chunk) {
2899
KMP_DEBUG_ASSERT(__kmp_init_serial);
2900
#if OMPT_SUPPORT && OMPT_OPTIONAL
2901
OMPT_STORE_RETURN_ADDRESS(gtid);
2902
#endif
2903
__kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2904
__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2905
}
2906
2907
/*!
2908
@param loc Source code location
2909
@param gtid Global thread id
2910
@param p_last Pointer to a flag set to one if this is the last chunk or zero
2911
otherwise
2912
@param p_lb Pointer to the lower bound for the next chunk of work
2913
@param p_ub Pointer to the upper bound for the next chunk of work
2914
@param p_st Pointer to the stride for the next chunk of work
2915
@return one if there is work to be done, zero otherwise
2916
2917
Get the next dynamically allocated chunk of work for this thread.
2918
If there is no more work, then the lb,ub and stride need not be modified.
2919
*/
2920
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2921
kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2922
#if OMPT_SUPPORT && OMPT_OPTIONAL
2923
OMPT_STORE_RETURN_ADDRESS(gtid);
2924
#endif
2925
return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2926
#if OMPT_SUPPORT && OMPT_OPTIONAL
2927
,
2928
OMPT_LOAD_RETURN_ADDRESS(gtid)
2929
#endif
2930
);
2931
}
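/* A minimal sketch of the lowering a compiler might emit for
   #pragma omp for schedule(dynamic, 4) over i = 0..N-1 (illustrative only;
   the exact code generation is compiler-specific):

     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i); // body() stands in for the compiler-generated loop body
     }
*/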
2932
2933
/*!
2934
See @ref __kmpc_dispatch_next_4
2935
*/
2936
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2937
kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2938
kmp_int32 *p_st) {
2939
#if OMPT_SUPPORT && OMPT_OPTIONAL
2940
OMPT_STORE_RETURN_ADDRESS(gtid);
2941
#endif
2942
return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2943
#if OMPT_SUPPORT && OMPT_OPTIONAL
2944
,
2945
OMPT_LOAD_RETURN_ADDRESS(gtid)
2946
#endif
2947
);
2948
}
2949
2950
/*!
2951
See @ref __kmpc_dispatch_next_4
2952
*/
2953
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2954
kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2955
#if OMPT_SUPPORT && OMPT_OPTIONAL
2956
OMPT_STORE_RETURN_ADDRESS(gtid);
2957
#endif
2958
return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2959
#if OMPT_SUPPORT && OMPT_OPTIONAL
2960
,
2961
OMPT_LOAD_RETURN_ADDRESS(gtid)
2962
#endif
2963
);
2964
}
2965
2966
/*!
2967
See @ref __kmpc_dispatch_next_4
2968
*/
2969
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2970
kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2971
kmp_int64 *p_st) {
2972
#if OMPT_SUPPORT && OMPT_OPTIONAL
2973
OMPT_STORE_RETURN_ADDRESS(gtid);
2974
#endif
2975
return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2976
#if OMPT_SUPPORT && OMPT_OPTIONAL
2977
,
2978
OMPT_LOAD_RETURN_ADDRESS(gtid)
2979
#endif
2980
);
2981
}
2982
2983
/*!
2984
@param loc Source code location
2985
@param gtid Global thread id
2986
2987
Mark the end of a dynamic loop.
2988
*/
2989
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2990
__kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2991
}
2992
2993
/*!
2994
See @ref __kmpc_dispatch_fini_4
2995
*/
2996
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2997
__kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2998
}
2999
3000
/*!
3001
See @ref __kmpc_dispatch_fini_4
3002
*/
3003
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
3004
__kmp_dispatch_finish<kmp_uint32>(gtid, loc);
3005
}
3006
3007
/*!
3008
See @ref __kmpc_dispatch_fini_4
3009
*/
3010
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
3011
__kmp_dispatch_finish<kmp_uint64>(gtid, loc);
3012
}
3013
3014
/*!
3015
Tear down per-loop dispatch state if any is needed; the current implementation
requires no action, so this is a no-op.
*/
3017
void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid) {}
3018
/*! @} */
3019
3020
//-----------------------------------------------------------------------------
3021
// Non-template routines from kmp_dispatch.cpp used in other sources
3022
3023
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
3042
3043
kmp_uint32
3044
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
3045
kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
3046
void *obj // Higher-level synchronization object, or NULL.
3047
) {
3048
// note: we may not belong to a team at this point
3049
volatile kmp_uint32 *spin = spinner;
3050
kmp_uint32 check = checker;
3051
kmp_uint32 spins;
3052
kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
3053
kmp_uint32 r;
3054
kmp_uint64 time;
3055
3056
KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
3057
KMP_INIT_YIELD(spins);
3058
KMP_INIT_BACKOFF(time);
3059
// main wait spin loop
3060
while (!f(r = TCR_4(*spin), check)) {
3061
KMP_FSYNC_SPIN_PREPARE(obj);
3062
/* GEH - remove this since it was accidentally introduced when kmp_wait was
3063
split. It causes problems with infinite recursion because of exit lock */
3064
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
3065
__kmp_abort_thread(); */
3066
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3067
}
3068
KMP_FSYNC_SPIN_ACQUIRED(obj);
3069
return r;
3070
}
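/* Usage sketch (illustrative, variable names borrowed from the dispatch code
   above): spin until a shared 32-bit buffer index reaches the expected value,
   using one of the predicates defined earlier in this file:

     kmp_uint32 seen =
         __kmp_wait_4(&sh->buffer_index, my_buffer_index, __kmp_eq_4, NULL);
*/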
3071
3072
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
3073
kmp_uint32 (*pred)(void *, kmp_uint32),
3074
void *obj // Higher-level synchronization object, or NULL.
3075
) {
3076
// note: we may not belong to a team at this point
3077
void *spin = spinner;
3078
kmp_uint32 check = checker;
3079
kmp_uint32 spins;
3080
kmp_uint32 (*f)(void *, kmp_uint32) = pred;
3081
kmp_uint64 time;
3082
3083
KMP_FSYNC_SPIN_INIT(obj, spin);
3084
KMP_INIT_YIELD(spins);
3085
KMP_INIT_BACKOFF(time);
3086
// main wait spin loop
3087
while (!f(spin, check)) {
3088
KMP_FSYNC_SPIN_PREPARE(obj);
3089
/* if we have waited a bit, or are oversubscribed, yield */
/* pause is in the following code */
3091
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3092
}
3093
KMP_FSYNC_SPIN_ACQUIRED(obj);
3094
}
3095
3096
} // extern "C"
3097
3098
#ifdef KMP_GOMP_COMPAT
3099
3100
void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
3101
enum sched_type schedule, kmp_int32 lb,
3102
kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
3103
int push_ws) {
3104
__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
3105
push_ws);
3106
}
3107
3108
void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
3109
enum sched_type schedule, kmp_uint32 lb,
3110
kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
3111
int push_ws) {
3112
__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
3113
push_ws);
3114
}
3115
3116
void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
3117
enum sched_type schedule, kmp_int64 lb,
3118
kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
3119
int push_ws) {
3120
__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
3121
push_ws);
3122
}
3123
3124
void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
3125
enum sched_type schedule, kmp_uint64 lb,
3126
kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
3127
int push_ws) {
3128
__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
3129
push_ws);
3130
}
3131
3132
void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
3133
__kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3134
}
3135
3136
void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
3137
__kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3138
}
3139
3140
void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
3141
__kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3142
}
3143
3144
void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
3145
__kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3146
}
3147
3148
#endif /* KMP_GOMP_COMPAT */
3149
3150
/* ------------------------------------------------------------------------ */
3151
3152