#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying using non-globally-ordered (NGO) stores
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
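
/* Note on the NGO macros above: on KMP_MIC builds, the cache line holding the ICVs
   (and, for ngo_store_go, the trailing b_go flag) is copied with a single 512-bit
   non-globally-ordered store, and ngo_sync() supplies the fence that makes those
   stores visible.  On all other targets the macros degrade to a plain
   copy_icvs()/KMP_MEMCPY, so callers can use them unconditionally. */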

void __kmp_print_structure(void); // Forward declaration
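
// ---------------------------- Barrier Algorithms ----------------------------

/* Linear barrier: each worker posts its arrival on its own b_arrived flag and the
   master (tid 0) sweeps over all workers in turn, so the gather costs O(nproc) flag
   waits on the master.  The release is the mirror image: the master bumps each
   worker's b_go flag.  This is the baseline the other algorithms fall back to. */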

static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to master thread.  After performing this write, a worker thread may not
           assume that the team is valid any more - it could be deallocated by the master thread
           at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) {
            ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
            for (i=1; i<nproc; ++i) {
                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                               &team->t.t_implicit_task_taskdata[0].td_icvs);
            }
            ngo_sync();
        }
#endif // KMP_BARRIER_ICV_PUSH

        // Now, release all of the worker threads
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go flag
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_go,
                          other_threads[i]->th.th_bar[bt].bb.b_go,
                          other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
            // Release worker from barrier
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
            flag.release();
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;
        // The worker thread may now assume that the team is valid.
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
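
/* Tree barrier: threads are arranged in a tree of fan-in branch_factor (a power of two,
   1 << branch_bits).  Thread tid's children are tids (tid << branch_bits) + 1 through
   (tid << branch_bits) + branch_factor, so each parent waits on at most branch_factor
   b_arrived flags and then reports to its own parent at (tid - 1) >> branch_bits. */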

static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent thread.  After performing this write, a worker thread may not
           assume that the team is valid any more - it could be deallocated by the master thread
           at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
#if KMP_BARRIER_ICV_PUSH
            if (propagate_icvs) {
                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                         team, child_tid, FALSE);
                copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                          &team->t.t_implicit_task_taskdata[0].td_icvs);
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
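
/* Hypercube-embedded tree gather: at level L, a thread whose low bits
   ((tid >> L) & (branch_factor - 1)) are nonzero reports to the parent that owns its
   aligned group, tid & ~((1 << (L + branch_bits)) - 1), and drops out of the loop;
   threads with zero low bits stay on to collect children at stride 1 << L. */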

static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go.  */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            /* Mark arrival to parent thread.  After performing this write, a worker thread may
               not assume that the team is valid any more - it could be deallocated by the master
               thread at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
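
/* Hyper barrier release: if KMP_REVERSE_HYPER_BAR is defined (the default), threads are
   released in the reverse of the order in which they were gathered; otherwise they are
   released in the same order. */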

static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    // Perform a hypercube-embedded tree release for all of the threads that have been gathered
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads)
                continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final destination
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
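
/* Hierarchical Barrier

   The hierarchical barrier arranges the team to match the machine hierarchy (see
   __kmp_get_hierarchy).  Each parent owns a set of "leaf" children that check in by
   writing individual bytes of the parent's 64-bit b_arrived word (at most eight per
   parent, given the byte indexing below), so with infinite blocktime a whole on-core
   group can be gathered with a single 64-bit flag wait. */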

/* Initialize (or re-initialize) the hierarchical barrier data stored on a thread.  Performs
   the minimum amount of initialization required based on how the team has changed.  Returns
   true when the thread's cached team information had to be replaced (the release path uses
   this as "team_change" to disable on-core wake-up of stale leaves). */
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1;             // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d = 0;
            while (d < thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) {
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids = 0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i)
            ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}
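
/* Byte layout example: a parent at tid 0 with leaf kids at tids 1..4 gets leaf_state
   bytes 7, 6, 5, 4 set to 1; leaf kid tid k computes offset = 7-(k-0-1), i.e. the same
   byte, so a one-byte store into the parent's b_arrived (or b_go) word serves as that
   leaf's check-in (or release) slot. */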

static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce)(void *, void *)
                                  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid)
                    ? thr_bar->b_arrived | thr_bar->leaf_state
                    : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data,
                                  other_threads[child_tid]->th.th_local.reduce_data);
                    }
                }
                // clear leaf_state bits
                (void)KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state));
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
        else { // Blocktime is not infinite; wait on each child's b_arrived flag at every level
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived,
                      team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
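
/* Hierarchical release: with infinite blocktime and the on-core barrier active, a leaf
   waits on its own byte of the parent's b_go word (kmp_flag_oncore); otherwise it waits
   on its own b_go flag.  thr_bar->wait_flag records which flag the thread is parked on
   (KMP_BARRIER_OWN_FLAG vs. KMP_BARRIER_PARENT_FLAG), and KMP_BARRIER_SWITCHING marks a
   thread that was switched from the parent's flag to its own while waiting. */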

static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }

    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // Leaves pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // Non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so children can access them
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                      &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                /* Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line.
                   This loop skips the leaf nodes in the hierarchy. */
                ngo_load(&thr_bar->th_fixed_icvs);
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc;
                     child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                                  "go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf kids, some new
                    if (old_leaf_kids) // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing "
                                      "T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                                  "go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
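
// End of Barrier Algorithms

/* Internal function to do a barrier.
   If is_split is true, do a split barrier, otherwise, do a plain barrier.
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier.
   Returns 0 if master thread, 1 if worker thread. */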

int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
              void *reduce_data, void (*reduce)(void *, void *))
{
    register int tid = __kmp_tid_from_gtid(gtid);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team = this_thr->th.th_team;
    register int status = 0;
    ident_t *loc = __kmp_threads[gtid]->th.th_ident;
#if OMPT_SUPPORT
    ompt_task_id_t my_task_id;
    ompt_parallel_id_t my_parallel_id;
#endif

    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT && OMPT_TRACE
    if (ompt_status & ompt_status_track) {
        if (ompt_status == ompt_status_track_callback) {
            my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
            my_parallel_id = team->t.ompt_team_info.parallel_id;

            if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
                if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
                    ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
                        my_parallel_id, my_task_id);
                }
            }
            this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
            if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
                ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
                    my_parallel_id, my_task_id);
            }
        } else {
            this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
        }
    }
#endif

    if (!team->t.t_serialized) {
#if USE_ITT_BUILD
        // This value will be used in itt notify events below.
        void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
        // Create itt barrier object
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
# endif
#endif /* USE_ITT_BUILD */
        if (__kmp_tasking_mode == tskm_extra_barrier) {
            __kmp_tasking_barrier(team, this_thr, gtid);
            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
        }

        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
           the team struct is not guaranteed to exist. */
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
        // Let the debugger know: the thread arrived to the barrier and is waiting.
        if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
            team->t.t_bar[bt].b_master_arrived += 1;
        } else {
            this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
        }
#endif /* USE_DEBUGGER */
        if (reduce != NULL) {
            this_thr->th.th_local.reduce_data = reduce_data;
        }
        switch (__kmp_barrier_gather_pattern[bt]) {
        case bp_hyper_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        case bp_hierarchical_bar: {
            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
                                              USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_tree_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        default: {
            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
        }

        KMP_MB();

        if (KMP_MASTER_TID(tid)) {
            status = 0;
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
                __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only set up the current team
            }
#if USE_DEBUGGER
            // Let the debugger know: All threads are arrived and starting leaving the barrier.
            team->t.t_bar[bt].b_team_arrived += 1;
#endif

#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier - report frame end (only if active_level == 1)
            if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
                this_thr->th.th_teams_microtask == NULL &&
#endif
                team->t.t_active_level == 1)
            {
                kmp_uint64 cur_time = __itt_get_timestamp();
                kmp_info_t **other_threads = team->t.t_threads;
                int nproc = this_thr->th.th_team_nproc;
                int i;
                switch (__kmp_forkjoin_frames_mode) {
                case 1:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                case 2:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                    break;
                case 3:
                    if (__itt_metadata_add_ptr) {
                        // Initialize with master's wait time
                        kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                        for (i=1; i<nproc; ++i) {
                            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                        }
                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time,
                                                     delta, (kmp_uint64)(reduce != NULL));
                    }
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                }
            }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        } else {
            status = 1;
#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
        }
        if (status == 1 || !is_split) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }

#if USE_ITT_BUILD
        /* GEH: TODO: Move this under if-condition above and also include in
           __kmp_end_split_barrier(). This will more accurately represent the actual release time
           of the threads for split barriers.  */
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    } else { // Team is serialized.
        status = 0;
        if (__kmp_tasking_mode != tskm_immediate_exec) {
#if OMP_41_ENABLED
            if (this_thr->th.th_task_team != NULL) {
                void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
                    itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
                    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
                }
#endif
                // Proxy tasks may still be outstanding even for a serialized team
                kmp_task_team_t *task_team = this_thr->th.th_task_team;
                KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj));
                __kmp_task_team_setup(this_thr, team, 0, 0);

#if USE_ITT_BUILD
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
            }
#else
            // The task team should be NULL for serialized code (tasks will be executed immediately)
            KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
#endif
        }
    }
    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
    if (ompt_status & ompt_status_track) {
#if OMPT_TRACE
        if ((ompt_status == ompt_status_track_callback) &&
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                my_parallel_id, my_task_id);
        }
#endif
        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    }
#endif

    return status;
}
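
/* Releases the workers from a split barrier previously gathered by __kmp_barrier() with
   is_split == TRUE.  Only the team master performs the release; the workers already
   returned from the gather phase. */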

void
__kmp_end_split_barrier(enum barrier_type bt, int gtid)
{
    int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = this_thr->th.th_team;

    if (!team->t.t_serialized) {
        if (KMP_MASTER_GTID(gtid)) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(NULL));
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(NULL) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }
    }
}
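
/* Join barrier: gather-only barrier executed at the end of a parallel region.  There is no
   release phase here; workers are released from the fork barrier of the next parallel region
   (or reaped), so after the gather the team structure may be freed by the master at any time. */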

void
__kmp_join_barrier(int gtid)
{
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team;
    register kmp_uint nproc;
    kmp_info_t *master_thread;
    int tid;
#ifdef KMP_DEBUG
    int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
        // Get object created at fork_barrier
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
# endif
#endif /* USE_ITT_BUILD */
    KMP_MB();

    // Get current info
    team = this_thr->th.th_team;
    nproc = this_thr->th.th_team_nproc;
    KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
    tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
    team_id = team->t.t_id;
#endif /* KMP_DEBUG */
    master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
    if (master_thread != team->t.t_threads[0]) {
        __kmp_print_structure();
    }
#endif /* KMP_DEBUG */
    KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
    KMP_MB();

    // Verify state
    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
    KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));

#if OMPT_SUPPORT && OMPT_TRACE
    if ((ompt_status == ompt_status_track_callback) &&
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
            team->t.ompt_team_info.parallel_id,
            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
    }
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
#endif

    if (__kmp_tasking_mode == tskm_extra_barrier) {
        __kmp_tasking_barrier(team, this_thr, gtid);
        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
    }
#ifdef KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
        KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
                      __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
                      this_thr->th.th_task_team));
        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
    }
#endif /* KMP_DEBUG */

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
       team struct is not guaranteed to exist.  Doing these loads causes a cache miss which slows
       down EPCC parallel by 2x.  As a workaround, we do not perform the copy if blocktime=infinite,
       since the values are not used by __kmp_wait_template() in that case. */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
        this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                          USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    /* From this point on, the team data structure may be deallocated at any time by the
       master thread - it is unsafe to reference it in any of the worker threads.  Any per-team
       data items that need to be referenced before the end of the barrier should be moved to
       the kmp_task_team_t structs.  */
    if (KMP_MASTER_TID(tid)) {
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_wait(this_thr, team
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

# if USE_ITT_BUILD && USE_ITT_NOTIFY
        // Join barrier - report frame end
        if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
            this_thr->th.th_teams_microtask == NULL &&
#endif
            team->t.t_active_level == 1)
        {
            kmp_uint64 cur_time = __itt_get_timestamp();
            ident_t *loc = team->t.t_ident;
            kmp_info_t **other_threads = team->t.t_threads;
            int nproc = this_thr->th.th_team_nproc;
            int i;
            switch (__kmp_forkjoin_frames_mode) {
            case 1:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                break;
            case 2:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                break;
            case 3:
                if (__itt_metadata_add_ptr) {
                    // Initialize with master's wait time
                    kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                    for (i=1; i<nproc; ++i) {
                        delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                    }
                    __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
                }
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                this_thr->th.th_frame_time = cur_time;
                break;
            }
        }
# endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    }
#if USE_ITT_BUILD
    else {
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
    if (KMP_MASTER_TID(tid)) {
        KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
                      gtid, team_id, tid, nproc));
    }
#endif /* KMP_DEBUG */

    KMP_MB(); // Flush all pending memory write invalidates.
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

#if OMPT_SUPPORT
    if (ompt_status & ompt_status_track) {
#if OMPT_TRACE
        if ((ompt_status == ompt_status_track_callback) &&
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                team->t.ompt_team_info.parallel_id,
                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
        }
#endif
        // Return to default state
        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif
}
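
/* Fork barrier: the release half that launches a new parallel region.  Workers park here
   between regions; the master verifies worker state, sets up the task teams, and then runs
   the configured release pattern with propagate_icvs == TRUE so ICVs reach the workers. */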

void
__kmp_fork_barrier(int gtid, int tid)
{
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ? team->t.t_id : -1, tid));

    // th_team pointer is only valid for the master thread here
    if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            // Create itt barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
            __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
        register kmp_info_t **other_threads = team->t.t_threads;
        register int i;

        // Verify state
        KMP_MB();

        for (i=1; i<team->t.t_nproc; ++i) {
            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
                              & ~(KMP_BARRIER_SLEEP_STATE))
                             == KMP_INIT_BARRIER_STATE);
            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
        }
#endif

        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates set up both task teams if nthreads > 1
        }

        /* The master thread may have changed its blocktime between the join barrier and the
           fork barrier.  Copy the blocktime info to the thread, where __kmp_wait_template() can
           access it when the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }
    } // master

    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    // Early exit for reaping threads releasing forkjoin barrier
    if (TCR_4(__kmp_global.g.g_done)) {
        if (this_thr->th.th_task_team != NULL) {
            if (KMP_MASTER_TID(tid)) {
                TCW_PTR(this_thr->th.th_task_team, NULL);
            }
            else {
                __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
            }
        }

#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            if (!KMP_MASTER_TID(tid)) {
                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
                if (itt_sync_obj)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
            }
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
        return;
    }

    /* We can now assume that a valid team structure has been allocated by the master and
       propagated to all worker threads.  The current thread, however, may not be part of the
       team, so we can't blindly assume that the team pointer is non-null.  */
    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team.  __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called.  We cannot modify __kmp_fork_call() to look at
       the fixed ICVs in the master's thread struct, because it is not always the case that the
       threads arrays have been allocated when __kmp_fork_call() is executed. */
    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
        // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
    }
#endif // KMP_BARRIER_ICV_PULL

    if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
        // Call dynamic affinity settings
        if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
            __kmp_balanced_affinity(tid, team->t.t_nproc);
        }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    }
    else if (proc_bind != proc_bind_false) {
        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
        }
        else {
            __kmp_affinity_set_place(gtid);
        }
    }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (!KMP_MASTER_TID(tid)) {
            // Get correct barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
        } // (prepare called inside barrier_release)
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
}

void
__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc)
{
    KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team.  __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
    /* Copy ICVs into the master thread's th_fixed_icvs (which remains untouched), where all of
       the worker threads can access them and make their own copies after the barrier. */
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated.
    copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
    // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
    KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#else
    // Copy the ICVs to each of the non-master threads.  This takes O(nthreads) time.
    ngo_load(new_icvs);
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated.
    for (int f=1; f<new_nproc; ++f) { // Skip the master thread
        // TODO: GEH - pass in better source location info since usually NULL here
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
        ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
    }
    ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}