Intel® OpenMP* Runtime Library
kmp_csupport.c
1 /*
2  * kmp_csupport.c -- kfront linkage support for OpenMP.
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 #include "omp.h" /* extern "C" declarations of user-visible routines */
36 #include "kmp.h"
37 #include "kmp_i18n.h"
38 #include "kmp_itt.h"
39 #include "kmp_error.h"
40 #include "kmp_stats.h"
41 
42 #if OMPT_SUPPORT
43 #include "ompt-internal.h"
44 #include "ompt-specific.h"
45 #endif
46 
47 #define MAX_MESSAGE 512
48 
49 /* ------------------------------------------------------------------------ */
50 /* ------------------------------------------------------------------------ */
51 
52 /* flags will be used in future, e.g., to implement */
53 /* openmp_strict library restrictions */
54 
64 void
65 __kmpc_begin(ident_t *loc, kmp_int32 flags)
66 {
67  // By default __kmp_ignore_mppbeg() returns TRUE.
68  if (__kmp_ignore_mppbeg() == FALSE) {
69  __kmp_internal_begin();
70 
71  KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
72  }
73 }
74 
82 void
83 __kmpc_end(ident_t *loc)
84 {
85  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end() call no-op.
86  // However, this can be overridden with KMP_IGNORE_MPPEND environment variable.
87  // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
88  // will unregister this root (it can cause library shut down).
89  if (__kmp_ignore_mppend() == FALSE) {
90  KC_TRACE( 10, ("__kmpc_end: called\n" ) );
91  KA_TRACE( 30, ("__kmpc_end\n" ));
92 
93  __kmp_internal_end_thread( -1 );
94  }
95 }
96 
116 kmp_int32
117 __kmpc_global_thread_num(ident_t *loc)
118 {
119  kmp_int32 gtid = __kmp_entry_gtid();
120 
121  KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
122 
123  return gtid;
124 }
125 
139 kmp_int32
140 __kmpc_global_num_threads(ident_t *loc)
141 {
142  KC_TRACE( 10, ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_nth ) );
143 
144  return TCR_4(__kmp_nth);
145 }
146 
153 kmp_int32
154 __kmpc_bound_thread_num(ident_t *loc)
155 {
156  KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
157  return __kmp_tid_from_gtid( __kmp_entry_gtid() );
158 }
159 
165 kmp_int32
166 __kmpc_bound_num_threads(ident_t *loc)
167 {
168  KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
169 
170  return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
171 }
172 
179 kmp_int32
180 __kmpc_ok_to_fork(ident_t *loc)
181 {
182 #ifndef KMP_DEBUG
183 
184  return TRUE;
185 
186 #else
187 
188  const char *semi2;
189  const char *semi3;
190  int line_no;
191 
192  if (__kmp_par_range == 0) {
193  return TRUE;
194  }
195  semi2 = loc->psource;
196  if (semi2 == NULL) {
197  return TRUE;
198  }
199  semi2 = strchr(semi2, ';');
200  if (semi2 == NULL) {
201  return TRUE;
202  }
203  semi2 = strchr(semi2 + 1, ';');
204  if (semi2 == NULL) {
205  return TRUE;
206  }
207  if (__kmp_par_range_filename[0]) {
208  const char *name = semi2 - 1;
209  while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
210  name--;
211  }
212  if ((*name == '/') || (*name == ';')) {
213  name++;
214  }
215  if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
216  return __kmp_par_range < 0;
217  }
218  }
219  semi3 = strchr(semi2 + 1, ';');
220  if (__kmp_par_range_routine[0]) {
221  if ((semi3 != NULL) && (semi3 > semi2)
222  && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
223  return __kmp_par_range < 0;
224  }
225  }
226  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
227  if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
228  return __kmp_par_range > 0;
229  }
230  return __kmp_par_range < 0;
231  }
232  return TRUE;
233 
234 #endif /* KMP_DEBUG */
235 
236 }
237 
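The filter above assumes the ident_t psource string is laid out as ";file;routine;line;...". Below is a simplified standalone sketch of the same field extraction; the sample string is an illustrative assumption, and unlike the code above it keeps the whole file field rather than stripping the directory prefix.

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *psource = ";demo.c;compute_kernel;42;0;;";       /* assumed layout */
    const char *semi2 = strchr(strchr(psource, ';') + 1, ';');   /* second ';' ends the file name    */
    const char *semi3 = strchr(semi2 + 1, ';');                  /* third ';' ends the routine name  */
    int line_no = 0;

    printf("file    : %.*s\n", (int)(semi2 - psource - 1), psource + 1);
    printf("routine : %.*s\n", (int)(semi3 - semi2 - 1), semi2 + 1);
    if (sscanf(semi3 + 1, "%d", &line_no) == 1)
        printf("line    : %d\n", line_no);
    return 0;
}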
243 kmp_int32
244 __kmpc_in_parallel(ident_t *loc)
245 {
246  return __kmp_entry_thread() -> th.th_root -> r.r_active;
247 }
248 
258 void
259 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
260 {
261  KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
262  global_tid, num_threads ) );
263 
264  __kmp_push_num_threads( loc, global_tid, num_threads );
265 }
266 
267 void
268 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
269 {
270  KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
271 
272  /* the num_threads are automatically popped */
273 }
274 
275 
276 #if OMP_40_ENABLED
277 
278 void
279 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
280 {
281  KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
282  global_tid, proc_bind ) );
283 
284  __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
285 }
286 
287 #endif /* OMP_40_ENABLED */
288 
289 
299 void
300 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
301 {
302  KMP_STOP_EXPLICIT_TIMER(OMP_serial);
303  KMP_COUNT_BLOCK(OMP_PARALLEL);
304  int gtid = __kmp_entry_gtid();
305  // maybe saving thr_state is enough here
306  {
307  va_list ap;
308  va_start( ap, microtask );
309 
310 #if OMPT_SUPPORT
311  kmp_info_t *master_th = __kmp_threads[ gtid ];
312  kmp_team_t *parent_team = master_th->th.th_team;
313  int tid = __kmp_tid_from_gtid( gtid );
314  parent_team->t.t_implicit_task_taskdata[tid].
315  ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(0);
316 #endif
317 
318 #if INCLUDE_SSC_MARKS
319  SSC_MARK_FORKING();
320 #endif
321  __kmp_fork_call( loc, gtid, fork_context_intel,
322  argc,
323 #if OMPT_SUPPORT
324  VOLATILE_CAST(void *) microtask, // "unwrapped" task
325 #endif
326  VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
327  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
328 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
329 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
330  &ap
331 #else
332  ap
333 #endif
334  );
335 #if INCLUDE_SSC_MARKS
336  SSC_MARK_JOINING();
337 #endif
338  __kmp_join_call( loc, gtid );
339 
340  va_end( ap );
341 
342 #if OMPT_SUPPORT
343  if (ompt_status & ompt_status_track) {
344  parent_team->t.t_implicit_task_taskdata[tid].
345  ompt_task_info.frame.reenter_runtime_frame = 0;
346  }
347 #endif
348  }
349  KMP_START_EXPLICIT_TIMER(OMP_serial);
350 }
351 
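For orientation, here is a hedged sketch of the code a compiler might emit for "#pragma omp parallel shared(x)" in terms of this entry point. The outlined function, the demo_loc descriptor, and the ident_t initializer layout are illustrative assumptions based on kmp.h; only the __kmpc_fork_call() call itself comes from this file.

#include <stdio.h>
#include "kmp.h"   /* ident_t, kmp_int32, kmpc_micro, __kmpc_fork_call() */

/* Hypothetical compiler-generated source-location descriptor (field layout assumed from kmp.h). */
static ident_t demo_loc = { 0, KMP_IDENT_KMPC, 0, 0, ";demo.c;demo;10;0;;" };

/* Hypothetical outlined body: global tid, bound tid, then one pointer per shared variable. */
static void demo_microtask(kmp_int32 *gtid, kmp_int32 *btid, int *x)
{
    (void)btid;
    printf("thread %d sees x=%d\n", (int)*gtid, *x);
}

void demo_parallel(void)
{
    int x = 42;
    /* #pragma omp parallel shared(x) becomes, roughly: */
    __kmpc_fork_call(&demo_loc, 1, (kmpc_micro)demo_microtask, &x);
}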
352 #if OMP_40_ENABLED
353 
364 void
365 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
366 {
367  KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
368  global_tid, num_teams, num_threads ) );
369 
370  __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
371 }
372 
382 void
383 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
384 {
385  int gtid = __kmp_entry_gtid();
386  kmp_info_t *this_thr = __kmp_threads[ gtid ];
387  va_list ap;
388  va_start( ap, microtask );
389 
390  // remember teams entry point and nesting level
391  this_thr->th.th_teams_microtask = microtask;
392  this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
393 
394  // check if __kmpc_push_num_teams called, set default number of teams otherwise
395  if ( this_thr->th.th_teams_size.nteams == 0 ) {
396  __kmp_push_num_teams( loc, gtid, 0, 0 );
397  }
398  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
399  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
400  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
401 
402  __kmp_fork_call( loc, gtid, fork_context_intel,
403  argc,
404 #if OMPT_SUPPORT
405  VOLATILE_CAST(void *) microtask, // "unwrapped" task
406 #endif
407  VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
408  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
409 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
410  &ap
411 #else
412  ap
413 #endif
414  );
415  __kmp_join_call( loc, gtid );
416  this_thr->th.th_teams_microtask = NULL;
417  this_thr->th.th_teams_level = 0;
418  *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
419  va_end( ap );
420 }
421 #endif /* OMP_40_ENABLED */
422 
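In the same spirit, a sketch of how "#pragma omp teams num_teams(4) thread_limit(8)" might be mapped onto the two entry points above; demo_loc and the outlined function are the same kind of illustrative assumption as in the earlier parallel sketch.

/* Hypothetical outlined body of the teams region (one initial thread per team runs it). */
static void demo_teams_body(kmp_int32 *gtid, kmp_int32 *btid)
{
    (void)gtid; (void)btid;
    /* ... league-level work ... */
}

void demo_teams(void)
{
    kmp_int32 gtid = __kmpc_global_thread_num(&demo_loc);
    __kmpc_push_num_teams(&demo_loc, gtid, 4, 8);   /* num_teams(4) thread_limit(8) */
    __kmpc_fork_teams(&demo_loc, 0, (kmpc_micro)demo_teams_body);
}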
423 
424 //
425 // I don't think this function should ever have been exported.
426 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
427 // openmp code ever called it, but it's been exported from the RTL for so
428 // long that I'm afraid to remove the definition.
429 //
430 int
431 __kmpc_invoke_task_func( int gtid )
432 {
433  return __kmp_invoke_task_func( gtid );
434 }
435 
448 void
449 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
450 {
451  __kmp_serialized_parallel(loc, global_tid); /* The implementation is now in kmp_runtime.c so that it can share static functions with
452  * kmp_fork_call since the tasks to be done are similar in each case.
453  */
454 }
455 
463 void
464 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
465 {
466  kmp_internal_control_t *top;
467  kmp_info_t *this_thr;
468  kmp_team_t *serial_team;
469 
470  KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
471 
472  /* skip all this code for autopar serialized loops since it results in
473  unacceptable overhead */
474  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
475  return;
476 
477  // Not autopar code
478  if( ! TCR_4( __kmp_init_parallel ) )
479  __kmp_parallel_initialize();
480 
481  this_thr = __kmp_threads[ global_tid ];
482  serial_team = this_thr->th.th_serial_team;
483 
484  #if OMP_41_ENABLED
485  kmp_task_team_t * task_team = this_thr->th.th_task_team;
486 
487  // we need to wait for the proxy tasks before finishing the thread
488  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
489  __kmp_task_team_wait(this_thr, serial_team, NULL ); // is an ITT object needed here?
490  #endif
491 
492  KMP_MB();
493  KMP_DEBUG_ASSERT( serial_team );
494  KMP_ASSERT( serial_team -> t.t_serialized );
495  KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
496  KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
497  KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
498  KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
499 
500  /* If necessary, pop the internal control stack values and replace the team values */
501  top = serial_team -> t.t_control_stack_top;
502  if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
503  copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
504  serial_team -> t.t_control_stack_top = top -> next;
505  __kmp_free(top);
506  }
507 
508  //if( serial_team -> t.t_serialized > 1 )
509  serial_team -> t.t_level--;
510 
511  /* pop dispatch buffers stack */
512  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
513  {
514  dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
515  serial_team->t.t_dispatch->th_disp_buffer =
516  serial_team->t.t_dispatch->th_disp_buffer->next;
517  __kmp_free( disp_buffer );
518  }
519 
520  -- serial_team -> t.t_serialized;
521  if ( serial_team -> t.t_serialized == 0 ) {
522 
523  /* return to the parallel section */
524 
525 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
526  if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
527  __kmp_clear_x87_fpu_status_word();
528  __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
529  __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
530  }
531 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
532 
533  this_thr -> th.th_team = serial_team -> t.t_parent;
534  this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
535 
536  /* restore values cached in the thread */
537  this_thr -> th.th_team_nproc = serial_team -> t.t_parent -> t.t_nproc; /* JPH */
538  this_thr -> th.th_team_master = serial_team -> t.t_parent -> t.t_threads[0]; /* JPH */
539  this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
540 
541  /* TODO the below shouldn't need to be adjusted for serialized teams */
542  this_thr -> th.th_dispatch = & this_thr -> th.th_team ->
543  t.t_dispatch[ serial_team -> t.t_master_tid ];
544 
545  __kmp_pop_current_task_from_thread( this_thr );
546 
547  KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
548  this_thr -> th.th_current_task -> td_flags.executing = 1;
549 
550  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
551  // Copy the task team from the new child / old parent team to the thread.
552  this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
553  KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
554  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
555  }
556  } else {
557  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
558  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
559  global_tid, serial_team, serial_team -> t.t_serialized ) );
560  }
561  }
562 
563 #if USE_ITT_BUILD
564  kmp_uint64 cur_time = 0;
565 #if USE_ITT_NOTIFY
566  if ( __itt_get_timestamp_ptr ) {
567  cur_time = __itt_get_timestamp();
568  }
569 #endif /* USE_ITT_NOTIFY */
570  if ( this_thr->th.th_team->t.t_level == 0
571 #if OMP_40_ENABLED
572  && this_thr->th.th_teams_microtask == NULL
573 #endif
574  ) {
575  // Report the barrier
576  this_thr->th.th_ident = loc;
577  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
578  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
579  {
580  __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized,
581  cur_time, 0, loc, this_thr->th.th_team_nproc, 0 );
582  if ( __kmp_forkjoin_frames_mode == 3 )
583  // Since the barrier frame for the serialized region is equal to the region, we use the same begin timestamp as for the barrier.
584  __kmp_itt_frame_submit( global_tid, serial_team->t.t_region_time,
585  cur_time, 0, loc, this_thr->th.th_team_nproc, 2 );
586  } else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
587  ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
588  // Mark the end of the "parallel" region for VTune. Only one frame notification scheme is used at the moment.
589  __kmp_itt_region_joined( global_tid, 1 );
590  }
591 #endif /* USE_ITT_BUILD */
592 
593  if ( __kmp_env_consistency_check )
594  __kmp_pop_parallel( global_tid, NULL );
595 }
596 
605 void
606 __kmpc_flush(ident_t *loc)
607 {
608  KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
609 
610  /* need explicit __mf() here since use volatile instead in library */
611  KMP_MB(); /* Flush all pending memory write invalidates. */
612 
613  #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
614  #if KMP_MIC
615  // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
616  // We shouldn't need it, though, since the ABI rules require that
617  // * If the compiler generates NGO stores it also generates the fence
618  // * If users hand-code NGO stores they should insert the fence
619  // therefore no incomplete unordered stores should be visible.
620  #else
621  // C74404
622  // This is to address non-temporal store instructions (sfence needed).
623  // The clflush instruction is also addressed (mfence needed).
624  // Probably the non-temporal load movntdqa instruction should also be addressed.
625  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
626  if ( ! __kmp_cpuinfo.initialized ) {
627  __kmp_query_cpuid( & __kmp_cpuinfo );
628  }; // if
629  if ( ! __kmp_cpuinfo.sse2 ) {
630  // CPU cannot execute SSE2 instructions.
631  } else {
632  #if KMP_COMPILER_ICC || KMP_COMPILER_MSVC
633  _mm_mfence();
634  #else
635  __sync_synchronize();
636  #endif // KMP_COMPILER_ICC
637  }; // if
638  #endif // KMP_MIC
639  #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64)
640  // Nothing to see here move along
641  #elif KMP_ARCH_PPC64
642  // Nothing needed here (we have a real MB above).
643  #if KMP_OS_CNK
644  // The flushing thread needs to yield here; this prevents a
645  // busy-waiting thread from saturating the pipeline. flush is
646  // often used in loops like this:
647  // while (!flag) {
648  // #pragma omp flush(flag)
649  // }
650  // and adding the yield here is good for at least a 10x speedup
651  // when running >2 threads per core (on the NAS LU benchmark).
652  __kmp_yield(TRUE);
653  #endif
654  #else
655  #error Unknown or unsupported architecture
656  #endif
657 
658 }
659 
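A bare "#pragma omp flush" reduces to a single call to this entry point; a minimal sketch, reusing the assumed demo_loc descriptor from the earlier parallel sketch.

void demo_flush(volatile int *flag)
{
    *flag = 1;
    /* #pragma omp flush becomes: */
    __kmpc_flush(&demo_loc);
}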
660 /* -------------------------------------------------------------------------- */
661 
662 /* -------------------------------------------------------------------------- */
663 
671 void
672 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
673 {
674  KMP_COUNT_BLOCK(OMP_BARRIER);
675  KMP_TIME_BLOCK(OMP_barrier);
676  int explicit_barrier_flag;
677  KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
678 
679  if (! TCR_4(__kmp_init_parallel))
680  __kmp_parallel_initialize();
681 
682  if ( __kmp_env_consistency_check ) {
683  if ( loc == 0 ) {
684  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
685  }; // if
686 
687  __kmp_check_barrier( global_tid, ct_barrier, loc );
688  }
689 
690  __kmp_threads[ global_tid ]->th.th_ident = loc;
691  // TODO: explicit barrier_wait_id:
692  // this function is called when 'barrier' directive is present or
693  // implicit barrier at the end of a worksharing construct.
694  // 1) better to add a per-thread barrier counter to a thread data structure
695  // 2) set to 0 when a new team is created
696  // 3) no sync is required
697 
698  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
699 }
700 
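Inside an outlined parallel body, "#pragma omp barrier" likewise becomes one call; a sketch with the demo_loc descriptor assumed earlier.

static void demo_barrier_body(kmp_int32 *gtid, kmp_int32 *btid)
{
    (void)btid;
    /* ... phase one of the work ... */
    __kmpc_barrier(&demo_loc, *gtid);     /* #pragma omp barrier */
    /* ... phase two starts only after every thread has finished phase one ... */
}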
701 /* The BARRIER for a MASTER section is always explicit */
708 kmp_int32
709 __kmpc_master(ident_t *loc, kmp_int32 global_tid)
710 {
711  KMP_COUNT_BLOCK(OMP_MASTER);
712  int status = 0;
713 
714  KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
715 
716  if( ! TCR_4( __kmp_init_parallel ) )
717  __kmp_parallel_initialize();
718 
719  if( KMP_MASTER_GTID( global_tid ))
720  status = 1;
721 
722 #if OMPT_SUPPORT && OMPT_TRACE
723  if (status) {
724  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
725  kmp_team_t *team = this_thr -> th.th_team;
726  if ((ompt_status == ompt_status_track_callback) &&
727  ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
728  int tid = __kmp_tid_from_gtid( global_tid );
729  ompt_callbacks.ompt_callback(ompt_event_master_begin)(
730  team->t.ompt_team_info.parallel_id,
731  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
732  }
733  }
734 #endif
735 
736  if ( __kmp_env_consistency_check ) {
737 #if KMP_USE_DYNAMIC_LOCK
738  if (status)
739  __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 );
740  else
741  __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 );
742 #else
743  if (status)
744  __kmp_push_sync( global_tid, ct_master, loc, NULL );
745  else
746  __kmp_check_sync( global_tid, ct_master, loc, NULL );
747 #endif
748  }
749 
750  return status;
751 }
752 
761 void
762 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
763 {
764  KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
765 
766  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
767 
768 #if OMPT_SUPPORT && OMPT_TRACE
769  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
770  kmp_team_t *team = this_thr -> th.th_team;
771  if ((ompt_status == ompt_status_track_callback) &&
772  ompt_callbacks.ompt_callback(ompt_event_master_end)) {
773  int tid = __kmp_tid_from_gtid( global_tid );
774  ompt_callbacks.ompt_callback(ompt_event_master_end)(
775  team->t.ompt_team_info.parallel_id,
776  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
777  }
778 #endif
779 
780  if ( __kmp_env_consistency_check ) {
781  if( global_tid < 0 )
782  KMP_WARNING( ThreadIdentInvalid );
783 
784  if( KMP_MASTER_GTID( global_tid ))
785  __kmp_pop_sync( global_tid, ct_master, loc );
786  }
787 }
788 
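"#pragma omp master" is lowered as a guarded block around these two calls; a sketch, with demo_loc assumed as before.

static void demo_master_body(kmp_int32 *gtid, kmp_int32 *btid)
{
    (void)btid;
    if (__kmpc_master(&demo_loc, *gtid)) {
        /* ... master-only work ... */
        __kmpc_end_master(&demo_loc, *gtid);
    }
    /* note: the master construct has no implied barrier */
}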
796 void
797 __kmpc_ordered( ident_t * loc, kmp_int32 gtid )
798 {
799  int cid = 0;
800  kmp_info_t *th;
801  KMP_DEBUG_ASSERT( __kmp_init_serial );
802 
803  KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
804 
805  if (! TCR_4(__kmp_init_parallel))
806  __kmp_parallel_initialize();
807 
808 #if USE_ITT_BUILD
809  __kmp_itt_ordered_prep( gtid );
810  // TODO: ordered_wait_id
811 #endif /* USE_ITT_BUILD */
812 
813  th = __kmp_threads[ gtid ];
814 
815 #if OMPT_SUPPORT && OMPT_TRACE
816  if (ompt_status & ompt_status_track) {
817  /* OMPT state update */
818  th->th.ompt_thread_info.wait_id = (uint64_t) loc;
819  th->th.ompt_thread_info.state = ompt_state_wait_ordered;
820 
821  /* OMPT event callback */
822  if ((ompt_status == ompt_status_track_callback) &&
823  ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
824  ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
825  th->th.ompt_thread_info.wait_id);
826  }
827  }
828 #endif
829 
830  if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
831  (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
832  else
833  __kmp_parallel_deo( & gtid, & cid, loc );
834 
835 #if OMPT_SUPPORT && OMPT_TRACE
836  if (ompt_status & ompt_status_track) {
837  /* OMPT state update */
838  th->th.ompt_thread_info.state = ompt_state_work_parallel;
839  th->th.ompt_thread_info.wait_id = 0;
840 
841  /* OMPT event callback */
842  if ((ompt_status == ompt_status_track_callback) &&
843  ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
844  ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
845  th->th.ompt_thread_info.wait_id);
846  }
847  }
848 #endif
849 
850 #if USE_ITT_BUILD
851  __kmp_itt_ordered_start( gtid );
852 #endif /* USE_ITT_BUILD */
853 }
854 
862 void
863 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
864 {
865  int cid = 0;
866  kmp_info_t *th;
867 
868  KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
869 
870 #if USE_ITT_BUILD
871  __kmp_itt_ordered_end( gtid );
872  // TODO: ordered_wait_id
873 #endif /* USE_ITT_BUILD */
874 
875  th = __kmp_threads[ gtid ];
876 
877  if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
878  (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
879  else
880  __kmp_parallel_dxo( & gtid, & cid, loc );
881 
882 #if OMPT_SUPPORT && OMPT_BLAME
883  if ((ompt_status == ompt_status_track_callback) &&
884  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
885  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
886  th->th.ompt_thread_info.wait_id);
887  }
888 #endif
889 }
890 
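Within the outlined body of a loop with an ordered clause, each iteration's ordered block is bracketed by these two calls; the surrounding loop dispatch and scheduling calls live elsewhere in the runtime and are omitted from this sketch, and demo_loc is the assumed descriptor from earlier.

void demo_ordered_iteration(kmp_int32 gtid)
{
    __kmpc_ordered(&demo_loc, gtid);
    /* ... work that must execute in iteration order, e.g. ordered output ... */
    __kmpc_end_ordered(&demo_loc, gtid);
}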
891 #if KMP_USE_DYNAMIC_LOCK
892 
893 static __forceinline kmp_indirect_lock_t *
894 __kmp_get_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_dyna_lockseq_t seq)
895 {
896  // Code from __kmp_get_critical_section_ptr
897  // This function returns an indirect lock object instead of a user lock.
898  kmp_indirect_lock_t **lck, *ret;
899  lck = (kmp_indirect_lock_t **)crit;
900  ret = (kmp_indirect_lock_t *)TCR_PTR(*lck);
901  if (ret == NULL) {
902  void *idx;
903  kmp_indirect_locktag_t tag = DYNA_GET_I_TAG(seq);
904  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
905  ret = ilk;
906  DYNA_I_LOCK_FUNC(ilk, init)(ilk->lock);
907  DYNA_SET_I_LOCK_LOCATION(ilk, loc);
908  DYNA_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
909  KA_TRACE(20, ("__kmp_get_indirect_csptr: initialized indirect lock #%d\n", tag));
910 #if USE_ITT_BUILD
911  __kmp_itt_critical_creating(ilk->lock, loc);
912 #endif
913  int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
914  if (status == 0) {
915 #if USE_ITT_BUILD
916  __kmp_itt_critical_destroyed(ilk->lock);
917 #endif
918  // Postponing destroy, to avoid costly dispatch here.
919  //DYNA_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
920  ret = (kmp_indirect_lock_t *)TCR_PTR(*lck);
921  KMP_DEBUG_ASSERT(ret != NULL);
922  }
923  }
924  return ret;
925 }
926 
927 // Fast-path acquire tas lock
928 #define DYNA_ACQUIRE_TAS_LOCK(lock, gtid) { \
929  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
930  if (l->lk.poll != DYNA_LOCK_FREE(tas) || \
931  ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas))) { \
932  kmp_uint32 spins; \
933  KMP_FSYNC_PREPARE(l); \
934  KMP_INIT_YIELD(spins); \
935  if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
936  KMP_YIELD(TRUE); \
937  } else { \
938  KMP_YIELD_SPIN(spins); \
939  } \
940  while (l->lk.poll != DYNA_LOCK_FREE(tas) || \
941  ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas))) { \
942  if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
943  KMP_YIELD(TRUE); \
944  } else { \
945  KMP_YIELD_SPIN(spins); \
946  } \
947  } \
948  } \
949  KMP_FSYNC_ACQUIRED(l); \
950 }
951 
952 // Fast-path test tas lock
953 #define DYNA_TEST_TAS_LOCK(lock, gtid, rc) { \
954  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
955  rc = l->lk.poll == DYNA_LOCK_FREE(tas) && \
956  KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas)); \
957 }
958 
959 // Fast-path release tas lock
960 #define DYNA_RELEASE_TAS_LOCK(lock, gtid) { \
961  TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, DYNA_LOCK_FREE(tas)); \
962  KMP_MB(); \
963 }
964 
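The two macros above implement a fast-path test-and-set spinlock: acquire spins on a compare-and-swap from the free value to gtid+1, yielding when oversubscribed, and release simply stores the free value back. The same idea expressed in portable C11 atomics, purely as an illustration and not the runtime's own code:

#include <stdatomic.h>
#include <sched.h>

typedef struct { atomic_int poll; } demo_tas_lock_t;   /* 0 == free, otherwise gtid+1 of the owner */

static void demo_tas_acquire(demo_tas_lock_t *l, int gtid)
{
    int expected = 0;
    while (!atomic_compare_exchange_weak_explicit(&l->poll, &expected, gtid + 1,
                                                  memory_order_acquire,
                                                  memory_order_relaxed)) {
        expected = 0;      /* the failed CAS wrote the current owner into 'expected' */
        sched_yield();     /* back off, roughly what KMP_YIELD does when oversubscribed */
    }
}

static void demo_tas_release(demo_tas_lock_t *l)
{
    atomic_store_explicit(&l->poll, 0, memory_order_release);
}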
965 #if DYNA_HAS_FUTEX
966 
967 # include <unistd.h>
968 # include <sys/syscall.h>
969 # ifndef FUTEX_WAIT
970 # define FUTEX_WAIT 0
971 # endif
972 # ifndef FUTEX_WAKE
973 # define FUTEX_WAKE 1
974 # endif
975 
976 // Fast-path acquire futex lock
977 #define DYNA_ACQUIRE_FUTEX_LOCK(lock, gtid) { \
978  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
979  kmp_int32 gtid_code = (gtid+1) << 1; \
980  KMP_MB(); \
981  KMP_FSYNC_PREPARE(ftx); \
982  kmp_int32 poll_val; \
983  while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), DYNA_LOCK_FREE(futex), \
984  DYNA_LOCK_BUSY(gtid_code, futex))) != DYNA_LOCK_FREE(futex)) { \
985  kmp_int32 cond = DYNA_LOCK_STRIP(poll_val) & 1; \
986  if (!cond) { \
987  if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | DYNA_LOCK_BUSY(1, futex))) { \
988  continue; \
989  } \
990  poll_val |= DYNA_LOCK_BUSY(1, futex); \
991  } \
992  kmp_int32 rc; \
993  if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) { \
994  continue; \
995  } \
996  gtid_code |= 1; \
997  } \
998  KMP_FSYNC_ACQUIRED(ftx); \
999 }
1000 
1001 // Fast-path test futex lock
1002 #define DYNA_TEST_FUTEX_LOCK(lock, gtid, rc) { \
1003  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1004  if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), DYNA_LOCK_FREE(futex), DYNA_LOCK_BUSY(gtid+1, futex) << 1)) { \
1005  KMP_FSYNC_ACQUIRED(ftx); \
1006  rc = TRUE; \
1007  } else { \
1008  rc = FALSE; \
1009  } \
1010 }
1011 
1012 // Fast-path release futex lock
1013 #define DYNA_RELEASE_FUTEX_LOCK(lock, gtid) { \
1014  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1015  KMP_MB(); \
1016  KMP_FSYNC_RELEASING(ftx); \
1017  kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), DYNA_LOCK_FREE(futex)); \
1018  if (DYNA_LOCK_STRIP(poll_val) & 1) { \
1019  syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, DYNA_LOCK_BUSY(1, futex), NULL, NULL, 0); \
1020  } \
1021  KMP_MB(); \
1022  KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
1023 }
1024 
1025 #endif // DYNA_HAS_FUTEX
1026 
1027 #else // KMP_USE_DYNAMIC_LOCK
1028 
1029 static kmp_user_lock_p
1030 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
1031 {
1032  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1033 
1034  //
1035  // Because of the double-check, the following load
1036  // doesn't need to be volatile.
1037  //
1038  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1039 
1040  if ( lck == NULL ) {
1041  void * idx;
1042 
1043  // Allocate & initialize the lock.
1044  // Remember allocated locks in table in order to free them in __kmp_cleanup()
1045  lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
1046  __kmp_init_user_lock_with_checks( lck );
1047  __kmp_set_user_lock_location( lck, loc );
1048 #if USE_ITT_BUILD
1049  __kmp_itt_critical_creating( lck );
1050  // __kmp_itt_critical_creating() should be called *before* the first usage of underlying
1051  // lock. It is the only place where we can guarantee it. There is a chance the lock will be
1052  // destroyed with no usage, but that is not a problem, because this is not a real event seen
1053  // by the user but rather setting a name for the object (lock). See more details in kmp_itt.h.
1054 #endif /* USE_ITT_BUILD */
1055 
1056  //
1057  // Use a cmpxchg instruction to slam the start of the critical
1058  // section with the lock pointer. If another thread beat us
1059  // to it, deallocate the lock, and use the lock that the other
1060  // thread allocated.
1061  //
1062  int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
1063 
1064  if ( status == 0 ) {
1065  // Deallocate the lock and reload the value.
1066 #if USE_ITT_BUILD
1067  __kmp_itt_critical_destroyed( lck );
1068  // Let ITT know the lock is destroyed and the same memory location may be reused for
1069  // another purpose.
1070 #endif /* USE_ITT_BUILD */
1071  __kmp_destroy_user_lock_with_checks( lck );
1072  __kmp_user_lock_free( &idx, gtid, lck );
1073  lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1074  KMP_DEBUG_ASSERT( lck != NULL );
1075  }
1076  }
1077  return lck;
1078 }
1079 
1080 #endif // KMP_USE_DYNAMIC_LOCK
1081 
1092 void
1093 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
1094  KMP_COUNT_BLOCK(OMP_CRITICAL);
1095  kmp_user_lock_p lck;
1096 
1097  KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1098 
1099 #if KMP_USE_DYNAMIC_LOCK
1100  // Assumption: all direct locks fit in OMP_CRITICAL_SIZE.
1101  // The global sequence __kmp_user_lock_seq is used unless compiler pushes a value.
1102  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
1103  lck = (kmp_user_lock_p)crit;
1104  // The thread that reaches here first needs to tag the lock word.
1105  if (*((kmp_dyna_lock_t *)lck) == 0) {
1106  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)lck, 0, DYNA_GET_D_TAG(__kmp_user_lock_seq));
1107  }
1108  if (__kmp_env_consistency_check) {
1109  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
1110  }
1111 # if USE_ITT_BUILD
1112  __kmp_itt_critical_acquiring(lck);
1113 # endif
1114 # if DYNA_USE_FAST_TAS
1115  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1116  DYNA_ACQUIRE_TAS_LOCK(lck, global_tid);
1117  } else
1118 # elif DYNA_USE_FAST_FUTEX
1119  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1120  DYNA_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1121  } else
1122 # endif
1123  {
1124  DYNA_D_LOCK_FUNC(lck, set)((kmp_dyna_lock_t *)lck, global_tid);
1125  }
1126  } else {
1127  kmp_indirect_lock_t *ilk = __kmp_get_indirect_csptr(crit, loc, global_tid, __kmp_user_lock_seq);
1128  lck = ilk->lock;
1129  if (__kmp_env_consistency_check) {
1130  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
1131  }
1132 # if USE_ITT_BUILD
1133  __kmp_itt_critical_acquiring(lck);
1134 # endif
1135  DYNA_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1136  }
1137 
1138 #else // KMP_USE_DYNAMIC_LOCK
1139 
1140  //TODO: add THR_OVHD_STATE
1141 
1142  KMP_CHECK_USER_LOCK_INIT();
1143 
1144  if ( ( __kmp_user_lock_kind == lk_tas )
1145  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1146  lck = (kmp_user_lock_p)crit;
1147  }
1148 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1149  else if ( ( __kmp_user_lock_kind == lk_futex )
1150  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1151  lck = (kmp_user_lock_p)crit;
1152  }
1153 #endif
1154  else { // ticket, queuing or drdpa
1155  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
1156  }
1157 
1158  if ( __kmp_env_consistency_check )
1159  __kmp_push_sync( global_tid, ct_critical, loc, lck );
1160 
1161  /* since the critical directive binds to all threads, not just
1162  * the current team we have to check this even if we are in a
1163  * serialized team */
1164  /* also, even if we are the uber thread, we still have to conduct the lock,
1165  * as we have to contend with sibling threads */
1166 
1167 #if USE_ITT_BUILD
1168  __kmp_itt_critical_acquiring( lck );
1169 #endif /* USE_ITT_BUILD */
1170  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
1171  __kmp_acquire_user_lock_with_checks( lck, global_tid );
1172 
1173 #endif // KMP_USE_DYNAMIC_LOCK
1174 
1175 #if USE_ITT_BUILD
1176  __kmp_itt_critical_acquired( lck );
1177 #endif /* USE_ITT_BUILD */
1178 
1179  KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1180 } // __kmpc_critical
1181 
1191 void
1192 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
1193 {
1194  kmp_user_lock_p lck;
1195 
1196  KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
1197 
1198 #if KMP_USE_DYNAMIC_LOCK
1199  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
1200  lck = (kmp_user_lock_p)crit;
1201  KMP_ASSERT(lck != NULL);
1202  if (__kmp_env_consistency_check) {
1203  __kmp_pop_sync(global_tid, ct_critical, loc);
1204  }
1205 # if USE_ITT_BUILD
1206  __kmp_itt_critical_releasing( lck );
1207 # endif
1208 # if DYNA_USE_FAST_TAS
1209  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1210  DYNA_RELEASE_TAS_LOCK(lck, global_tid);
1211  } else
1212 # elif DYNA_USE_FAST_FUTEX
1213  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1214  DYNA_RELEASE_FUTEX_LOCK(lck, global_tid);
1215  } else
1216 # endif
1217  {
1218  DYNA_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1219  }
1220  } else {
1221  kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1222  KMP_ASSERT(ilk != NULL);
1223  lck = ilk->lock;
1224  if (__kmp_env_consistency_check) {
1225  __kmp_pop_sync(global_tid, ct_critical, loc);
1226  }
1227 # if USE_ITT_BUILD
1228  __kmp_itt_critical_releasing( lck );
1229 # endif
1230  DYNA_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1231  }
1232 
1233 #else // KMP_USE_DYNAMIC_LOCK
1234 
1235  if ( ( __kmp_user_lock_kind == lk_tas )
1236  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1237  lck = (kmp_user_lock_p)crit;
1238  }
1239 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1240  else if ( ( __kmp_user_lock_kind == lk_futex )
1241  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1242  lck = (kmp_user_lock_p)crit;
1243  }
1244 #endif
1245  else { // ticket, queuing or drdpa
1246  lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
1247  }
1248 
1249  KMP_ASSERT(lck != NULL);
1250 
1251  if ( __kmp_env_consistency_check )
1252  __kmp_pop_sync( global_tid, ct_critical, loc );
1253 
1254 #if USE_ITT_BUILD
1255  __kmp_itt_critical_releasing( lck );
1256 #endif /* USE_ITT_BUILD */
1257  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
1258  __kmp_release_user_lock_with_checks( lck, global_tid );
1259 
1260 #if OMPT_SUPPORT && OMPT_BLAME
1261  if ((ompt_status == ompt_status_track_callback) &&
1262  ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
1263  ompt_callbacks.ompt_callback(ompt_event_release_critical)(
1264  (uint64_t) lck);
1265  }
1266 #endif
1267 
1268 #endif // KMP_USE_DYNAMIC_LOCK
1269 
1270  KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
1271 }
1272 
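A named critical section is lowered around a compiler-emitted kmp_critical_name word that caches the lock pointer; a sketch using the assumed demo_loc descriptor, with all names illustrative.

/* Hypothetical compiler-emitted cache word for '#pragma omp critical (io)'. */
static kmp_critical_name demo_crit_io;

static void demo_critical_body(kmp_int32 *gtid, kmp_int32 *btid)
{
    (void)btid;
    __kmpc_critical(&demo_loc, *gtid, &demo_crit_io);
    /* ... code that only one thread may execute at a time ... */
    __kmpc_end_critical(&demo_loc, *gtid, &demo_crit_io);
}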
1281 kmp_int32
1282 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
1283 {
1284  int status;
1285 
1286  KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
1287 
1288  if (! TCR_4(__kmp_init_parallel))
1289  __kmp_parallel_initialize();
1290 
1291  if ( __kmp_env_consistency_check )
1292  __kmp_check_barrier( global_tid, ct_barrier, loc );
1293 
1294 #if USE_ITT_NOTIFY
1295  __kmp_threads[global_tid]->th.th_ident = loc;
1296 #endif
1297  status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
1298 
1299  return (status != 0) ? 0 : 1;
1300 }
1301 
1311 void
1312 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
1313 {
1314  KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
1315 
1316  __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
1317 }
1318 
1329 kmp_int32
1330 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
1331 {
1332  kmp_int32 ret;
1333 
1334  KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
1335 
1336  if (! TCR_4(__kmp_init_parallel))
1337  __kmp_parallel_initialize();
1338 
1339  if ( __kmp_env_consistency_check ) {
1340  if ( loc == 0 ) {
1341  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
1342  }
1343  __kmp_check_barrier( global_tid, ct_barrier, loc );
1344  }
1345 
1346 #if USE_ITT_NOTIFY
1347  __kmp_threads[global_tid]->th.th_ident = loc;
1348 #endif
1349  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
1350 
1351  ret = __kmpc_master (loc, global_tid);
1352 
1353  if ( __kmp_env_consistency_check ) {
1354  /* there's no __kmpc_end_master called; so the (stats) */
1355  /* actions of __kmpc_end_master are done here */
1356 
1357  if ( global_tid < 0 ) {
1358  KMP_WARNING( ThreadIdentInvalid );
1359  }
1360  if (ret) {
1361  /* only one thread should do the pop since only */
1362  /* one did the push (see __kmpc_master()) */
1363 
1364  __kmp_pop_sync( global_tid, ct_master, loc );
1365  }
1366  }
1367 
1368  return (ret);
1369 }
1370 
1371 /* The BARRIER for a SINGLE process section is always explicit */
1383 kmp_int32
1384 __kmpc_single(ident_t *loc, kmp_int32 global_tid)
1385 {
1386  KMP_COUNT_BLOCK(OMP_SINGLE);
1387  kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
1388 
1389 #if OMPT_SUPPORT && OMPT_TRACE
1390  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1391  kmp_team_t *team = this_thr -> th.th_team;
1392  int tid = __kmp_tid_from_gtid( global_tid );
1393 
1394  if ((ompt_status == ompt_status_track_callback)) {
1395  if (rc) {
1396  if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
1397  ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
1398  team->t.ompt_team_info.parallel_id,
1399  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
1400  team->t.ompt_team_info.microtask);
1401  }
1402  } else {
1403  if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
1404  ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
1405  team->t.ompt_team_info.parallel_id,
1406  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1407  }
1408  this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
1409  }
1410  }
1411 #endif
1412 
1413  return rc;
1414 }
1415 
1425 void
1426 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
1427 {
1428  __kmp_exit_single( global_tid );
1429 
1430 #if OMPT_SUPPORT && OMPT_TRACE
1431  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1432  kmp_team_t *team = this_thr -> th.th_team;
1433  int tid = __kmp_tid_from_gtid( global_tid );
1434 
1435  if ((ompt_status == ompt_status_track_callback) &&
1436  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
1437  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
1438  team->t.ompt_team_info.parallel_id,
1439  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1440  }
1441 #endif
1442 }
1443 
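"#pragma omp single" is lowered as a guarded block around these calls; the implied barrier at the end of the construct is emitted separately unless nowait is given. A sketch with the assumed demo_loc:

static void demo_single_body(kmp_int32 *gtid, kmp_int32 *btid)
{
    (void)btid;
    if (__kmpc_single(&demo_loc, *gtid)) {
        /* ... executed by exactly one thread of the team ... */
        __kmpc_end_single(&demo_loc, *gtid);
    }
    __kmpc_barrier(&demo_loc, *gtid);   /* implied barrier, omitted if 'nowait' */
}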
1451 void
1452 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
1453 {
1454  KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1455 
1456 #if OMPT_SUPPORT && OMPT_TRACE
1457  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1458  kmp_team_t *team = this_thr -> th.th_team;
1459  int tid = __kmp_tid_from_gtid( global_tid );
1460 
1461  if ((ompt_status == ompt_status_track_callback) &&
1462  ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
1463  ompt_callbacks.ompt_callback(ompt_event_loop_end)(
1464  team->t.ompt_team_info.parallel_id,
1465  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1466  }
1467 #endif
1468 
1469  if ( __kmp_env_consistency_check )
1470  __kmp_pop_workshare( global_tid, ct_pdo, loc );
1471 }
1472 
1473 /*
1474  * User routines which take C-style arguments (call by value)
1475  * different from the Fortran equivalent routines
1476  */
1477 
1478 void
1479 ompc_set_num_threads( int arg )
1480 {
1481 // !!!!! TODO: check the per-task binding
1482  __kmp_set_num_threads( arg, __kmp_entry_gtid() );
1483 }
1484 
1485 void
1486 ompc_set_dynamic( int flag )
1487 {
1488  kmp_info_t *thread;
1489 
1490  /* For the thread-private implementation of the internal controls */
1491  thread = __kmp_entry_thread();
1492 
1493  __kmp_save_internal_controls( thread );
1494 
1495  set__dynamic( thread, flag ? TRUE : FALSE );
1496 }
1497 
1498 void
1499 ompc_set_nested( int flag )
1500 {
1501  kmp_info_t *thread;
1502 
1503  /* For the thread-private internal controls implementation */
1504  thread = __kmp_entry_thread();
1505 
1506  __kmp_save_internal_controls( thread );
1507 
1508  set__nested( thread, flag ? TRUE : FALSE );
1509 }
1510 
1511 void
1512 ompc_set_max_active_levels( int max_active_levels )
1513 {
1514  /* TO DO */
1515  /* we want per-task implementation of this internal control */
1516 
1517  /* For the per-thread internal controls implementation */
1518  __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
1519 }
1520 
1521 void
1522 ompc_set_schedule( omp_sched_t kind, int modifier )
1523 {
1524 // !!!!! TODO: check the per-task binding
1525  __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
1526 }
1527 
1528 int
1529 ompc_get_ancestor_thread_num( int level )
1530 {
1531  return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
1532 }
1533 
1534 int
1535 ompc_get_team_size( int level )
1536 {
1537  return __kmp_get_team_size( __kmp_entry_gtid(), level );
1538 }
1539 
1540 void
1541 kmpc_set_stacksize( int arg )
1542 {
1543  // __kmp_aux_set_stacksize initializes the library if needed
1544  __kmp_aux_set_stacksize( arg );
1545 }
1546 
1547 void
1548 kmpc_set_stacksize_s( size_t arg )
1549 {
1550  // __kmp_aux_set_stacksize initializes the library if needed
1551  __kmp_aux_set_stacksize( arg );
1552 }
1553 
1554 void
1555 kmpc_set_blocktime( int arg )
1556 {
1557  int gtid, tid;
1558  kmp_info_t *thread;
1559 
1560  gtid = __kmp_entry_gtid();
1561  tid = __kmp_tid_from_gtid(gtid);
1562  thread = __kmp_thread_from_gtid(gtid);
1563 
1564  __kmp_aux_set_blocktime( arg, thread, tid );
1565 }
1566 
1567 void
1568 kmpc_set_library( int arg )
1569 {
1570  // __kmp_user_set_library initializes the library if needed
1571  __kmp_user_set_library( (enum library_type)arg );
1572 }
1573 
1574 void
1575 kmpc_set_defaults( char const * str )
1576 {
1577  // __kmp_aux_set_defaults initializes the library if needed
1578  __kmp_aux_set_defaults( str, KMP_STRLEN( str ) );
1579 }
1580 
1581 int
1582 kmpc_set_affinity_mask_proc( int proc, void **mask )
1583 {
1584 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1585  return -1;
1586 #else
1587  if ( ! TCR_4(__kmp_init_middle) ) {
1588  __kmp_middle_initialize();
1589  }
1590  return __kmp_aux_set_affinity_mask_proc( proc, mask );
1591 #endif
1592 }
1593 
1594 int
1595 kmpc_unset_affinity_mask_proc( int proc, void **mask )
1596 {
1597 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1598  return -1;
1599 #else
1600  if ( ! TCR_4(__kmp_init_middle) ) {
1601  __kmp_middle_initialize();
1602  }
1603  return __kmp_aux_unset_affinity_mask_proc( proc, mask );
1604 #endif
1605 }
1606 
1607 int
1608 kmpc_get_affinity_mask_proc( int proc, void **mask )
1609 {
1610 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1611  return -1;
1612 #else
1613  if ( ! TCR_4(__kmp_init_middle) ) {
1614  __kmp_middle_initialize();
1615  }
1616  return __kmp_aux_get_affinity_mask_proc( proc, mask );
1617 #endif
1618 }
1619 
1620 
1621 /* -------------------------------------------------------------------------- */
1662 void
1663 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
1664 {
1665  void **data_ptr;
1666 
1667  KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
1668 
1669  KMP_MB();
1670 
1671  data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
1672 
1673  if ( __kmp_env_consistency_check ) {
1674  if ( loc == 0 ) {
1675  KMP_WARNING( ConstructIdentInvalid );
1676  }
1677  }
1678 
1679  /* ToDo: Optimize the following two barriers into some kind of split barrier */
1680 
1681  if (didit) *data_ptr = cpy_data;
1682 
1683  /* This barrier is not a barrier region boundary */
1684 #if USE_ITT_NOTIFY
1685  __kmp_threads[gtid]->th.th_ident = loc;
1686 #endif
1687  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1688 
1689  if (! didit) (*cpy_func)( cpy_data, *data_ptr );
1690 
1691  /* Consider next barrier the user-visible barrier for barrier region boundaries */
1692  /* Nesting checks are already handled by the single construct checks */
1693 
1694 #if USE_ITT_NOTIFY
1695  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
1696 #endif
1697  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1698 }
1699 
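The copyprivate clause combines single with a broadcast through this routine; the copy helper passed as cpy_func is compiler-generated. A hedged sketch, where demo_loc and the helper are illustrative assumptions:

/* Hypothetical compiler-generated copy helper: dst/src point at two threads' copies. */
static void demo_copy_int(void *dst, void *src)
{
    *(int *)dst = *(int *)src;
}

static void demo_copyprivate_body(kmp_int32 *gtid, kmp_int32 *btid)
{
    int value = 0;
    kmp_int32 didit = __kmpc_single(&demo_loc, *gtid);
    (void)btid;
    if (didit) {
        value = 42;                         /* produced by the thread that won the single */
        __kmpc_end_single(&demo_loc, *gtid);
    }
    /* every thread passes its own copy; the winner's value is broadcast to the others */
    __kmpc_copyprivate(&demo_loc, *gtid, sizeof(int), &value, demo_copy_int, didit);
}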
1700 /* -------------------------------------------------------------------------- */
1701 
1702 #define INIT_LOCK __kmp_init_user_lock_with_checks
1703 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
1704 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
1705 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
1706 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
1707 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
1708 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
1709 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
1710 #define TEST_LOCK __kmp_test_user_lock_with_checks
1711 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
1712 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
1713 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
1714 
1715 /*
1716  * TODO: Make check abort messages use location info & pass it
1717  * into with_checks routines
1718  */
1719 
1720 /* initialize the lock */
1721 void
1722 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1723 #if KMP_USE_DYNAMIC_LOCK
1724  KMP_DEBUG_ASSERT(__kmp_init_serial);
1725  if (__kmp_env_consistency_check && user_lock == NULL) {
1726  KMP_FATAL(LockIsUninitialized, "omp_init_lock");
1727  }
1728  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
1729  DYNA_INIT_D_LOCK(user_lock, __kmp_user_lock_seq);
1730 # if USE_ITT_BUILD
1731  __kmp_itt_lock_creating((kmp_user_lock_p)user_lock, NULL);
1732 # endif
1733  } else {
1734  DYNA_INIT_I_LOCK(user_lock, __kmp_user_lock_seq);
1735  kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(user_lock);
1736  DYNA_SET_I_LOCK_LOCATION(ilk, loc);
1737 # if USE_ITT_BUILD
1738  __kmp_itt_lock_creating(ilk->lock, loc);
1739 # endif
1740  }
1741 
1742 #else // KMP_USE_DYNAMIC_LOCK
1743 
1744  static char const * const func = "omp_init_lock";
1745  kmp_user_lock_p lck;
1746  KMP_DEBUG_ASSERT( __kmp_init_serial );
1747 
1748  if ( __kmp_env_consistency_check ) {
1749  if ( user_lock == NULL ) {
1750  KMP_FATAL( LockIsUninitialized, func );
1751  }
1752  }
1753 
1754  KMP_CHECK_USER_LOCK_INIT();
1755 
1756  if ( ( __kmp_user_lock_kind == lk_tas )
1757  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1758  lck = (kmp_user_lock_p)user_lock;
1759  }
1760 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1761  else if ( ( __kmp_user_lock_kind == lk_futex )
1762  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1763  lck = (kmp_user_lock_p)user_lock;
1764  }
1765 #endif
1766  else {
1767  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1768  }
1769  INIT_LOCK( lck );
1770  __kmp_set_user_lock_location( lck, loc );
1771 
1772 #if USE_ITT_BUILD
1773  __kmp_itt_lock_creating( lck );
1774 #endif /* USE_ITT_BUILD */
1775 
1776 #endif // KMP_USE_DYNAMIC_LOCK
1777 } // __kmpc_init_lock
1778 
1779 /* initialize the lock */
1780 void
1781 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1782 #if KMP_USE_DYNAMIC_LOCK
1783 
1784  KMP_DEBUG_ASSERT(__kmp_init_serial);
1785  if (__kmp_env_consistency_check && user_lock == NULL) {
1786  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
1787  }
1788  // Invoke init function after converting to nested version.
1789  kmp_dyna_lockseq_t nested_seq;
1790  switch (__kmp_user_lock_seq) {
1791  case lockseq_tas: nested_seq = lockseq_nested_tas; break;
1792 #if DYNA_HAS_FUTEX
1793  case lockseq_futex: nested_seq = lockseq_nested_futex; break;
1794 #endif
1795  case lockseq_ticket: nested_seq = lockseq_nested_ticket; break;
1796  case lockseq_queuing: nested_seq = lockseq_nested_queuing; break;
1797  case lockseq_drdpa: nested_seq = lockseq_nested_drdpa; break;
1798  default: nested_seq = lockseq_nested_queuing; break;
1799  // Use nested queuing lock for lock kinds without "nested" implementation.
1800  }
1801  DYNA_INIT_I_LOCK(user_lock, nested_seq);
1802  // All nested locks are indirect locks.
1803  kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(user_lock);
1804  DYNA_SET_I_LOCK_LOCATION(ilk, loc);
1805 # if USE_ITT_BUILD
1806  __kmp_itt_lock_creating(ilk->lock, loc);
1807 # endif
1808 
1809 #else // KMP_USE_DYNAMIC_LOCK
1810 
1811  static char const * const func = "omp_init_nest_lock";
1812  kmp_user_lock_p lck;
1813  KMP_DEBUG_ASSERT( __kmp_init_serial );
1814 
1815  if ( __kmp_env_consistency_check ) {
1816  if ( user_lock == NULL ) {
1817  KMP_FATAL( LockIsUninitialized, func );
1818  }
1819  }
1820 
1821  KMP_CHECK_USER_LOCK_INIT();
1822 
1823  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1824  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1825  lck = (kmp_user_lock_p)user_lock;
1826  }
1827 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1828  else if ( ( __kmp_user_lock_kind == lk_futex )
1829  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1830  <= OMP_NEST_LOCK_T_SIZE ) ) {
1831  lck = (kmp_user_lock_p)user_lock;
1832  }
1833 #endif
1834  else {
1835  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1836  }
1837 
1838  INIT_NESTED_LOCK( lck );
1839  __kmp_set_user_lock_location( lck, loc );
1840 
1841 #if USE_ITT_BUILD
1842  __kmp_itt_lock_creating( lck );
1843 #endif /* USE_ITT_BUILD */
1844 
1845 #endif // KMP_USE_DYNAMIC_LOCK
1846 } // __kmpc_init_nest_lock
1847 
1848 void
1849 __kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1850 #if KMP_USE_DYNAMIC_LOCK
1851 
1852 # if USE_ITT_BUILD
1853  kmp_user_lock_p lck;
1854  if (DYNA_EXTRACT_D_TAG(user_lock) == 0) {
1855  lck = ((kmp_indirect_lock_t *)DYNA_LOOKUP_I_LOCK(user_lock))->lock;
1856  } else {
1857  lck = (kmp_user_lock_p)user_lock;
1858  }
1859  __kmp_itt_lock_destroyed(lck);
1860 # endif
1861  DYNA_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
1862 #else
1863  kmp_user_lock_p lck;
1864 
1865  if ( ( __kmp_user_lock_kind == lk_tas )
1866  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1867  lck = (kmp_user_lock_p)user_lock;
1868  }
1869 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1870  else if ( ( __kmp_user_lock_kind == lk_futex )
1871  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1872  lck = (kmp_user_lock_p)user_lock;
1873  }
1874 #endif
1875  else {
1876  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
1877  }
1878 
1879 #if USE_ITT_BUILD
1880  __kmp_itt_lock_destroyed( lck );
1881 #endif /* USE_ITT_BUILD */
1882  DESTROY_LOCK( lck );
1883 
1884  if ( ( __kmp_user_lock_kind == lk_tas )
1885  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1886  ;
1887  }
1888 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1889  else if ( ( __kmp_user_lock_kind == lk_futex )
1890  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1891  ;
1892  }
1893 #endif
1894  else {
1895  __kmp_user_lock_free( user_lock, gtid, lck );
1896  }
1897 #endif // KMP_USE_DYNAMIC_LOCK
1898 } // __kmpc_destroy_lock
1899 
1900 /* destroy the lock */
1901 void
1902 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1903 #if KMP_USE_DYNAMIC_LOCK
1904 
1905 # if USE_ITT_BUILD
1906  kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(user_lock);
1907  __kmp_itt_lock_destroyed(ilk->lock);
1908 # endif
1909  DYNA_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
1910 
1911 #else // KMP_USE_DYNAMIC_LOCK
1912 
1913  kmp_user_lock_p lck;
1914 
1915  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1916  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1917  lck = (kmp_user_lock_p)user_lock;
1918  }
1919 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1920  else if ( ( __kmp_user_lock_kind == lk_futex )
1921  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1922  <= OMP_NEST_LOCK_T_SIZE ) ) {
1923  lck = (kmp_user_lock_p)user_lock;
1924  }
1925 #endif
1926  else {
1927  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
1928  }
1929 
1930 #if USE_ITT_BUILD
1931  __kmp_itt_lock_destroyed( lck );
1932 #endif /* USE_ITT_BUILD */
1933 
1934  DESTROY_NESTED_LOCK( lck );
1935 
1936  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1937  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1938  ;
1939  }
1940 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1941  else if ( ( __kmp_user_lock_kind == lk_futex )
1942  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1943  <= OMP_NEST_LOCK_T_SIZE ) ) {
1944  ;
1945  }
1946 #endif
1947  else {
1948  __kmp_user_lock_free( user_lock, gtid, lck );
1949  }
1950 #endif // KMP_USE_DYNAMIC_LOCK
1951 } // __kmpc_destroy_nest_lock
1952 
1953 void
1954 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1955  KMP_COUNT_BLOCK(OMP_set_lock);
1956 #if KMP_USE_DYNAMIC_LOCK
1957  int tag = DYNA_EXTRACT_D_TAG(user_lock);
1958 # if USE_ITT_BUILD
1959  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // itt function will get to the right lock object.
1960 # endif
1961 # if DYNA_USE_FAST_TAS
1962  if (tag == locktag_tas && !__kmp_env_consistency_check) {
1963  DYNA_ACQUIRE_TAS_LOCK(user_lock, gtid);
1964  } else
1965 # elif DYNA_USE_FAST_FUTEX
1966  if (tag == locktag_futex && !__kmp_env_consistency_check) {
1967  DYNA_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
1968  } else
1969 # endif
1970  {
1971  __kmp_direct_set_ops[tag]((kmp_dyna_lock_t *)user_lock, gtid);
1972  }
1973 # if USE_ITT_BUILD
1974  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
1975 # endif
1976 
1977 #else // KMP_USE_DYNAMIC_LOCK
1978 
1979  kmp_user_lock_p lck;
1980 
1981  if ( ( __kmp_user_lock_kind == lk_tas )
1982  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1983  lck = (kmp_user_lock_p)user_lock;
1984  }
1985 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1986  else if ( ( __kmp_user_lock_kind == lk_futex )
1987  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1988  lck = (kmp_user_lock_p)user_lock;
1989  }
1990 #endif
1991  else {
1992  lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
1993  }
1994 
1995 #if USE_ITT_BUILD
1996  __kmp_itt_lock_acquiring( lck );
1997 #endif /* USE_ITT_BUILD */
1998 
1999  ACQUIRE_LOCK( lck, gtid );
2000 
2001 #if USE_ITT_BUILD
2002  __kmp_itt_lock_acquired( lck );
2003 #endif /* USE_ITT_BUILD */
2004 
2005 #endif // KMP_USE_DYNAMIC_LOCK
2006 }
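For orientation, the stand-alone sketch below shows the user-level omp_lock_t API whose operations correspond to the simple-lock entry points above; the mapping is suggested by the "omp_set_lock"/"omp_unset_lock"/"omp_destroy_lock" strings used for error reporting, but exactly how a given compiler lowers these calls is an assumption here, not something this listing guarantees.

#include <omp.h>
#include <stdio.h>

int main(void)
{
    omp_lock_t lock;
    int counter = 0;

    omp_init_lock(&lock);

    #pragma omp parallel num_threads(4)
    {
        omp_set_lock(&lock);       /* serializes the update of the shared counter */
        counter++;
        omp_unset_lock(&lock);
    }

    omp_destroy_lock(&lock);
    printf("counter = %d\n", counter);
    return 0;
}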
2007 
2008 void
2009 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2010 #if KMP_USE_DYNAMIC_LOCK
2011 
2012 # if USE_ITT_BUILD
2013  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2014 # endif
2015  DYNA_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2016 # if USE_ITT_BUILD
2017  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2018 #endif
2019 
2020 #else // KMP_USE_DYNAMIC_LOCK
2021  kmp_user_lock_p lck;
2022 
2023  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2024  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2025  lck = (kmp_user_lock_p)user_lock;
2026  }
2027 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2028  else if ( ( __kmp_user_lock_kind == lk_futex )
2029  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2030  <= OMP_NEST_LOCK_T_SIZE ) ) {
2031  lck = (kmp_user_lock_p)user_lock;
2032  }
2033 #endif
2034  else {
2035  lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
2036  }
2037 
2038 #if USE_ITT_BUILD
2039  __kmp_itt_lock_acquiring( lck );
2040 #endif /* USE_ITT_BUILD */
2041 
2042  ACQUIRE_NESTED_LOCK( lck, gtid );
2043 
2044 #if USE_ITT_BUILD
2045  __kmp_itt_lock_acquired( lck );
2046 #endif /* USE_ITT_BUILD */
2047 #endif // KMP_USE_DYNAMIC_LOCK
2048 }
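In the same spirit, a minimal sketch of the user-level nestable lock API behind the *_nest_lock entry points (cf. the "omp_set_nest_lock" lookup string above); recursive re-acquisition by the owning thread is what the depth_locked bookkeeping supports. The lowering is assumed, and the helper function update() is purely illustrative.

#include <omp.h>
#include <stdio.h>

static omp_nest_lock_t nlock;
static int value = 0;

static void update(int delta)
{
    omp_set_nest_lock(&nlock);    /* may be re-acquired recursively below */
    value += delta;
    if (delta > 1)
        update(1);                /* nested acquire by the same thread is legal */
    omp_unset_nest_lock(&nlock);
}

int main(void)
{
    omp_init_nest_lock(&nlock);

    #pragma omp parallel num_threads(4)
    update(2);

    omp_destroy_nest_lock(&nlock);
    printf("value = %d\n", value);   /* 3 per thread: 2 plus 1 from the nested call */
    return 0;
}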
2049 
2050 void
2051 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2052 {
2053 #if KMP_USE_DYNAMIC_LOCK
2054 
2055  int tag = DYNA_EXTRACT_D_TAG(user_lock);
2056 # if USE_ITT_BUILD
2057  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2058 # endif
2059 # if DYNA_USE_FAST_TAS
2060  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2061  DYNA_RELEASE_TAS_LOCK(user_lock, gtid);
2062  } else
2063 # elif DYNA_USE_FAST_FUTEX
2064  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2065  DYNA_RELEASE_FUTEX_LOCK(user_lock, gtid);
2066  } else
2067 # endif
2068  {
2069  __kmp_direct_unset_ops[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2070  }
2071 
2072 #else // KMP_USE_DYNAMIC_LOCK
2073 
2074  kmp_user_lock_p lck;
2075 
2076  /* Can't use serial interval since not block structured */
2077  /* release the lock */
2078 
2079  if ( ( __kmp_user_lock_kind == lk_tas )
2080  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2081 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2082  // "fast" path implemented to fix customer performance issue
2083 #if USE_ITT_BUILD
2084  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2085 #endif /* USE_ITT_BUILD */
2086  TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2087  KMP_MB();
2088  return;
2089 #else
2090  lck = (kmp_user_lock_p)user_lock;
2091 #endif
2092  }
2093 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2094  else if ( ( __kmp_user_lock_kind == lk_futex )
2095  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2096  lck = (kmp_user_lock_p)user_lock;
2097  }
2098 #endif
2099  else {
2100  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
2101  }
2102 
2103 #if USE_ITT_BUILD
2104  __kmp_itt_lock_releasing( lck );
2105 #endif /* USE_ITT_BUILD */
2106 
2107  RELEASE_LOCK( lck, gtid );
2108 
2109 #if OMPT_SUPPORT && OMPT_BLAME
2110  if ((ompt_status == ompt_status_track_callback) &&
2111  ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
2112  ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
2113  }
2114 #endif
2115 
2116 #endif // KMP_USE_DYNAMIC_LOCK
2117 }
2118 
2119 /* release the lock */
2120 void
2121 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2122 {
2123 #if KMP_USE_DYNAMIC_LOCK
2124 
2125 # if USE_ITT_BUILD
2126  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2127 # endif
2128  DYNA_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2129 
2130 #else // KMP_USE_DYNAMIC_LOCK
2131 
2132  kmp_user_lock_p lck;
2133 
2134  /* Can't use serial interval since not block structured */
2135 
2136  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2137  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2138 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2139  // "fast" path implemented to fix customer performance issue
2140  kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
2141 #if USE_ITT_BUILD
2142  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2143 #endif /* USE_ITT_BUILD */
2144  if ( --(tl->lk.depth_locked) == 0 ) {
2145  TCW_4(tl->lk.poll, 0);
2146  }
2147  KMP_MB();
2148  return;
2149 #else
2150  lck = (kmp_user_lock_p)user_lock;
2151 #endif
2152  }
2153 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2154  else if ( ( __kmp_user_lock_kind == lk_futex )
2155  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2156  <= OMP_NEST_LOCK_T_SIZE ) ) {
2157  lck = (kmp_user_lock_p)user_lock;
2158  }
2159 #endif
2160  else {
2161  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
2162  }
2163 
2164 #if USE_ITT_BUILD
2165  __kmp_itt_lock_releasing( lck );
2166 #endif /* USE_ITT_BUILD */
2167 
2168  int release_status = RELEASE_NESTED_LOCK( lck, gtid );
2169 #if OMPT_SUPPORT && OMPT_BLAME
2170  if (ompt_status == ompt_status_track_callback) {
2171  if (release_status == KMP_LOCK_RELEASED) {
2172  if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
2173  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
2174  (uint64_t) lck);
2175  }
2176  } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
2177  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
2178  (uint64_t) lck);
2179  }
2180  }
2181 #endif
2182 
2183 #endif // KMP_USE_DYNAMIC_LOCK
2184 }
2185 
2186 /* try to acquire the lock */
2187 int
2188 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2189 {
2190  KMP_COUNT_BLOCK(OMP_test_lock);
2191  KMP_TIME_BLOCK(OMP_test_lock);
2192 
2193 #if KMP_USE_DYNAMIC_LOCK
2194  int rc;
2195  int tag = DYNA_EXTRACT_D_TAG(user_lock);
2196 # if USE_ITT_BUILD
2197  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2198 # endif
2199 # if DYNA_USE_FAST_TAS
2200  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2201  DYNA_TEST_TAS_LOCK(user_lock, gtid, rc);
2202  } else
2203 # elif DYNA_USE_FAST_FUTEX
2204  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2205  DYNA_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2206  } else
2207 # endif
2208  {
2209  rc = __kmp_direct_test_ops[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2210  }
2211  if (rc) {
2212 # if USE_ITT_BUILD
2213  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2214 # endif
2215  return FTN_TRUE;
2216  } else {
2217 # if USE_ITT_BUILD
2218  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2219 # endif
2220  return FTN_FALSE;
2221  }
2222 
2223 #else // KMP_USE_DYNAMIC_LOCK
2224 
2225  kmp_user_lock_p lck;
2226  int rc;
2227 
2228  if ( ( __kmp_user_lock_kind == lk_tas )
2229  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2230  lck = (kmp_user_lock_p)user_lock;
2231  }
2232 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2233  else if ( ( __kmp_user_lock_kind == lk_futex )
2234  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2235  lck = (kmp_user_lock_p)user_lock;
2236  }
2237 #endif
2238  else {
2239  lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
2240  }
2241 
2242 #if USE_ITT_BUILD
2243  __kmp_itt_lock_acquiring( lck );
2244 #endif /* USE_ITT_BUILD */
2245 
2246  rc = TEST_LOCK( lck, gtid );
2247 #if USE_ITT_BUILD
2248  if ( rc ) {
2249  __kmp_itt_lock_acquired( lck );
2250  } else {
2251  __kmp_itt_lock_cancelled( lck );
2252  }
2253 #endif /* USE_ITT_BUILD */
2254  return ( rc ? FTN_TRUE : FTN_FALSE );
2255 
2256  /* Can't use serial interval since not block structured */
2257 
2258 #endif // KMP_USE_DYNAMIC_LOCK
2259 }
2260 
2261 /* try to acquire the lock */
2262 int
2263 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2264 {
2265 #if KMP_USE_DYNAMIC_LOCK
2266  int rc;
2267 # if USE_ITT_BUILD
2268  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2269 # endif
2270  rc = DYNA_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2271 # if USE_ITT_BUILD
2272  if (rc) {
2273  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2274  } else {
2275  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2276  }
2277 # endif
2278  return rc;
2279 
2280 #else // KMP_USE_DYNAMIC_LOCK
2281 
2282  kmp_user_lock_p lck;
2283  int rc;
2284 
2285  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2286  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2287  lck = (kmp_user_lock_p)user_lock;
2288  }
2289 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2290  else if ( ( __kmp_user_lock_kind == lk_futex )
2291  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2292  <= OMP_NEST_LOCK_T_SIZE ) ) {
2293  lck = (kmp_user_lock_p)user_lock;
2294  }
2295 #endif
2296  else {
2297  lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
2298  }
2299 
2300 #if USE_ITT_BUILD
2301  __kmp_itt_lock_acquiring( lck );
2302 #endif /* USE_ITT_BUILD */
2303 
2304  rc = TEST_NESTED_LOCK( lck, gtid );
2305 #if USE_ITT_BUILD
2306  if ( rc ) {
2307  __kmp_itt_lock_acquired( lck );
2308  } else {
2309  __kmp_itt_lock_cancelled( lck );
2310  }
2311 #endif /* USE_ITT_BUILD */
2312  return rc;
2313 
2314  /* Can't use serial interval since not block structured */
2315 
2316 #endif // KMP_USE_DYNAMIC_LOCK
2317 }
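Rounding out the lock group, a sketch of the non-blocking user-level call these test entry points serve: omp_test_lock() returns nonzero when the lock was acquired and zero otherwise (cf. FTN_TRUE/FTN_FALSE above), while omp_test_nest_lock() returns the new nesting count or zero. As before, the mapping of the user calls onto __kmpc_test_lock/__kmpc_test_nest_lock is an assumption based on the lookup strings.

#include <omp.h>
#include <stdio.h>

int main(void)
{
    omp_lock_t lock;
    omp_init_lock(&lock);

    #pragma omp parallel num_threads(4)
    {
        if (omp_test_lock(&lock)) {       /* nonzero: acquired without blocking */
            printf("thread %d got the lock\n", omp_get_thread_num());
            omp_unset_lock(&lock);
        } else {                          /* zero: lock was busy, keep going */
            printf("thread %d skipped the locked section\n", omp_get_thread_num());
        }
    }

    omp_destroy_lock(&lock);
    return 0;
}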
2318 
2319 
2320 /*--------------------------------------------------------------------------------------------------------------------*/
2321 
2322 /*
2323  * Interface to fast scalable reduce methods routines
2324  */
2325 
2326 // keep the selected method in a thread-local structure for cross-function usage: it will be read by the __kmpc_end_reduce* functions;
2327 // another solution would be to re-determine the method in the __kmpc_end_reduce* functions (a new prototype would be required then)
2328 // AT: which solution is better?
2329 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
2330  ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
2331 
2332 #define __KMP_GET_REDUCTION_METHOD(gtid) \
2333  ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
2334 
2335 // description of the packed_reduction_method variable: look at the macros in kmp.h
2336 
2337 
2338 // used in a critical section reduce block
2339 static __forceinline void
2340 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2341 
2342  // this lock was visible to a customer and to the Intel(R) Thread Profiler as a serial overhead span
2343  // (although it's used for an internal purpose only)
2344  // why was it visible in the previous implementation?
2345  // should we keep it visible in the new reduce block?
2346  kmp_user_lock_p lck;
2347 
2348 #if KMP_USE_DYNAMIC_LOCK
2349 
2350  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
2351  lck = (kmp_user_lock_p)crit;
2352  if (*((kmp_dyna_lock_t *)lck) == 0) {
2353  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)lck, 0, DYNA_GET_D_TAG(__kmp_user_lock_seq));
2354  }
2355  KMP_DEBUG_ASSERT(lck != NULL);
2356  if (__kmp_env_consistency_check) {
2357  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2358  }
2359  DYNA_D_LOCK_FUNC(lck, set)((kmp_dyna_lock_t *)lck, global_tid);
2360  } else {
2361  kmp_indirect_lock_t *ilk = __kmp_get_indirect_csptr(crit, loc, global_tid, __kmp_user_lock_seq);
2362  KMP_DEBUG_ASSERT(ilk != NULL);
2363  if (__kmp_env_consistency_check) {
2364  __kmp_push_sync(global_tid, ct_critical, loc, ilk->lock, __kmp_user_lock_seq);
2365  }
2366  DYNA_I_LOCK_FUNC(ilk, set)(ilk->lock, global_tid);
2367  }
2368 
2369 #else // KMP_USE_DYNAMIC_LOCK
2370 
2371  // We know that the fast reduction code is only emitted by Intel compilers
2372  // with 32 byte critical sections. If there isn't enough space, then we
2373  // have to use a pointer.
2374  if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
2375  lck = (kmp_user_lock_p)crit;
2376  }
2377  else {
2378  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
2379  }
2380  KMP_DEBUG_ASSERT( lck != NULL );
2381 
2382  if ( __kmp_env_consistency_check )
2383  __kmp_push_sync( global_tid, ct_critical, loc, lck );
2384 
2385  __kmp_acquire_user_lock_with_checks( lck, global_tid );
2386 
2387 #endif // KMP_USE_DYNAMIC_LOCK
2388 }
2389 
2390 // used in a critical section reduce block
2391 static __forceinline void
2392 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2393 
2394  kmp_user_lock_p lck;
2395 
2396 #if KMP_USE_DYNAMIC_LOCK
2397 
2398  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
2399  lck = (kmp_user_lock_p)crit;
2400  if (__kmp_env_consistency_check)
2401  __kmp_pop_sync(global_tid, ct_critical, loc);
2402  DYNA_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
2403  } else {
2404  kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
2405  if (__kmp_env_consistency_check)
2406  __kmp_pop_sync(global_tid, ct_critical, loc);
2407  DYNA_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
2408  }
2409 
2410 #else // KMP_USE_DYNAMIC_LOCK
2411 
2412  // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical
2413  // sections. If there isn't enough space, then we have to use a pointer.
2414  if ( __kmp_base_user_lock_size > 32 ) {
2415  lck = *( (kmp_user_lock_p *) crit );
2416  KMP_ASSERT( lck != NULL );
2417  } else {
2418  lck = (kmp_user_lock_p) crit;
2419  }
2420 
2421  if ( __kmp_env_consistency_check )
2422  __kmp_pop_sync( global_tid, ct_critical, loc );
2423 
2424  __kmp_release_user_lock_with_checks( lck, global_tid );
2425 
2426 #endif // KMP_USE_DYNAMIC_LOCK
2427 } // __kmp_end_critical_section_reduce_block
2428 
2429 
2430 /* 2.a.i. Reduce Block without a terminating barrier */
2444 kmp_int32
2445 __kmpc_reduce_nowait(
2446  ident_t *loc, kmp_int32 global_tid,
2447  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
2448  kmp_critical_name *lck ) {
2449 
2450  KMP_COUNT_BLOCK(REDUCE_nowait);
2451  int retval = 0;
2452  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2453 #if OMP_40_ENABLED
2454  kmp_team_t *team;
2455  kmp_info_t *th;
2456  int teams_swapped = 0, task_state;
2457 #endif
2458  KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
2459 
2460  // why do we need this initialization here at all?
2461  // A reduction clause cannot be used as a stand-alone directive.
2462 
2463  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2464  // possible detection of false-positive race by the threadchecker ???
2465  if( ! TCR_4( __kmp_init_parallel ) )
2466  __kmp_parallel_initialize();
2467 
2468  // check correctness of reduce block nesting
2469 #if KMP_USE_DYNAMIC_LOCK
2470  if ( __kmp_env_consistency_check )
2471  __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2472 #else
2473  if ( __kmp_env_consistency_check )
2474  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2475 #endif
2476 
2477 #if OMP_40_ENABLED
2478  th = __kmp_thread_from_gtid(global_tid);
2479  if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct?
2480  team = th->th.th_team;
2481  if( team->t.t_level == th->th.th_teams_level ) {
2482  // this is reduction at teams construct
2483  KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
2484  // Let's swap teams temporarily for the reduction barrier
2485  teams_swapped = 1;
2486  th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2487  th->th.th_team = team->t.t_parent;
2488  th->th.th_team_nproc = th->th.th_team->t.t_nproc;
2489  th->th.th_task_team = th->th.th_team->t.t_task_team[0];
2490  task_state = th->th.th_task_state;
2491  th->th.th_task_state = 0;
2492  }
2493  }
2494 #endif // OMP_40_ENABLED
2495 
2496  // the packed_reduction_method value will be reused by the __kmpc_end_reduce* functions, so it should be kept in a variable
2497  // the variable should be either a construct-specific or a thread-specific property, not a team-specific property
2498  // (a thread can reach the next reduce block on the next construct, and the reduce method may differ there)
2499  // an ident_t "loc" parameter could be used as a construct-specific property (but what if loc == 0?)
2500  // (if both construct-specific and team-specific variables were shared, unnecessary extra syncs would be needed)
2501  // a thread-specific variable is better with regard to the two issues above (next construct and extra syncs)
2502  // a thread-specific "th_local.packed_reduction_method" variable is currently used
2503  // each thread executes the 'determine' and 'set' lines (there is no need to have a single thread do it, which would only require extra syncs)
2504 
2505  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2506  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2507 
2508  if( packed_reduction_method == critical_reduce_block ) {
2509 
2510  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2511  retval = 1;
2512 
2513  } else if( packed_reduction_method == empty_reduce_block ) {
2514 
2515  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2516  retval = 1;
2517 
2518  } else if( packed_reduction_method == atomic_reduce_block ) {
2519 
2520  retval = 2;
2521 
2522  // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen)
2523  // (this is not ideal, because the checking block has already been closed by this 'pop',
2524  // but the atomic operation has not been executed yet; it happens slightly later, literally on the next instruction)
2525  if ( __kmp_env_consistency_check )
2526  __kmp_pop_sync( global_tid, ct_reduce, loc );
2527 
2528  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2529 
2530  //AT: performance issue: a real barrier here
2531  //AT: (if master goes slow, other threads are blocked here waiting for the master to come and release them)
2532  //AT: (it's not what a customer might expect specifying NOWAIT clause)
2533  //AT: (specifying NOWAIT won't result in improvement of performance, it'll be confusing to a customer)
2534  //AT: another implementation of *barrier_gather*nowait() (or some other design) might go faster
2535  // and be more in line with the sense of NOWAIT
2536  //AT: TO DO: run the EPCC benchmark and compare times
2537 
2538  // this barrier should be invisible to a customer and to the Intel(R) Thread Profiler
2539  // (it's neither a terminating barrier nor customer's code, it's used for an internal purpose)
2540 #if USE_ITT_NOTIFY
2541  __kmp_threads[global_tid]->th.th_ident = loc;
2542 #endif
2543  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
2544  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2545 
2546  // all other workers except master should do this pop here
2547  // ( none of other workers will get to __kmpc_end_reduce_nowait() )
2548  if ( __kmp_env_consistency_check ) {
2549  if( retval == 0 ) {
2550  __kmp_pop_sync( global_tid, ct_reduce, loc );
2551  }
2552  }
2553 
2554  } else {
2555 
2556  // should never reach this block
2557  KMP_ASSERT( 0 ); // "unexpected method"
2558 
2559  }
2560 #if OMP_40_ENABLED
2561  if( teams_swapped ) {
2562  // Restore thread structure
2563  th->th.th_info.ds.ds_tid = 0;
2564  th->th.th_team = team;
2565  th->th.th_team_nproc = team->t.t_nproc;
2566  th->th.th_task_team = team->t.t_task_team[task_state];
2567  th->th.th_task_state = task_state;
2568  }
2569 #endif
2570  KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2571 
2572  return retval;
2573 }
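As a point of reference, the sketch below is the kind of user-level reduction with NOWAIT that a compiler may lower to __kmpc_reduce_nowait()/__kmpc_end_reduce_nowait(), with the combine step performed by the critical, atomic, or tree method selected above; this is illustrative source code under that lowering assumption, not the generated call sequence.

#include <stdio.h>

int main(void)
{
    double sum = 0.0;

    #pragma omp parallel
    {
        #pragma omp for reduction(+:sum) nowait
        for (int i = 0; i < 1000; i++)
            sum += 0.5 * i;
    }   /* the reduced value of sum is available after the parallel region */

    printf("sum = %f\n", sum);
    return 0;
}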
2574 
2583 void
2584 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2585 
2586  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2587 
2588  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
2589 
2590  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2591 
2592  if( packed_reduction_method == critical_reduce_block ) {
2593 
2594  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2595 
2596  } else if( packed_reduction_method == empty_reduce_block ) {
2597 
2598  // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
2599 
2600  } else if( packed_reduction_method == atomic_reduce_block ) {
2601 
2602  // neither master nor other workers should get here
2603  // (code gen does not generate this call in case 2: atomic reduce block)
2604  // actually it would be better to remove this 'else if' entirely;
2605  // after removal this value will be caught by the 'else' branch and will assert
2606 
2607  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2608 
2609  // only master gets here
2610 
2611  } else {
2612 
2613  // should never reach this block
2614  KMP_ASSERT( 0 ); // "unexpected method"
2615 
2616  }
2617 
2618  if ( __kmp_env_consistency_check )
2619  __kmp_pop_sync( global_tid, ct_reduce, loc );
2620 
2621  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2622 
2623  return;
2624 }
2625 
2626 /* 2.a.ii. Reduce Block with a terminating barrier */
2627 
2641 kmp_int32
2642 __kmpc_reduce(
2643  ident_t *loc, kmp_int32 global_tid,
2644  kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
2645  void (*reduce_func)(void *lhs_data, void *rhs_data),
2646  kmp_critical_name *lck )
2647 {
2648  KMP_COUNT_BLOCK(REDUCE_wait);
2649  int retval = 0;
2650  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2651 
2652  KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
2653 
2654  // why do we need this initialization here at all?
2655  // A reduction clause cannot be used as a stand-alone directive.
2656 
2657  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2658  // possible detection of false-positive race by the threadchecker ???
2659  if( ! TCR_4( __kmp_init_parallel ) )
2660  __kmp_parallel_initialize();
2661 
2662  // check correctness of reduce block nesting
2663 #if KMP_USE_DYNAMIC_LOCK
2664  if ( __kmp_env_consistency_check )
2665  __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2666 #else
2667  if ( __kmp_env_consistency_check )
2668  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2669 #endif
2670 
2671  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2672  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2673 
2674  if( packed_reduction_method == critical_reduce_block ) {
2675 
2676  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2677  retval = 1;
2678 
2679  } else if( packed_reduction_method == empty_reduce_block ) {
2680 
2681  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2682  retval = 1;
2683 
2684  } else if( packed_reduction_method == atomic_reduce_block ) {
2685 
2686  retval = 2;
2687 
2688  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2689 
2690  //case tree_reduce_block:
2691  // this barrier should be visible to a customer and to the Intel(R) Thread Profiler
2692  // (it's a terminating barrier on constructs if NOWAIT not specified)
2693 #if USE_ITT_NOTIFY
2694  __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
2695 #endif
2696  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
2697  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2698 
2699  // all other workers except master should do this pop here
2700  // ( none of other workers except master will enter __kmpc_end_reduce() )
2701  if ( __kmp_env_consistency_check ) {
2702  if( retval == 0 ) { // 0: all other workers; 1: master
2703  __kmp_pop_sync( global_tid, ct_reduce, loc );
2704  }
2705  }
2706 
2707  } else {
2708 
2709  // should never reach this block
2710  KMP_ASSERT( 0 ); // "unexpected method"
2711 
2712  }
2713 
2714  KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2715 
2716  return retval;
2717 }
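For contrast with the NOWAIT case above, a reduction that keeps its terminating barrier; a compiler may lower this form to __kmpc_reduce()/__kmpc_end_reduce(), where the barrier is intentionally visible to tools as the comments note. Again, this is illustrative user code under that lowering assumption.

#include <stdio.h>

int main(void)
{
    long sum = 0;

    #pragma omp parallel for reduction(+:sum)
    for (int i = 1; i <= 100; i++)
        sum += i;

    printf("sum = %ld (expected 5050)\n", sum);
    return 0;
}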
2718 
2728 void
2729 __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2730 
2731  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2732 
2733  KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );
2734 
2735  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2736 
2737  // this barrier should be visible to a customer and to the Intel(R) Thread Profiler
2738  // (it's a terminating barrier on constructs if NOWAIT not specified)
2739 
2740  if( packed_reduction_method == critical_reduce_block ) {
2741 
2742  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2743 
2744  // TODO: implicit barrier: should be exposed
2745 #if USE_ITT_NOTIFY
2746  __kmp_threads[global_tid]->th.th_ident = loc;
2747 #endif
2748  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2749 
2750  } else if( packed_reduction_method == empty_reduce_block ) {
2751 
2752  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2753 
2754  // TODO: implicit barrier: should be exposed
2755 #if USE_ITT_NOTIFY
2756  __kmp_threads[global_tid]->th.th_ident = loc;
2757 #endif
2758  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2759 
2760  } else if( packed_reduction_method == atomic_reduce_block ) {
2761 
2762  // TODO: implicit barrier: should be exposed
2763 #if USE_ITT_NOTIFY
2764  __kmp_threads[global_tid]->th.th_ident = loc;
2765 #endif
2766  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2767 
2768  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2769 
2770  // only master executes here (master releases all other workers)
2771  __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );
2772 
2773  } else {
2774 
2775  // should never reach this block
2776  KMP_ASSERT( 0 ); // "unexpected method"
2777 
2778  }
2779 
2780  if ( __kmp_env_consistency_check )
2781  __kmp_pop_sync( global_tid, ct_reduce, loc );
2782 
2783  KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2784 
2785  return;
2786 }
2787 
2788 #undef __KMP_GET_REDUCTION_METHOD
2789 #undef __KMP_SET_REDUCTION_METHOD
2790 
2791 /*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/
2792 
2793 kmp_uint64
2794 __kmpc_get_taskid() {
2795 
2796  kmp_int32 gtid;
2797  kmp_info_t * thread;
2798 
2799  gtid = __kmp_get_gtid();
2800  if ( gtid < 0 ) {
2801  return 0;
2802  }; // if
2803  thread = __kmp_thread_from_gtid( gtid );
2804  return thread->th.th_current_task->td_task_id;
2805 
2806 } // __kmpc_get_taskid
2807 
2808 
2809 kmp_uint64
2810 __kmpc_get_parent_taskid() {
2811 
2812  kmp_int32 gtid;
2813  kmp_info_t * thread;
2814  kmp_taskdata_t * parent_task;
2815 
2816  gtid = __kmp_get_gtid();
2817  if ( gtid < 0 ) {
2818  return 0;
2819  }; // if
2820  thread = __kmp_thread_from_gtid( gtid );
2821  parent_task = thread->th.th_current_task->td_parent;
2822  return ( parent_task == NULL ? 0 : parent_task->td_task_id );
2823 
2824 } // __kmpc_get_parent_taskid
2825 
2826 void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
2827 {
2828  if ( ! __kmp_init_serial ) {
2829  __kmp_serial_initialize();
2830  }
2831  __kmp_place_num_sockets = nS;
2832  __kmp_place_socket_offset = sO;
2833  __kmp_place_num_cores = nC;
2834  __kmp_place_core_offset = cO;
2835  __kmp_place_num_threads_per_core = nT;
2836 }
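A hedged sketch of calling __kmpc_place_threads() directly: normally the runtime populates these placement globals itself, so the call below exists only to illustrate the parameter order (number of sockets, socket offset, number of cores, core offset, threads per core). The values, and the assumption that the call should happen before the first parallel region, are illustrative only.

extern void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT);

int main(void)
{
    /* illustrative: 1 socket from offset 0, 4 cores from offset 0, 2 threads per core */
    __kmpc_place_threads(1, 0, 4, 0, 2);

    #pragma omp parallel
    {
        /* ... parallel work placed according to the settings above ... */
    }
    return 0;
}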
2837 
2838 // end of file //
2839 