Intel® OpenMP* Runtime Library
 All Classes Functions Variables Typedefs Enumerations Enumerator Groups Pages
kmp_runtime.c
1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 #include "kmp.h"
36 #include "kmp_atomic.h"
37 #include "kmp_wrapper_getpid.h"
38 #include "kmp_environment.h"
39 #include "kmp_itt.h"
40 #include "kmp_str.h"
41 #include "kmp_settings.h"
42 #include "kmp_i18n.h"
43 #include "kmp_io.h"
44 #include "kmp_error.h"
45 #include "kmp_stats.h"
46 #include "kmp_wait_release.h"
47 
48 #if OMPT_SUPPORT
49 #include "ompt-specific.h"
50 #endif
51 
52 /* these are temporary issues to be dealt with */
53 #define KMP_USE_PRCTL 0
54 #define KMP_USE_POOLED_ALLOC 0
55 
56 #if KMP_OS_WINDOWS
57 #include <process.h>
58 #endif
59 
60 
61 
62 #if defined(KMP_GOMP_COMPAT)
63 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
67 #if OMP_40_ENABLED
68  "4.0 (201307)";
69 #else
70  "3.1 (201107)";
71 #endif
72 
73 #ifdef KMP_DEBUG
74 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
75 #endif /* KMP_DEBUG */
76 
77 
78 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
79 
80 /* ------------------------------------------------------------------------ */
81 /* ------------------------------------------------------------------------ */
82 
83 kmp_info_t __kmp_monitor;
84 
85 /* ------------------------------------------------------------------------ */
86 /* ------------------------------------------------------------------------ */
87 
88 /* Forward declarations */
89 
90 void __kmp_cleanup( void );
91 
92 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
93 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
94 static void __kmp_partition_places( kmp_team_t *team );
95 static void __kmp_do_serial_initialize( void );
96 void __kmp_fork_barrier( int gtid, int tid );
97 void __kmp_join_barrier( int gtid );
98 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
99 
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
103 #endif
104 
105 static int __kmp_expand_threads(int nWish, int nNeed);
106 static int __kmp_unregister_root_other_thread( int gtid );
107 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
108 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
109 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
110 
111 /* ------------------------------------------------------------------------ */
112 /* ------------------------------------------------------------------------ */
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique */
116 /* identifier of executing thread. */
117 /* returns KMP_GTID_DNE if we haven't been assigned a gtid */
118 
119 int
120 __kmp_get_global_thread_id( )
121 {
122  int i;
123  kmp_info_t **other_threads;
124  size_t stack_data;
125  char *stack_addr;
126  size_t stack_size;
127  char *stack_base;
128 
129  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
130  __kmp_nth, __kmp_all_nth ));
131 
132  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
133  parallel region, made it return KMP_GTID_DNE to force serial_initialize by
134  caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
135  __kmp_init_gtid for this to work. */
136 
137  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
138 
139 #ifdef KMP_TDATA_GTID
140  if ( TCR_4(__kmp_gtid_mode) >= 3) {
141  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
142  return __kmp_gtid;
143  }
144 #endif
145  if ( TCR_4(__kmp_gtid_mode) >= 2) {
146  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
147  return __kmp_gtid_get_specific();
148  }
149  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
150 
151  stack_addr = (char*) & stack_data;
152  other_threads = __kmp_threads;
153 
154  /*
155  ATT: The code below is a source of potential bugs due to unsynchronized access to
156  __kmp_threads array. For example:
157  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
158  2. Current thread is suspended by OS.
159  3. Another thread unregisters and finishes (debug versions of free() may fill memory
160  with something like 0xEF).
161  4. Current thread is resumed.
162  5. Current thread reads junk from *thr.
163  TODO: Fix it.
164  --ln
165  */
166 
167  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
168 
169  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
170  if( !thr ) continue;
171 
172  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
173  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
174 
175  /* stack grows down -- search through all of the active threads */
176 
177  if( stack_addr <= stack_base ) {
178  size_t stack_diff = stack_base - stack_addr;
179 
180  if( stack_diff <= stack_size ) {
181  /* The only way we can be closer than the allocated */
182  /* stack size is if we are running on this thread. */
183  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
184  return i;
185  }
186  }
187  }
188 
189  /* get specific to try and determine our gtid */
190  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
191  "thread, using TLS\n" ));
192  i = __kmp_gtid_get_specific();
193 
194  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
195 
196  /* if we havn't been assigned a gtid, then return code */
197  if( i<0 ) return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific call */
200  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
201  KMP_FATAL( StackOverflow, i );
202  }
203 
204  stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
205  if( stack_addr > stack_base ) {
206  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
209  } else {
210  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
211  }
212 
213  /* Reprint stack bounds for ubermaster since they have been refined */
214  if ( __kmp_storage_map ) {
215  char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
216  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
217  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
218  other_threads[i]->th.th_info.ds.ds_stacksize,
219  "th_%d stack (refinement)", i );
220  }
221  return i;
222 }
223 
224 int
225 __kmp_get_global_thread_id_reg( )
226 {
227  int gtid;
228 
229  if ( !__kmp_init_serial ) {
230  gtid = KMP_GTID_DNE;
231  } else
232 #ifdef KMP_TDATA_GTID
233  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
234  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
235  gtid = __kmp_gtid;
236  } else
237 #endif
238  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
239  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
240  gtid = __kmp_gtid_get_specific();
241  } else {
242  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
243  gtid = __kmp_get_global_thread_id();
244  }
245 
246  /* we must be a new uber master sibling thread */
247  if( gtid == KMP_GTID_DNE ) {
248  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
249  "Registering a new gtid.\n" ));
250  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
251  if( !__kmp_init_serial ) {
252  __kmp_do_serial_initialize();
253  gtid = __kmp_gtid_get_specific();
254  } else {
255  gtid = __kmp_register_root(FALSE);
256  }
257  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
258  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
259  }
260 
261  KMP_DEBUG_ASSERT( gtid >=0 );
262 
263  return gtid;
264 }
265 
266 /* caller must hold forkjoin_lock */
267 void
268 __kmp_check_stack_overlap( kmp_info_t *th )
269 {
270  int f;
271  char *stack_beg = NULL;
272  char *stack_end = NULL;
273  int gtid;
274 
275  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
276  if ( __kmp_storage_map ) {
277  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
278  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
279 
280  gtid = __kmp_gtid_from_thread( th );
281 
282  if (gtid == KMP_GTID_MONITOR) {
283  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
284  "th_%s stack (%s)", "mon",
285  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
286  } else {
287  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
288  "th_%d stack (%s)", gtid,
289  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
290  }
291  }
292 
293  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
294  gtid = __kmp_gtid_from_thread( th );
295  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
296  {
297  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
298  if ( stack_beg == NULL ) {
299  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
300  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
301  }
302 
303  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
304  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
305 
306  if( f_th && f_th != th ) {
307  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308  char *other_stack_beg = other_stack_end -
309  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
310  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313  /* Print the other stack values before the abort */
314  if ( __kmp_storage_map )
315  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
316  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
317  "th_%d stack (overlapped)",
318  __kmp_gtid_from_thread( f_th ) );
319 
320  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
321  }
322  }
323  }
324  }
325  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
326 }
327 
328 
329 /* ------------------------------------------------------------------------ */
330 
331 /* ------------------------------------------------------------------------ */
332 
333 void
334 __kmp_infinite_loop( void )
335 {
336  static int done = FALSE;
337 
338  while (! done) {
339  KMP_YIELD( 1 );
340  }
341 }
342 
343 #define MAX_MESSAGE 512
344 
345 void
346 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
347  char buffer[MAX_MESSAGE];
348  int node;
349  va_list ap;
350 
351  va_start( ap, format);
352  KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
353  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
354  __kmp_vprintf( kmp_err, buffer, ap );
355 #if KMP_PRINT_DATA_PLACEMENT
356  if(gtid >= 0) {
357  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
358  if( __kmp_storage_map_verbose ) {
359  node = __kmp_get_host_node(p1);
360  if(node < 0) /* doesn't work, so don't try this next time */
361  __kmp_storage_map_verbose = FALSE;
362  else {
363  char *last;
364  int lastNode;
365  int localProc = __kmp_get_cpu_from_gtid(gtid);
366 
367  p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
368  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
369  if(localProc >= 0)
370  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
371  else
372  __kmp_printf_no_lock(" GTID %d\n", gtid);
373 # if KMP_USE_PRCTL
374 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
375  do {
376  last = p1;
377  lastNode = node;
378  /* This loop collates adjacent pages with the same host node. */
379  do {
380  (char*)p1 += PAGE_SIZE;
381  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
382  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
383  (char*)p1 - 1, lastNode);
384  } while(p1 <= p2);
385 # else
386  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
387  (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
388  if(p1 < p2) {
389  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
390  (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
391  }
392 # endif
393  }
394  }
395  } else
396  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
397  }
398 #endif /* KMP_PRINT_DATA_PLACEMENT */
399  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
400 }
401 
402 void
403 __kmp_warn( char const * format, ... )
404 {
405  char buffer[MAX_MESSAGE];
406  va_list ap;
407 
408  if ( __kmp_generate_warnings == kmp_warnings_off ) {
409  return;
410  }
411 
412  va_start( ap, format );
413 
414  KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
415  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
416  __kmp_vprintf( kmp_err, buffer, ap );
417  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
418 
419  va_end( ap );
420 }
421 
422 void
423 __kmp_abort_process()
424 {
425 
426  // Later threads may stall here, but that's ok because abort() will kill them.
427  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
428 
429  if ( __kmp_debug_buf ) {
430  __kmp_dump_debug_buffer();
431  }; // if
432 
433  if ( KMP_OS_WINDOWS ) {
434  // Let other threads know of abnormal termination and prevent deadlock
435  // if abort happened during library initialization or shutdown
436  __kmp_global.g.g_abort = SIGABRT;
437 
438  /*
439  On Windows* OS by default abort() causes pop-up error box, which stalls nightly testing.
440  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
441  works well, but this function is not available in VS7 (this is not problem for DLL, but
442  it is a problem for static OpenMP RTL). SetErrorMode (and so, timelimit utility) does
443  not help, at least in some versions of MS C RTL.
444 
445  It seems following sequence is the only way to simulate abort() and avoid pop-up error
446  box.
447  */
448  raise( SIGABRT );
449  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
450  } else {
451  abort();
452  }; // if
453 
454  __kmp_infinite_loop();
455  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
456 
457 } // __kmp_abort_process
458 
459 void
460 __kmp_abort_thread( void )
461 {
462  // TODO: Eliminate g_abort global variable and this function.
463  // In case of abort just call abort(), it will kill all the threads.
464  __kmp_infinite_loop();
465 } // __kmp_abort_thread
466 
467 /* ------------------------------------------------------------------------ */
468 
469 /*
470  * Print out the storage map for the major kmp_info_t thread data structures
471  * that are allocated together.
472  */
473 
474 static void
475 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
476 {
477  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
478 
479  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
480  "th_%d.th_info", gtid );
481 
482  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
483  "th_%d.th_local", gtid );
484 
485  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
486  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
487 
488  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
489  &thr->th.th_bar[bs_plain_barrier+1],
490  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
491 
492  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
493  &thr->th.th_bar[bs_forkjoin_barrier+1],
494  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
495 
496  #if KMP_FAST_REDUCTION_BARRIER
497  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
498  &thr->th.th_bar[bs_reduction_barrier+1],
499  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
500  #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /*
504  * Print out the storage map for the major kmp_team_t team data structures
505  * that are allocated together.
506  */
507 
508 static void
509 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
510 {
511  int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
512  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
513  header, team_id );
514 
515  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
516  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
517 
518 
519  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
520  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
521 
522  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
523  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
524 
525  #if KMP_FAST_REDUCTION_BARRIER
526  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
527  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
528  #endif // KMP_FAST_REDUCTION_BARRIER
529 
530  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
532 
533  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
534  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
535 
536  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
537  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
538  header, team_id );
539 
540  /*
541  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
542  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
543 
544  __kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
545  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
546 
547  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
548  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
549 
550  __kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
551  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
552 
553  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
554  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
555 
556  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
557  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
558 
559  //__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
560  // sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
561 
562  __kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
563  sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
564 #if OMP_40_ENABLED
565  __kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
566  sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
567 #endif
568  */
569 
570  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
571  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
572 }
573 
574 static void __kmp_init_allocator() {}
575 static void __kmp_fini_allocator() {}
576 static void __kmp_fini_allocator_thread() {}
577 
578 /* ------------------------------------------------------------------------ */
579 
580 #ifdef KMP_DYNAMIC_LIB
581 # if KMP_OS_WINDOWS
582 
583 
584 static void
585 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
586  // TODO: Change to __kmp_break_bootstrap_lock().
587  __kmp_init_bootstrap_lock( lck ); // make the lock released
588 }
589 
590 static void
591 __kmp_reset_locks_on_process_detach( int gtid_req ) {
592  int i;
593  int thread_count;
594 
595  // PROCESS_DETACH is expected to be called by a thread
596  // that executes ProcessExit() or FreeLibrary().
597  // OS terminates other threads (except the one calling ProcessExit or FreeLibrary).
598  // So, it might be safe to access the __kmp_threads[] without taking the forkjoin_lock.
599  // However, in fact, some threads can be still alive here, although being about to be terminated.
600  // The threads in the array with ds_thread==0 are most suspicious.
601  // Actually, it can be not safe to access the __kmp_threads[].
602 
603  // TODO: does it make sense to check __kmp_roots[] ?
604 
605  // Let's check that there are no other alive threads registered with the OMP lib.
606  while( 1 ) {
607  thread_count = 0;
608  for( i = 0; i < __kmp_threads_capacity; ++i ) {
609  if( !__kmp_threads ) continue;
610  kmp_info_t* th = __kmp_threads[ i ];
611  if( th == NULL ) continue;
612  int gtid = th->th.th_info.ds.ds_gtid;
613  if( gtid == gtid_req ) continue;
614  if( gtid < 0 ) continue;
615  DWORD exit_val;
616  int alive = __kmp_is_thread_alive( th, &exit_val );
617  if( alive ) {
618  ++thread_count;
619  }
620  }
621  if( thread_count == 0 ) break; // success
622  }
623 
624  // Assume that I'm alone.
625 
626  // Now it might be probably safe to check and reset locks.
627  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
628  __kmp_reset_lock( &__kmp_forkjoin_lock );
629  #ifdef KMP_DEBUG
630  __kmp_reset_lock( &__kmp_stdio_lock );
631  #endif // KMP_DEBUG
632 
633 
634 }
635 
636 BOOL WINAPI
637 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
638  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
639 
640  switch( fdwReason ) {
641 
642  case DLL_PROCESS_ATTACH:
643  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
644 
645  return TRUE;
646 
647  case DLL_PROCESS_DETACH:
648  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
649  __kmp_gtid_get_specific() ));
650 
651  if( lpReserved != NULL )
652  {
653  // lpReserved is used for telling the difference:
654  // lpReserved == NULL when FreeLibrary() was called,
655  // lpReserved != NULL when the process terminates.
656  // When FreeLibrary() is called, worker threads remain alive.
657  // So they will release the forkjoin lock by themselves.
658  // When the process terminates, worker threads disappear triggering
659  // the problem of unreleased forkjoin lock as described below.
660 
661  // A worker thread can take the forkjoin lock
662  // in __kmp_suspend_template()->__kmp_rml_decrease_load_before_sleep().
663  // The problem comes up if that worker thread becomes dead
664  // before it releases the forkjoin lock.
665  // The forkjoin lock remains taken, while the thread
666  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
667  // will try to take the forkjoin lock and will always fail,
668  // so that the application will never finish [normally].
669  // This scenario is possible if __kmpc_end() has not been executed.
670  // It looks like it's not a corner case, but common cases:
671  // - the main function was compiled by an alternative compiler;
672  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
673  // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
674  // - alive foreign thread prevented __kmpc_end from doing cleanup.
675 
676  // This is a hack to work around the problem.
677  // TODO: !!! to figure out something better.
678  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
679  }
680 
681  __kmp_internal_end_library( __kmp_gtid_get_specific() );
682 
683  return TRUE;
684 
685  case DLL_THREAD_ATTACH:
686  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
687 
688  /* if we wanted to register new siblings all the time here call
689  * __kmp_get_gtid(); */
690  return TRUE;
691 
692  case DLL_THREAD_DETACH:
693  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
694  __kmp_gtid_get_specific() ));
695 
696  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
697  return TRUE;
698  }
699 
700  return TRUE;
701 }
702 
703 # endif /* KMP_OS_WINDOWS */
704 #endif /* KMP_DYNAMIC_LIB */
705 
706 
707 /* ------------------------------------------------------------------------ */
708 
709 /* Change the library type to "status" and return the old type */
710 /* called from within initialization routines where __kmp_initz_lock is held */
711 int
712 __kmp_change_library( int status )
713 {
714  int old_status;
715 
716  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
717 
718  if (status) {
719  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
720  }
721  else {
722  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
723  }
724 
725  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
726 }
727 
728 /* ------------------------------------------------------------------------ */
729 /* ------------------------------------------------------------------------ */
730 
731 /* __kmp_parallel_deo --
732  * Wait until it's our turn.
733  */
734 void
735 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
736 {
737  int gtid = *gtid_ref;
738 #ifdef BUILD_PARALLEL_ORDERED
739  kmp_team_t *team = __kmp_team_from_gtid( gtid );
740 #endif /* BUILD_PARALLEL_ORDERED */
741 
742  if( __kmp_env_consistency_check ) {
743  if( __kmp_threads[gtid]->th.th_root->r.r_active )
744 #if KMP_USE_DYNAMIC_LOCK
745  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
746 #else
747  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
748 #endif
749  }
750 #ifdef BUILD_PARALLEL_ORDERED
751  if( !team->t.t_serialized ) {
752  kmp_uint32 spins;
753 
754  KMP_MB();
755  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
756  KMP_MB();
757  }
758 #endif /* BUILD_PARALLEL_ORDERED */
759 }
760 
761 /* __kmp_parallel_dxo --
762  * Signal the next task.
763  */
764 
765 void
766 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
767 {
768  int gtid = *gtid_ref;
769 #ifdef BUILD_PARALLEL_ORDERED
770  int tid = __kmp_tid_from_gtid( gtid );
771  kmp_team_t *team = __kmp_team_from_gtid( gtid );
772 #endif /* BUILD_PARALLEL_ORDERED */
773 
774  if( __kmp_env_consistency_check ) {
775  if( __kmp_threads[gtid]->th.th_root->r.r_active )
776  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
777  }
778 #ifdef BUILD_PARALLEL_ORDERED
779  if ( ! team->t.t_serialized ) {
780  KMP_MB(); /* Flush all pending memory write invalidates. */
781 
782  /* use the tid of the next thread in this team */
783  /* TODO replace with general release procedure */
784  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
785 
786 #if OMPT_SUPPORT && OMPT_BLAME
787  if ((ompt_status == ompt_status_track_callback) &&
788  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
789  /* accept blame for "ordered" waiting */
790  kmp_info_t *this_thread = __kmp_threads[gtid];
791  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
792  this_thread->th.ompt_thread_info.wait_id);
793  }
794 #endif
795 
796  KMP_MB(); /* Flush all pending memory write invalidates. */
797  }
798 #endif /* BUILD_PARALLEL_ORDERED */
799 }
800 
801 /* ------------------------------------------------------------------------ */
802 /* ------------------------------------------------------------------------ */
803 
804 /* ------------------------------------------------------------------------ */
805 /* ------------------------------------------------------------------------ */
806 
807 /* The BARRIER for a SINGLE process section is always explicit */
808 
809 int
810 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
811 {
812  int status;
813  kmp_info_t *th;
814  kmp_team_t *team;
815 
816  if( ! TCR_4(__kmp_init_parallel) )
817  __kmp_parallel_initialize();
818 
819  th = __kmp_threads[ gtid ];
820  team = th->th.th_team;
821  status = 0;
822 
823  th->th.th_ident = id_ref;
824 
825  if ( team->t.t_serialized ) {
826  status = 1;
827  } else {
828  kmp_int32 old_this = th->th.th_local.this_construct;
829 
830  ++th->th.th_local.this_construct;
831  /* try to set team count to thread count--success means thread got the
832  single block
833  */
834  /* TODO: Should this be acquire or release? */
835  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
836  th->th.th_local.this_construct);
837 #if USE_ITT_BUILD
838  if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
839 #if OMP_40_ENABLED
840  th->th.th_teams_microtask == NULL &&
841 #endif
842  team->t.t_active_level == 1 )
843  { // Only report metadata by master of active team at level 1
844  __kmp_itt_metadata_single( id_ref );
845  }
846 #endif /* USE_ITT_BUILD */
847  }
848 
849  if( __kmp_env_consistency_check ) {
850  if (status && push_ws) {
851  __kmp_push_workshare( gtid, ct_psingle, id_ref );
852  } else {
853  __kmp_check_workshare( gtid, ct_psingle, id_ref );
854  }
855  }
856 #if USE_ITT_BUILD
857  if ( status ) {
858  __kmp_itt_single_start( gtid );
859  }
860 #endif /* USE_ITT_BUILD */
861  return status;
862 }
863 
864 void
865 __kmp_exit_single( int gtid )
866 {
867 #if USE_ITT_BUILD
868  __kmp_itt_single_end( gtid );
869 #endif /* USE_ITT_BUILD */
870  if( __kmp_env_consistency_check )
871  __kmp_pop_workshare( gtid, ct_psingle, NULL );
872 }
873 
874 
875 /*
876  * determine if we can go parallel or must use a serialized parallel region and
877  * how many threads we can use
878  * set_nproc is the number of threads requested for the team
879  * returns 0 if we should serialize or only use one thread,
880  * otherwise the number of threads to use
881  * The forkjoin lock is held by the caller.
882  */
883 static int
884 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
885  int master_tid, int set_nthreads
886 #if OMP_40_ENABLED
887  , int enter_teams
888 #endif /* OMP_40_ENABLED */
889 )
890 {
891  int capacity;
892  int new_nthreads;
893  KMP_DEBUG_ASSERT( __kmp_init_serial );
894  KMP_DEBUG_ASSERT( root && parent_team );
895 
896  //
897  // If dyn-var is set, dynamically adjust the number of desired threads,
898  // according to the method specified by dynamic_mode.
899  //
900  new_nthreads = set_nthreads;
901  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
902  ;
903  }
904 #ifdef USE_LOAD_BALANCE
905  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
906  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
907  if ( new_nthreads == 1 ) {
908  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
909  master_tid ));
910  return 1;
911  }
912  if ( new_nthreads < set_nthreads ) {
913  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
914  master_tid, new_nthreads ));
915  }
916  }
917 #endif /* USE_LOAD_BALANCE */
918  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
919  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
920  : root->r.r_hot_team->t.t_nproc);
921  if ( new_nthreads <= 1 ) {
922  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
923  master_tid ));
924  return 1;
925  }
926  if ( new_nthreads < set_nthreads ) {
927  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
928  master_tid, new_nthreads ));
929  }
930  else {
931  new_nthreads = set_nthreads;
932  }
933  }
934  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
935  if ( set_nthreads > 2 ) {
936  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
937  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
938  if ( new_nthreads == 1 ) {
939  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
940  master_tid ));
941  return 1;
942  }
943  if ( new_nthreads < set_nthreads ) {
944  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
945  master_tid, new_nthreads ));
946  }
947  }
948  }
949  else {
950  KMP_ASSERT( 0 );
951  }
952 
953  //
954  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
955  //
956  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
957  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
958  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
959  root->r.r_hot_team->t.t_nproc );
960  if ( tl_nthreads <= 0 ) {
961  tl_nthreads = 1;
962  }
963 
964  //
965  // If dyn-var is false, emit a 1-time warning.
966  //
967  if ( ! get__dynamic_2( parent_team, master_tid )
968  && ( ! __kmp_reserve_warn ) ) {
969  __kmp_reserve_warn = 1;
970  __kmp_msg(
971  kmp_ms_warning,
972  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
973  KMP_HNT( Unset_ALL_THREADS ),
974  __kmp_msg_null
975  );
976  }
977  if ( tl_nthreads == 1 ) {
978  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
979  master_tid ));
980  return 1;
981  }
982  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
983  master_tid, tl_nthreads ));
984  new_nthreads = tl_nthreads;
985  }
986 
987 
988  //
989  // Check if the threads array is large enough, or needs expanding.
990  //
991  // See comment in __kmp_register_root() about the adjustment if
992  // __kmp_threads[0] == NULL.
993  //
994  capacity = __kmp_threads_capacity;
995  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
996  --capacity;
997  }
998  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
999  root->r.r_hot_team->t.t_nproc ) > capacity ) {
1000  //
1001  // Expand the threads array.
1002  //
1003  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
1004  root->r.r_hot_team->t.t_nproc ) - capacity;
1005  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
1006  if ( slotsAdded < slotsRequired ) {
1007  //
1008  // The threads array was not expanded enough.
1009  //
1010  new_nthreads -= ( slotsRequired - slotsAdded );
1011  KMP_ASSERT( new_nthreads >= 1 );
1012 
1013  //
1014  // If dyn-var is false, emit a 1-time warning.
1015  //
1016  if ( ! get__dynamic_2( parent_team, master_tid )
1017  && ( ! __kmp_reserve_warn ) ) {
1018  __kmp_reserve_warn = 1;
1019  if ( __kmp_tp_cached ) {
1020  __kmp_msg(
1021  kmp_ms_warning,
1022  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1023  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
1024  KMP_HNT( PossibleSystemLimitOnThreads ),
1025  __kmp_msg_null
1026  );
1027  }
1028  else {
1029  __kmp_msg(
1030  kmp_ms_warning,
1031  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1032  KMP_HNT( SystemLimitOnThreads ),
1033  __kmp_msg_null
1034  );
1035  }
1036  }
1037  }
1038  }
1039 
1040  if ( new_nthreads == 1 ) {
1041  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
1042  __kmp_get_gtid(), set_nthreads ) );
1043  return 1;
1044  }
1045 
1046  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
1047  __kmp_get_gtid(), new_nthreads, set_nthreads ));
1048  return new_nthreads;
1049 }
1050 
1051 /* ------------------------------------------------------------------------ */
1052 /* ------------------------------------------------------------------------ */
1053 
1054 /* allocate threads from the thread pool and assign them to the new team */
1055 /* we are assured that there are enough threads available, because we
1056  * checked on that earlier within critical section forkjoin */
1057 
1058 static void
1059 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1060  kmp_info_t *master_th, int master_gtid )
1061 {
1062  int i;
1063  int use_hot_team;
1064 
1065  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1066  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1067  KMP_MB();
1068 
1069  /* first, let's setup the master thread */
1070  master_th->th.th_info.ds.ds_tid = 0;
1071  master_th->th.th_team = team;
1072  master_th->th.th_team_nproc = team->t.t_nproc;
1073  master_th->th.th_team_master = master_th;
1074  master_th->th.th_team_serialized = FALSE;
1075  master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ];
1076 
1077  /* make sure we are not the optimized hot team */
1078 #if KMP_NESTED_HOT_TEAMS
1079  use_hot_team = 0;
1080  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1081  if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1082  int level = team->t.t_active_level - 1; // index in array of hot teams
1083  if( master_th->th.th_teams_microtask ) { // are we inside the teams?
1084  if( master_th->th.th_teams_size.nteams > 1 ) {
1085  ++level; // level was not increased in teams construct for team_of_masters
1086  }
1087  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1088  master_th->th.th_teams_level == team->t.t_level ) {
1089  ++level; // level was not increased in teams construct for team_of_workers before the parallel
1090  } // team->t.t_level will be increased inside parallel
1091  }
1092  if( level < __kmp_hot_teams_max_level ) {
1093  if( hot_teams[level].hot_team ) {
1094  // hot team has already been allocated for given level
1095  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1096  use_hot_team = 1; // the team is ready to use
1097  } else {
1098  use_hot_team = 0; // AC: threads are not allocated yet
1099  hot_teams[level].hot_team = team; // remember new hot team
1100  hot_teams[level].hot_team_nth = team->t.t_nproc;
1101  }
1102  } else {
1103  use_hot_team = 0;
1104  }
1105  }
1106 #else
1107  use_hot_team = team == root->r.r_hot_team;
1108 #endif
1109  if ( !use_hot_team ) {
1110 
1111  /* install the master thread */
1112  team->t.t_threads[ 0 ] = master_th;
1113  __kmp_initialize_info( master_th, team, 0, master_gtid );
1114 
1115  /* now, install the worker threads */
1116  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
1117 
1118  /* fork or reallocate a new thread and install it in team */
1119  kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1120  team->t.t_threads[ i ] = thr;
1121  KMP_DEBUG_ASSERT( thr );
1122  KMP_DEBUG_ASSERT( thr->th.th_team == team );
1123  /* align team and thread arrived states */
1124  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%llu, plain=%llu\n",
1125  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1126  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1127  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1128  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1129 #if OMP_40_ENABLED
1130  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1131  thr->th.th_teams_level = master_th->th.th_teams_level;
1132  thr->th.th_teams_size = master_th->th.th_teams_size;
1133 #endif
1134  { // Initialize threads' barrier data.
1135  int b;
1136  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1137  for ( b = 0; b < bs_last_barrier; ++ b ) {
1138  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
1139  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1140 #if USE_DEBUGGER
1141  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1142 #endif
1143  }; // for b
1144  }
1145  }
1146 
1147 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1148  __kmp_partition_places( team );
1149 #endif
1150 
1151  }
1152 
1153  KMP_MB();
1154 }
1155 
1156 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1157 //
1158 // Propagate any changes to the floating point control registers out to the team
1159 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1160 // so we don't make changes unless they are needed.
1161 //
1162 inline static void
1163 propagateFPControl(kmp_team_t * team)
1164 {
1165  if ( __kmp_inherit_fp_control ) {
1166  kmp_int16 x87_fpu_control_word;
1167  kmp_uint32 mxcsr;
1168 
1169  // Get master values of FPU control flags (both X87 and vector)
1170  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1171  __kmp_store_mxcsr( &mxcsr );
1172  mxcsr &= KMP_X86_MXCSR_MASK;
1173 
1174  // There is no point looking at t_fp_control_saved here.
1175  // If it is TRUE, we still have to update the values if they are different from those we now have.
1176  // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1177  // that the values in the team are the same as those we have.
1178  // So, this code achieves what we need whether or not t_fp_control_saved is true.
1179  // By checking whether the value needs updating we avoid unnecessary writes that would put the
1180  // cache-line into a written state, causing all threads in the team to have to read it again.
1181  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1182  team->t.t_x87_fpu_control_word = x87_fpu_control_word;
1183  }
1184  if ( team->t.t_mxcsr != mxcsr ) {
1185  team->t.t_mxcsr = mxcsr;
1186  }
1187  // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1188  // So we must ensure it is correct.
1189  if (!team->t.t_fp_control_saved) {
1190  team->t.t_fp_control_saved = TRUE;
1191  }
1192  }
1193  else {
1194  // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1195  if (team->t.t_fp_control_saved)
1196  team->t.t_fp_control_saved = FALSE;
1197  }
1198 }
1199 
1200 // Do the opposite, setting the hardware registers to the updated values from the team.
1201 inline static void
1202 updateHWFPControl(kmp_team_t * team)
1203 {
1204  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1205  //
1206  // Only reset the fp control regs if they have been changed in the team.
1207  // the parallel region that we are exiting.
1208  //
1209  kmp_int16 x87_fpu_control_word;
1210  kmp_uint32 mxcsr;
1211  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1212  __kmp_store_mxcsr( &mxcsr );
1213  mxcsr &= KMP_X86_MXCSR_MASK;
1214 
1215  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1216  __kmp_clear_x87_fpu_status_word();
1217  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1218  }
1219 
1220  if ( team->t.t_mxcsr != mxcsr ) {
1221  __kmp_load_mxcsr( &team->t.t_mxcsr );
1222  }
1223  }
1224 }
1225 #else
1226 # define propagateFPControl(x) ((void)0)
1227 # define updateHWFPControl(x) ((void)0)
1228 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1229 
1230 static void
1231 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1232 
1233 /*
1234  * Run a parallel region that has been serialized, so runs only in a team of the single master thread.
1235  */
1236 void
1237 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1238 {
1239  kmp_info_t *this_thr;
1240  kmp_team_t *serial_team;
1241 
1242  KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1243 
1244  /* Skip all this code for autopar serialized loops since it results in
1245  unacceptable overhead */
1246  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1247  return;
1248 
1249  if( ! TCR_4( __kmp_init_parallel ) )
1250  __kmp_parallel_initialize();
1251 
1252  this_thr = __kmp_threads[ global_tid ];
1253  serial_team = this_thr->th.th_serial_team;
1254 
1255  /* utilize the serialized team held by this thread */
1256  KMP_DEBUG_ASSERT( serial_team );
1257  KMP_MB();
1258 
1259  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1260  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1261  KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1262  KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1263  global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1264  this_thr->th.th_task_team = NULL;
1265  }
1266 
1267 #if OMP_40_ENABLED
1268  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1269  if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1270  proc_bind = proc_bind_false;
1271  }
1272  else if ( proc_bind == proc_bind_default ) {
1273  //
1274  // No proc_bind clause was specified, so use the current value
1275  // of proc-bind-var for this parallel region.
1276  //
1277  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1278  }
1279  //
1280  // Reset for next parallel region
1281  //
1282  this_thr->th.th_set_proc_bind = proc_bind_default;
1283 #endif /* OMP_40_ENABLED */
1284 
1285  if( this_thr->th.th_team != serial_team ) {
1286  // Nested level will be an index in the nested nthreads array
1287  int level = this_thr->th.th_team->t.t_level;
1288 
1289  if( serial_team->t.t_serialized ) {
1290  /* this serial team was already used
1291  * TODO increase performance by making this locks more specific */
1292  kmp_team_t *new_team;
1293  int tid = this_thr->th.th_info.ds.ds_tid;
1294 
1295  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1296 
1297 #if OMPT_SUPPORT
1298  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1299 #endif
1300 
1301  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1302 #if OMPT_SUPPORT
1303  ompt_parallel_id,
1304 #endif
1305 #if OMP_40_ENABLED
1306  proc_bind,
1307 #endif
1308  & this_thr->th.th_current_task->td_icvs,
1309  0 USE_NESTED_HOT_ARG(NULL) );
1310  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1311  KMP_ASSERT( new_team );
1312 
1313  /* setup new serialized team and install it */
1314  new_team->t.t_threads[0] = this_thr;
1315  new_team->t.t_parent = this_thr->th.th_team;
1316  serial_team = new_team;
1317  this_thr->th.th_serial_team = serial_team;
1318 
1319  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1320  global_tid, serial_team ) );
1321 
1322 
1323  /* TODO the above breaks the requirement that if we run out of
1324  * resources, then we can still guarantee that serialized teams
1325  * are ok, since we may need to allocate a new one */
1326  } else {
1327  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1328  global_tid, serial_team ) );
1329  }
1330 
1331  /* we have to initialize this serial team */
1332  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1333  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1334  KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1335  serial_team->t.t_ident = loc;
1336  serial_team->t.t_serialized = 1;
1337  serial_team->t.t_nproc = 1;
1338  serial_team->t.t_parent = this_thr->th.th_team;
1339  serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1340  this_thr->th.th_team = serial_team;
1341  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1342 
1343  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#d curtask=%p\n",
1344  global_tid, this_thr->th.th_current_task ) );
1345  KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1346  this_thr->th.th_current_task->td_flags.executing = 0;
1347 
1348  __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1349 
1350  /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
1351  each serialized task represented by team->t.t_serialized? */
1352  copy_icvs(
1353  & this_thr->th.th_current_task->td_icvs,
1354  & this_thr->th.th_current_task->td_parent->td_icvs );
1355 
1356  // Thread value exists in the nested nthreads array for the next nested level
1357  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1358  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1359  }
1360 
1361 #if OMP_40_ENABLED
1362  if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1363  this_thr->th.th_current_task->td_icvs.proc_bind
1364  = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1365  }
1366 #endif /* OMP_40_ENABLED */
1367 
1368 #if USE_DEBUGGER
1369  serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1370 #endif
1371  this_thr->th.th_info.ds.ds_tid = 0;
1372 
1373  /* set thread cache values */
1374  this_thr->th.th_team_nproc = 1;
1375  this_thr->th.th_team_master = this_thr;
1376  this_thr->th.th_team_serialized = 1;
1377 
1378  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1379  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1380 
1381  propagateFPControl (serial_team);
1382 
1383  /* check if we need to allocate dispatch buffers stack */
1384  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1385  if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1386  serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1387  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1388  }
1389  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1390 
1391 #if OMPT_SUPPORT
1392  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1393  __ompt_team_assign_id(serial_team, ompt_parallel_id);
1394 #endif
1395 
1396  KMP_MB();
1397 
1398  } else {
1399  /* this serialized team is already being used,
1400  * that's fine, just add another nested level */
1401  KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1402  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1403  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1404  ++ serial_team->t.t_serialized;
1405  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1406 
1407  // Nested level will be an index in the nested nthreads array
1408  int level = this_thr->th.th_team->t.t_level;
1409  // Thread value exists in the nested nthreads array for the next nested level
1410  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1411  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1412  }
1413  serial_team->t.t_level++;
1414  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1415  global_tid, serial_team, serial_team->t.t_level ) );
1416 
1417  /* allocate/push dispatch buffers stack */
1418  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1419  {
1420  dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1421  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1422  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1423  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1424  }
1425  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1426 
1427  KMP_MB();
1428  }
1429 
1430  if ( __kmp_env_consistency_check )
1431  __kmp_push_parallel( global_tid, NULL );
1432 
1433 #if USE_ITT_BUILD
1434  // Mark the start of the "parallel" region for VTune. Only use one of frame notification scheme at the moment
1435  if ( serial_team->t.t_level == 1
1436 #if OMP_40_ENABLED
1437  && this_thr->th.th_teams_microtask == NULL
1438 #endif
1439  ) {
1440 #if USE_ITT_NOTIFY
1441  // Save the start of the "parallel" region for VTune. This is the frame begin at the same time.
1442  if ( ( __itt_get_timestamp_ptr || KMP_ITT_DEBUG ) &&
1443  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
1444  {
1445  serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp();
1446  } else // only one notification scheme (either "submit" or "forking/joined", not both)
1447 #endif
1448  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
1449  __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode )
1450  {
1451  this_thr->th.th_ident = loc;
1452  // 0 - no barriers; 1 - serialized parallel
1453  __kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 );
1454  }
1455  }
1456 #endif /* USE_ITT_BUILD */
1457 }
1458 
1459 /* most of the work for a fork */
1460 /* return true if we really went parallel, false if serialized */
1461 int
1462 __kmp_fork_call(
1463  ident_t * loc,
1464  int gtid,
1465  enum fork_context_e call_context, // Intel, GNU, ...
1466  kmp_int32 argc,
1467 #if OMPT_SUPPORT
1468  void *unwrapped_task,
1469 #endif
1470  microtask_t microtask,
1471  launch_t invoker,
1472 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1473 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1474  va_list * ap
1475 #else
1476  va_list ap
1477 #endif
1478  )
1479 {
1480  void **argv;
1481  int i;
1482  int master_tid;
1483  int master_this_cons;
1484  kmp_team_t *team;
1485  kmp_team_t *parent_team;
1486  kmp_info_t *master_th;
1487  kmp_root_t *root;
1488  int nthreads;
1489  int master_active;
1490  int master_set_numthreads;
1491  int level;
1492 #if OMP_40_ENABLED
1493  int active_level;
1494  int teams_level;
1495 #endif
1496 #if KMP_NESTED_HOT_TEAMS
1497  kmp_hot_team_ptr_t **p_hot_teams;
1498 #endif
1499  { // KMP_TIME_BLOCK
1500  KMP_TIME_BLOCK(KMP_fork_call);
1501 
1502  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1503  if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
1504  /* Some systems prefer the stack for the root thread(s) to start with */
1505  /* some gap from the parent stack to prevent false sharing. */
1506  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1507  /* These 2 lines below are so this does not get optimized out */
1508  if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1509  __kmp_stkpadding += (short)((kmp_int64)dummy);
1510  }
1511 
1512  /* initialize if needed */
1513  KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1514  if( ! TCR_4(__kmp_init_parallel) )
1515  __kmp_parallel_initialize();
1516 
1517  /* setup current data */
1518  master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1519  parent_team = master_th->th.th_team;
1520  master_tid = master_th->th.th_info.ds.ds_tid;
1521  master_this_cons = master_th->th.th_local.this_construct;
1522  root = master_th->th.th_root;
1523  master_active = root->r.r_active;
1524  master_set_numthreads = master_th->th.th_set_nproc;
1525 
1526 #if OMPT_SUPPORT
1527  ompt_parallel_id_t ompt_parallel_id;
1528  ompt_task_id_t ompt_task_id;
1529  ompt_frame_t *ompt_frame;
1530  ompt_task_id_t my_task_id;
1531  ompt_parallel_id_t my_parallel_id;
1532 
1533  if (ompt_status & ompt_status_track) {
1534  ompt_parallel_id = __ompt_parallel_id_new(gtid);
1535  ompt_task_id = __ompt_get_task_id_internal(0);
1536  ompt_frame = __ompt_get_task_frame_internal(0);
1537  }
1538 #endif
1539 
1540  // Nested level will be an index in the nested nthreads array
1541  level = parent_team->t.t_level;
1542 #if OMP_40_ENABLED
1543  active_level = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed
1544  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
1545 #endif
1546 #if KMP_NESTED_HOT_TEAMS
1547  p_hot_teams = &master_th->th.th_hot_teams;
1548  if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1549  *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1550  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1551  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1552  (*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0)
1553  }
1554 #endif
1555 
1556 #if USE_DEBUGGER
1557  if ( __kmp_debugging ) { // Let debugger override number of threads.
1558  int nth = __kmp_omp_num_threads( loc );
1559  if ( nth > 0 ) { // 0 means debugger does not want to change number of threads.
1560  master_set_numthreads = nth;
1561  }; // if
1562  }; // if
1563 #endif
1564 
1565 #if OMPT_SUPPORT
1566  if ((ompt_status == ompt_status_track_callback) &&
1567  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1568  int team_size = master_set_numthreads;
1569 
1570  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1571  ompt_task_id, ompt_frame, ompt_parallel_id,
1572  team_size, unwrapped_task);
1573  }
1574 #endif
1575 
1576  master_th->th.th_ident = loc;
1577 
1578 #if OMP_40_ENABLED
1579  if ( master_th->th.th_teams_microtask &&
1580  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1581  // AC: This is start of parallel that is nested inside teams construct.
1582  // The team is actual (hot), all workers are ready at the fork barrier.
1583  // No lock needed to initialize the team a bit, then free workers.
1584  parent_team->t.t_ident = loc;
1585  parent_team->t.t_argc = argc;
1586  argv = (void**)parent_team->t.t_argv;
1587  for( i=argc-1; i >= 0; --i )
1588 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1589 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1590  *argv++ = va_arg( *ap, void * );
1591 #else
1592  *argv++ = va_arg( ap, void * );
1593 #endif
1594  /* Increment our nested depth levels, but not increase the serialization */
1595  if ( parent_team == master_th->th.th_serial_team ) {
1596  // AC: we are in serialized parallel
1597  __kmpc_serialized_parallel(loc, gtid);
1598  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
1599  parent_team->t.t_serialized--; // AC: need this in order enquiry functions
1600  // work correctly, will restore at join time
1601 
1602 #if OMPT_SUPPORT
1603  void *dummy;
1604  void **exit_runtime_p;
1605 
1606  ompt_lw_taskteam_t lw_taskteam;
1607 
1608  if (ompt_status & ompt_status_track) {
1609  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1610  unwrapped_task, ompt_parallel_id);
1611  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1612  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1613 
1614  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1615 
1616 #if OMPT_TRACE
1617  /* OMPT implicit task begin */
1618  my_task_id = lw_taskteam.ompt_task_info.task_id;
1619  my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1620  if ((ompt_status == ompt_status_track_callback) &&
1621  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1622  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1623  my_parallel_id, my_task_id);
1624  }
1625 #endif
1626 
1627  /* OMPT state */
1628  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1629  } else {
1630  exit_runtime_p = &dummy;
1631  }
1632 #endif
1633 
1634  KMP_TIME_BLOCK(OMP_work);
1635  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1636 #if OMPT_SUPPORT
1637  , exit_runtime_p
1638 #endif
1639  );
1640 
1641 #if OMPT_SUPPORT
1642  if (ompt_status & ompt_status_track) {
1643 #if OMPT_TRACE
1644  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1645 
1646  if ((ompt_status == ompt_status_track_callback) &&
1647  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1648  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1649  ompt_parallel_id, ompt_task_id);
1650  }
1651 
1652  __ompt_lw_taskteam_unlink(master_th);
1653  // reset clear the task id only after unlinking the task
1654  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1655 #endif
1656 
1657  if ((ompt_status == ompt_status_track_callback) &&
1658  ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1659  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1660  ompt_parallel_id, ompt_task_id);
1661  }
1662  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1663  }
1664 #endif
1665  return TRUE;
1666  }
1667 
1668  parent_team->t.t_pkfn = microtask;
1669 #if OMPT_SUPPORT
1670  parent_team->t.ompt_team_info.microtask = unwrapped_task;
1671 #endif
1672  parent_team->t.t_invoke = invoker;
1673  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1674  parent_team->t.t_active_level ++;
1675  parent_team->t.t_level ++;
1676 
1677  /* Change number of threads in the team if requested */
1678  if ( master_set_numthreads ) { // The parallel has num_threads clause
1679  if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
1680  // AC: only can reduce the number of threads dynamically, cannot increase
1681  kmp_info_t **other_threads = parent_team->t.t_threads;
1682  parent_team->t.t_nproc = master_set_numthreads;
1683  for ( i = 0; i < master_set_numthreads; ++i ) {
1684  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1685  }
1686  // Keep extra threads hot in the team for possible next parallels
1687  }
1688  master_th->th.th_set_nproc = 0;
1689  }
1690 
1691 
1692  KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1693  __kmp_internal_fork( loc, gtid, parent_team );
1694  KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1695 
1696  /* Invoke microtask for MASTER thread */
1697  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1698  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1699 
1700  {
1701  KMP_TIME_BLOCK(OMP_work);
1702  if (! parent_team->t.t_invoke( gtid )) {
1703  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1704  }
1705  }
1706  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1707  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1708  KMP_MB(); /* Flush all pending memory write invalidates. */
1709 
1710  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1711 
1712  return TRUE;
1713  } // Parallel closely nested in teams construct
1714 #endif /* OMP_40_ENABLED */
1715 
1716 #if KMP_DEBUG
1717  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1718  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1719  }
1720 #endif
1721 
1722  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1723  nthreads = 1;
1724  } else {
1725 #if OMP_40_ENABLED
1726  int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
1727 #endif
1728  nthreads = master_set_numthreads ?
1729  master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1730 
1731  // Check if we need to take forkjoin lock? (no need for serialized parallel out of teams construct).
1732  // This code moved here from __kmp_reserve_threads() to speedup nested serialized parallels.
1733  if (nthreads > 1) {
1734  if ( ( !get__nested(master_th) && (root->r.r_in_parallel
1735 #if OMP_40_ENABLED
1736  && !enter_teams
1737 #endif /* OMP_40_ENABLED */
1738  ) ) || ( __kmp_library == library_serial ) ) {
1739  KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1740  gtid, nthreads ));
1741  nthreads = 1;
1742  }
1743  }
1744  if ( nthreads > 1 ) {
1745  /* determine how many new threads we can use */
1746  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1747 
1748  nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1749 #if OMP_40_ENABLED
1750 /* AC: If we execute teams from parallel region (on host), then teams should be created
1751  but each can only have 1 thread if nesting is disabled. If teams called from serial region,
1752  then teams and their threads should be created regardless of the nesting setting. */
1753  , enter_teams
1754 #endif /* OMP_40_ENABLED */
1755  );
1756  if ( nthreads == 1 ) {
1757  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1758  }
1759  }
1760  }
1761  KMP_DEBUG_ASSERT( nthreads > 0 );
1762 
1763  /* If we temporarily changed the set number of threads then restore it now */
1764  master_th->th.th_set_nproc = 0;
1765 
1766 
1767  /* create a serialized parallel region? */
1768  if ( nthreads == 1 ) {
1769  /* josh todo: hypothetical question: what do we do for OS X*? */
1770 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1771  void * args[ argc ];
1772 #else
1773  void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1774 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
1775 
1776  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1777 
1778  __kmpc_serialized_parallel(loc, gtid);
1779 
1780  if ( call_context == fork_context_intel ) {
1781  /* TODO this sucks, use the compiler itself to pass args! :) */
1782  master_th->th.th_serial_team->t.t_ident = loc;
1783 #if OMP_40_ENABLED
1784  if ( !ap ) {
1785  // revert change made in __kmpc_serialized_parallel()
1786  master_th->th.th_serial_team->t.t_level--;
1787  // Get args from parent team for teams construct
1788 
1789 #if OMPT_SUPPORT
1790  void *dummy;
1791  void **exit_runtime_p;
1792 
1793  ompt_lw_taskteam_t lw_taskteam;
1794 
1795  if (ompt_status & ompt_status_track) {
1796  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1797  unwrapped_task, ompt_parallel_id);
1798  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1799  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1800 
1801  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1802 
1803 #if OMPT_TRACE
1804  my_task_id = lw_taskteam.ompt_task_info.task_id;
1805  if ((ompt_status == ompt_status_track_callback) &&
1806  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1807  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1808  ompt_parallel_id, my_task_id);
1809  }
1810 #endif
1811 
1812  /* OMPT state */
1813  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1814  } else {
1815  exit_runtime_p = &dummy;
1816  }
1817 #endif
1818 
1819  {
1820  KMP_TIME_BLOCK(OMP_work);
1821  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1822 #if OMPT_SUPPORT
1823  , exit_runtime_p
1824 #endif
1825  );
1826  }
1827 
1828 #if OMPT_SUPPORT
1829  if (ompt_status & ompt_status_track) {
1830  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1831 
1832 #if OMPT_TRACE
1833  if ((ompt_status == ompt_status_track_callback) &&
1834  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1835  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1836  ompt_parallel_id, ompt_task_id);
1837  }
1838 #endif
1839 
1840  __ompt_lw_taskteam_unlink(master_th);
1841  // reset clear the task id only after unlinking the task
1842  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1843 
1844  if ((ompt_status == ompt_status_track_callback) &&
1845  ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1846  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1847  ompt_parallel_id, ompt_task_id);
1848  }
1849  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1850  }
1851 #endif
1852  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1853  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1854  team = master_th->th.th_team;
1855  //team->t.t_pkfn = microtask;
1856  team->t.t_invoke = invoker;
1857  __kmp_alloc_argv_entries( argc, team, TRUE );
1858  team->t.t_argc = argc;
1859  argv = (void**) team->t.t_argv;
1860  if ( ap ) {
1861  for( i=argc-1; i >= 0; --i )
1862 // TODO: revert workaround for Intel(R) 64 tracker #96
1863 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1864  *argv++ = va_arg( *ap, void * );
1865 # else
1866  *argv++ = va_arg( ap, void * );
1867 # endif
1868  } else {
1869  for( i=0; i < argc; ++i )
1870  // Get args from parent team for teams construct
1871  argv[i] = parent_team->t.t_argv[i];
1872  }
1873  // AC: revert change made in __kmpc_serialized_parallel()
1874  // because initial code in teams should have level=0
1875  team->t.t_level--;
1876  // AC: call special invoker for outer "parallel" of the teams construct
1877  {
1878  KMP_TIME_BLOCK(OMP_work);
1879  invoker(gtid);
1880  }
1881  } else {
1882 #endif /* OMP_40_ENABLED */
1883  argv = args;
1884  for( i=argc-1; i >= 0; --i )
1885 // TODO: revert workaround for Intel(R) 64 tracker #96
1886 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1887  *argv++ = va_arg( *ap, void * );
1888 #else
1889  *argv++ = va_arg( ap, void * );
1890 #endif
1891  KMP_MB();
1892 
1893 #if OMPT_SUPPORT
1894  void *dummy;
1895  void **exit_runtime_p;
1896 
1897  ompt_lw_taskteam_t lw_taskteam;
1898 
1899  if (ompt_status & ompt_status_track) {
1900  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1901  unwrapped_task, ompt_parallel_id);
1902  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1903  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1904 
1905  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1906 
1907 #if OMPT_TRACE
1908  /* OMPT implicit task begin */
1909  my_task_id = lw_taskteam.ompt_task_info.task_id;
1910  my_parallel_id = ompt_parallel_id;
1911  if ((ompt_status == ompt_status_track_callback) &&
1912  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1913  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1914  my_parallel_id, my_task_id);
1915  }
1916 #endif
1917 
1918  /* OMPT state */
1919  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1920  } else {
1921  exit_runtime_p = &dummy;
1922  }
1923 #endif
1924 
1925  {
1926  KMP_TIME_BLOCK(OMP_work);
1927  __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1928 #if OMPT_SUPPORT
1929  , exit_runtime_p
1930 #endif
1931  );
1932  }
1933 
1934 #if OMPT_SUPPORT
1935  if (ompt_status & ompt_status_track) {
1936 #if OMPT_TRACE
1937  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1938 
1939  if ((ompt_status == ompt_status_track_callback) &&
1940  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1941  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1942  my_parallel_id, my_task_id);
1943  }
1944 #endif
1945 
1946  __ompt_lw_taskteam_unlink(master_th);
1947  // reset clear the task id only after unlinking the task
1948  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1949 
1950  if ((ompt_status == ompt_status_track_callback) &&
1951  ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1952  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1953  ompt_parallel_id, ompt_task_id);
1954  }
1955  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1956  }
1957 #endif
1958 #if OMP_40_ENABLED
1959  }
1960 #endif /* OMP_40_ENABLED */
1961  }
1962  else if ( call_context == fork_context_gnu ) {
1963 #if OMPT_SUPPORT
1964  ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1965  __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1966  __ompt_lw_taskteam_init(lwt, master_th, gtid,
1967  unwrapped_task, ompt_parallel_id);
1968 
1969  lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1970  lwt->ompt_task_info.frame.exit_runtime_frame = 0;
1971  __ompt_lw_taskteam_link(lwt, master_th);
1972 #endif
1973 
1974  // we were called from GNU native code
1975  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1976  return FALSE;
1977  }
1978  else {
1979  KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1980  }
1981 
1982 
1983  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1984  KMP_MB();
1985  return FALSE;
1986  }
1987 
1988  // GEH: only modify the executing flag in the case when not serialized
1989  // serialized case is handled in kmpc_serialized_parallel
1990  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1991  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1992  master_th->th.th_current_task->td_icvs.max_active_levels ) );
1993  // TODO: GEH - cannot do this assertion because root thread not set up as executing
1994  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1995  master_th->th.th_current_task->td_flags.executing = 0;
1996 
1997 #if OMP_40_ENABLED
1998  if ( !master_th->th.th_teams_microtask || level > teams_level )
1999 #endif /* OMP_40_ENABLED */
2000  {
2001  /* Increment our nested depth level */
2002  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
2003  }
2004 
2005  // See if we need to make a copy of the ICVs.
2006  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2007  if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
2008  nthreads_icv = __kmp_nested_nth.nth[level+1];
2009  }
2010  else {
2011  nthreads_icv = 0; // don't update
2012  }
2013 
2014 #if OMP_40_ENABLED
2015  // Figure out the proc_bind_policy for the new team.
2016  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2017  kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
2018  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
2019  proc_bind = proc_bind_false;
2020  }
2021  else {
2022  if (proc_bind == proc_bind_default) {
2023  // No proc_bind clause specified; use current proc-bind-var for this parallel region
2024  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2025  }
2026  /* else: The proc_bind policy was specified explicitly on parallel clause. This
2027  overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
2028  // Figure the value of proc-bind-var for the child threads.
2029  if ((level+1 < __kmp_nested_proc_bind.used)
2030  && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
2031  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
2032  }
2033  }
2034 
2035  // Reset for next parallel region
2036  master_th->th.th_set_proc_bind = proc_bind_default;
2037 #endif /* OMP_40_ENABLED */
2038 
2039  if ((nthreads_icv > 0)
2040 #if OMP_40_ENABLED
2041  || (proc_bind_icv != proc_bind_default)
2042 #endif /* OMP_40_ENABLED */
2043  ) {
2044  kmp_internal_control_t new_icvs;
2045  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2046  new_icvs.next = NULL;
2047  if (nthreads_icv > 0) {
2048  new_icvs.nproc = nthreads_icv;
2049  }
2050 
2051 #if OMP_40_ENABLED
2052  if (proc_bind_icv != proc_bind_default) {
2053  new_icvs.proc_bind = proc_bind_icv;
2054  }
2055 #endif /* OMP_40_ENABLED */
2056 
2057  /* allocate a new parallel team */
2058  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2059  team = __kmp_allocate_team(root, nthreads, nthreads,
2060 #if OMPT_SUPPORT
2061  ompt_parallel_id,
2062 #endif
2063 #if OMP_40_ENABLED
2064  proc_bind,
2065 #endif
2066  &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
2067  } else {
2068  /* allocate a new parallel team */
2069  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2070  team = __kmp_allocate_team(root, nthreads, nthreads,
2071 #if OMPT_SUPPORT
2072  ompt_parallel_id,
2073 #endif
2074 #if OMP_40_ENABLED
2075  proc_bind,
2076 #endif
2077  &master_th->th.th_current_task->td_icvs, argc
2078  USE_NESTED_HOT_ARG(master_th) );
2079  }
2080  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
2081 
2082  /* setup the new team */
2083  team->t.t_master_tid = master_tid;
2084  team->t.t_master_this_cons = master_this_cons;
2085  team->t.t_ident = loc;
2086  team->t.t_parent = parent_team;
2087  TCW_SYNC_PTR(team->t.t_pkfn, microtask);
2088 #if OMPT_SUPPORT
2089  TCW_SYNC_PTR(team->t.ompt_team_info.microtask, unwrapped_task);
2090 #endif
2091  team->t.t_invoke = invoker; /* TODO move this to root, maybe */
2092  // TODO: parent_team->t.t_level == INT_MAX ???
2093 #if OMP_40_ENABLED
2094  if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2095 #endif /* OMP_40_ENABLED */
2096  team->t.t_level = parent_team->t.t_level + 1;
2097  team->t.t_active_level = parent_team->t.t_active_level + 1;
2098 #if OMP_40_ENABLED
2099  } else {
2100  // AC: Do not increase parallel level at start of the teams construct
2101  team->t.t_level = parent_team->t.t_level;
2102  team->t.t_active_level = parent_team->t.t_active_level;
2103  }
2104 #endif /* OMP_40_ENABLED */
2105  team->t.t_sched = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule
2106 
2107  // Update the floating point rounding in the team if required.
2108  propagateFPControl(team);
2109 
2110  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2111  // Set master's task team to team's task team. Unless this is hot team, it should be NULL.
2112  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2113  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2114  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2115  parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2116  if (level) {
2117  // Take a memo of master's task_state
2118  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2119  if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2120  kmp_uint8 *old_stack, *new_stack = (kmp_uint8 *) __kmp_allocate( 2*master_th->th.th_task_state_stack_sz );
2121  kmp_uint32 i;
2122  for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2123  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2124  }
2125  old_stack = master_th->th.th_task_state_memo_stack;
2126  master_th->th.th_task_state_memo_stack = new_stack;
2127  master_th->th.th_task_state_stack_sz *= 2;
2128  __kmp_free(old_stack);
2129  }
2130  // Store master's task_state on stack
2131  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2132  master_th->th.th_task_state_top++;
2133  master_th->th.th_task_state = 0;
2134  }
2135  master_th->th.th_task_team = team->t.t_task_team[master_th->th.th_task_state];
2136 
2137 #if !KMP_NESTED_HOT_TEAMS
2138  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2139 #endif
2140  }
2141 
2142  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2143  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2144  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2145  ( team->t.t_master_tid == 0 &&
2146  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2147  KMP_MB();
2148 
2149  /* now, setup the arguments */
2150  argv = (void**)team->t.t_argv;
2151 #if OMP_40_ENABLED
2152  if ( ap ) {
2153 #endif /* OMP_40_ENABLED */
2154  for ( i=argc-1; i >= 0; --i )
2155 // TODO: revert workaround for Intel(R) 64 tracker #96
2156 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2157  *argv++ = va_arg( *ap, void * );
2158 #else
2159  *argv++ = va_arg( ap, void * );
2160 #endif
2161 #if OMP_40_ENABLED
2162  } else {
2163  for ( i=0; i < argc; ++i )
2164  // Get args from parent team for teams construct
2165  argv[i] = team->t.t_parent->t.t_argv[i];
2166  }
2167 #endif /* OMP_40_ENABLED */
2168 
2169  /* now actually fork the threads */
2170  team->t.t_master_active = master_active;
2171  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2172  root->r.r_active = TRUE;
2173 
2174  __kmp_fork_team_threads( root, team, master_th, gtid );
2175  __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2176 
2177 
2178 #if OMPT_SUPPORT
2179  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2180 #endif
2181 
2182  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2183 
2184 
2185 #if USE_ITT_BUILD
2186  if ( team->t.t_active_level == 1 // only report frames at level 1
2187 # if OMP_40_ENABLED
2188  && !master_th->th.th_teams_microtask // not in teams construct
2189 # endif /* OMP_40_ENABLED */
2190  ) {
2191 #if USE_ITT_NOTIFY
2192  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2193  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2194  {
2195  kmp_uint64 tmp_time = 0;
2196  if ( __itt_get_timestamp_ptr )
2197  tmp_time = __itt_get_timestamp();
2198  // Internal fork - report frame begin
2199  master_th->th.th_frame_time = tmp_time;
2200  if ( __kmp_forkjoin_frames_mode == 3 )
2201  team->t.t_region_time = tmp_time;
2202  } else // only one notification scheme (either "submit" or "forking/joined", not both)
2203 #endif /* USE_ITT_NOTIFY */
2204  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2205  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2206  { // Mark start of "parallel" region for VTune.
2207  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2208  }
2209  }
2210 #endif /* USE_ITT_BUILD */
2211 
2212  /* now go on and do the work */
2213  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2214  KMP_MB();
2215  KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2216  root, team, master_th, gtid));
2217 
2218 #if USE_ITT_BUILD
2219  if ( __itt_stack_caller_create_ptr ) {
2220  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2221  }
2222 #endif /* USE_ITT_BUILD */
2223 
2224 #if OMP_40_ENABLED
2225  if ( ap ) // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
2226 #endif /* OMP_40_ENABLED */
2227  {
2228  __kmp_internal_fork( loc, gtid, team );
2229  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2230  root, team, master_th, gtid));
2231  }
2232 
2233  if (call_context == fork_context_gnu) {
2234  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2235  return TRUE;
2236  }
2237 
2238  /* Invoke microtask for MASTER thread */
2239  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2240  gtid, team->t.t_id, team->t.t_pkfn ) );
2241  } // END of timer KMP_fork_call block
2242 
2243  {
2244  //KMP_TIME_BLOCK(OMP_work);
2245  KMP_TIME_BLOCK(USER_master_invoke);
2246  if (! team->t.t_invoke( gtid )) {
2247  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2248  }
2249  }
2250  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2251  gtid, team->t.t_id, team->t.t_pkfn ) );
2252  KMP_MB(); /* Flush all pending memory write invalidates. */
2253 
2254  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2255 
2256 #if OMPT_SUPPORT
2257  if (ompt_status & ompt_status_track) {
2258  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2259  }
2260 #endif
2261 
2262  return TRUE;
2263 }
2264 
2265 #if OMPT_SUPPORT
2266 static inline void
2267 __kmp_join_restore_state(
2268  kmp_info_t *thread,
2269  kmp_team_t *team)
2270 {
2271  // restore state outside the region
2272  thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2273  ompt_state_work_serial : ompt_state_work_parallel);
2274 }
2275 
2276 static inline void
2277 __kmp_join_ompt(
2278  kmp_info_t *thread,
2279  kmp_team_t *team,
2280  ompt_parallel_id_t parallel_id)
2281 {
2282  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2283  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2284  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2285  parallel_id, task_info->task_id);
2286  }
2287 
2288  __kmp_join_restore_state(thread,team);
2289 }
2290 #endif
2291 
2292 void
2293 __kmp_join_call(ident_t *loc, int gtid
2294 #if OMP_40_ENABLED
2295  , int exit_teams
2296 #endif /* OMP_40_ENABLED */
2297 )
2298 {
2299  KMP_TIME_BLOCK(KMP_join_call);
2300  kmp_team_t *team;
2301  kmp_team_t *parent_team;
2302  kmp_info_t *master_th;
2303  kmp_root_t *root;
2304  int master_active;
2305  int i;
2306 
2307  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2308 
2309  /* setup current data */
2310  master_th = __kmp_threads[ gtid ];
2311  root = master_th->th.th_root;
2312  team = master_th->th.th_team;
2313  parent_team = team->t.t_parent;
2314 
2315  master_th->th.th_ident = loc;
2316 
2317 #if OMPT_SUPPORT
2318  if (ompt_status & ompt_status_track) {
2319  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2320  }
2321 #endif
2322 
2323 #if KMP_DEBUG
2324  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2325  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2326  __kmp_gtid_from_thread( master_th ), team,
2327  team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2328  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2329  }
2330 #endif
2331 
2332  if( team->t.t_serialized ) {
2333 #if OMP_40_ENABLED
2334  if ( master_th->th.th_teams_microtask ) {
2335  // We are in teams construct
2336  int level = team->t.t_level;
2337  int tlevel = master_th->th.th_teams_level;
2338  if ( level == tlevel ) {
2339  // AC: we haven't incremented it earlier at start of teams construct,
2340  // so do it here - at the end of teams construct
2341  team->t.t_level++;
2342  } else if ( level == tlevel + 1 ) {
2343  // AC: we are exiting parallel inside teams, need to increment serialization
2344  // in order to restore it in the next call to __kmpc_end_serialized_parallel
2345  team->t.t_serialized++;
2346  }
2347  }
2348 #endif /* OMP_40_ENABLED */
2349  __kmpc_end_serialized_parallel( loc, gtid );
2350 
2351 #if OMPT_SUPPORT
2352  if (ompt_status == ompt_status_track_callback) {
2353  __kmp_join_restore_state(master_th, parent_team);
2354  }
2355 #endif
2356 
2357  return;
2358  }
2359 
2360  master_active = team->t.t_master_active;
2361 
2362 #if OMP_40_ENABLED
2363  if (!exit_teams)
2364 #endif /* OMP_40_ENABLED */
2365  {
2366  // AC: No barrier for internal teams at exit from teams construct.
2367  // But there is barrier for external team (league).
2368  __kmp_internal_join( loc, gtid, team );
2369  }
2370 #if OMP_40_ENABLED
2371  else {
2372  master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
2373  }
2374 #endif /* OMP_40_ENABLED */
2375 
2376  KMP_MB();
2377 
2378 #if OMPT_SUPPORT
2379  ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2380 #endif
2381 
2382 #if USE_ITT_BUILD
2383  if ( __itt_stack_caller_create_ptr ) {
2384  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2385  }
2386 
2387  // Mark end of "parallel" region for VTune.
2388  if ( team->t.t_active_level == 1
2389 # if OMP_40_ENABLED
2390  && !master_th->th.th_teams_microtask /* not in teams construct */
2391 # endif /* OMP_40_ENABLED */
2392  ) {
2393  master_th->th.th_ident = loc;
2394  // only one notification scheme (either "submit" or "forking/joined", not both)
2395  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2396  __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2397  0, loc, master_th->th.th_team_nproc, 1 );
2398  else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2399  ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2400  __kmp_itt_region_joined( gtid );
2401  } // active_level == 1
2402 #endif /* USE_ITT_BUILD */
2403 
2404 #if OMP_40_ENABLED
2405  if ( master_th->th.th_teams_microtask &&
2406  !exit_teams &&
2407  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2408  team->t.t_level == master_th->th.th_teams_level + 1 ) {
2409  // AC: We need to leave the team structure intact at the end
2410  // of parallel inside the teams construct, so that at the next
2411  // parallel same (hot) team works, only adjust nesting levels
2412 
2413  /* Decrement our nested depth level */
2414  team->t.t_level --;
2415  team->t.t_active_level --;
2416  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2417 
2418  /* Restore number of threads in the team if needed */
2419  if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2420  int old_num = master_th->th.th_team_nproc;
2421  int new_num = master_th->th.th_teams_size.nth;
2422  kmp_info_t **other_threads = team->t.t_threads;
2423  kmp_task_team_t * task_team = master_th->th.th_task_team;
2424  team->t.t_nproc = new_num;
2425  if ( task_team ) { // task team might have lesser value of counters
2426  task_team->tt.tt_ref_ct = new_num - 1;
2427  task_team->tt.tt_unfinished_threads = new_num;
2428  }
2429  for ( i = 0; i < old_num; ++i ) {
2430  other_threads[i]->th.th_team_nproc = new_num;
2431  }
2432  // Adjust states of non-used threads of the team
2433  for ( i = old_num; i < new_num; ++i ) {
2434  // Re-initialize thread's barrier data.
2435  int b;
2436  kmp_balign_t * balign = other_threads[i]->th.th_bar;
2437  for ( b = 0; b < bs_last_barrier; ++ b ) {
2438  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2439  KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2440 #if USE_DEBUGGER
2441  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2442 #endif
2443  }
2444  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2445  // Synchronize thread's task state
2446  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2447  }
2448  }
2449  }
2450 
2451 #if OMPT_SUPPORT
2452  if (ompt_status == ompt_status_track_callback) {
2453  __kmp_join_ompt(master_th, parent_team, parallel_id);
2454  }
2455 #endif
2456 
2457  return;
2458  }
2459 #endif /* OMP_40_ENABLED */
2460 
2461  /* do cleanup and restore the parent team */
2462  master_th->th.th_info .ds.ds_tid = team->t.t_master_tid;
2463  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2464 
2465  master_th->th.th_dispatch =
2466  & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2467 
2468  /* jc: The following lock has instructions with REL and ACQ semantics,
2469  separating the parallel user code called in this parallel region
2470  from the serial user code called after this function returns.
2471  */
2472  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2473 
2474 #if OMP_40_ENABLED
2475  if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2476 #endif /* OMP_40_ENABLED */
2477  {
2478  /* Decrement our nested depth level */
2479  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2480  }
2481  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2482 
2483  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2484  0, master_th, team ) );
2485  __kmp_pop_current_task_from_thread( master_th );
2486 
2487 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2488  //
2489  // Restore master thread's partition.
2490  //
2491  master_th->th.th_first_place = team->t.t_first_place;
2492  master_th->th.th_last_place = team->t.t_last_place;
2493 #endif /* OMP_40_ENABLED */
2494 
2495  updateHWFPControl (team);
2496 
2497  if ( root->r.r_active != master_active )
2498  root->r.r_active = master_active;
2499 
2500  __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2501 
2502  /* this race was fun to find. make sure the following is in the critical
2503  * region otherwise assertions may fail occasionally since the old team
2504  * may be reallocated and the hierarchy appears inconsistent. it is
2505  * actually safe to run and won't cause any bugs, but will cause those
2506  * assertion failures. it's only one deref&assign so might as well put this
2507  * in the critical region */
2508  master_th->th.th_team = parent_team;
2509  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2510  master_th->th.th_team_master = parent_team->t.t_threads[0];
2511  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2512 
2513  /* restore serialized team, if need be */
2514  if( parent_team->t.t_serialized &&
2515  parent_team != master_th->th.th_serial_team &&
2516  parent_team != root->r.r_root_team ) {
2517  __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2518  master_th->th.th_serial_team = parent_team;
2519  }
2520 
2521  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2522  // Restore task state from memo stack
2523  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2524  if (master_th->th.th_task_state_top > 0) {
2525  --master_th->th.th_task_state_top; // pop
2526  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2527  }
2528  // Copy the first task team from the new child / old parent team to the thread and reset state flag.
2529  master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2530 
2531  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2532  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2533  parent_team ) );
2534  }
2535 
2536  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2537  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2538  master_th->th.th_current_task->td_flags.executing = 1;
2539 
2540  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2541 
2542 #if OMPT_SUPPORT
2543  if (ompt_status == ompt_status_track_callback) {
2544  __kmp_join_ompt(master_th, parent_team, parallel_id);
2545  }
2546 #endif
2547 
2548  KMP_MB();
2549  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2550 }
2551 
2552 /* ------------------------------------------------------------------------ */
2553 /* ------------------------------------------------------------------------ */
2554 
2555 /* Check whether we should push an internal control record onto the
2556  serial team stack. If so, do it. */
2557 void
2558 __kmp_save_internal_controls ( kmp_info_t * thread )
2559 {
2560 
2561  if ( thread->th.th_team != thread->th.th_serial_team ) {
2562  return;
2563  }
2564  if (thread->th.th_team->t.t_serialized > 1) {
2565  int push = 0;
2566 
2567  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2568  push = 1;
2569  } else {
2570  if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2571  thread->th.th_team->t.t_serialized ) {
2572  push = 1;
2573  }
2574  }
2575  if (push) { /* push a record on the serial team's stack */
2576  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2577 
2578  copy_icvs( control, & thread->th.th_current_task->td_icvs );
2579 
2580  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2581 
2582  control->next = thread->th.th_team->t.t_control_stack_top;
2583  thread->th.th_team->t.t_control_stack_top = control;
2584  }
2585  }
2586 }
2587 
2588 /* Changes set_nproc */
2589 void
2590 __kmp_set_num_threads( int new_nth, int gtid )
2591 {
2592  kmp_info_t *thread;
2593  kmp_root_t *root;
2594 
2595  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2596  KMP_DEBUG_ASSERT( __kmp_init_serial );
2597 
2598  if (new_nth < 1)
2599  new_nth = 1;
2600  else if (new_nth > __kmp_max_nth)
2601  new_nth = __kmp_max_nth;
2602 
2603  thread = __kmp_threads[gtid];
2604 
2605  __kmp_save_internal_controls( thread );
2606 
2607  set__nproc( thread, new_nth );
2608 
2609  //
2610  // If this omp_set_num_threads() call will cause the hot team size to be
2611  // reduced (in the absence of a num_threads clause), then reduce it now,
2612  // rather than waiting for the next parallel region.
2613  //
2614  root = thread->th.th_root;
2615  if ( __kmp_init_parallel && ( ! root->r.r_active )
2616  && ( root->r.r_hot_team->t.t_nproc > new_nth )
2617 #if KMP_NESTED_HOT_TEAMS
2618  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2619 #endif
2620  ) {
2621  kmp_team_t *hot_team = root->r.r_hot_team;
2622  int f;
2623 
2624  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2625 
2626 
2627  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2628  int tt_idx;
2629  for (tt_idx=0; tt_idx<2; ++tt_idx) {
2630  kmp_task_team_t *task_team = hot_team->t.t_task_team[tt_idx];
2631  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
2632  // Signal worker threads (esp. the extra ones) to stop looking for tasks while spin waiting.
2633  // The task teams are reference counted and will be deallocated by the last worker thread.
2634  KMP_DEBUG_ASSERT( hot_team->t.t_nproc > 1 );
2635  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
2636  KMP_MB();
2637  KA_TRACE( 20, ( "__kmp_set_num_threads: setting task_team %p to NULL\n",
2638  &hot_team->t.t_task_team[tt_idx] ) );
2639  hot_team->t.t_task_team[tt_idx] = NULL;
2640  }
2641  else {
2642  KMP_DEBUG_ASSERT( task_team == NULL );
2643  }
2644  }
2645  }
2646 
2647  //
2648  // Release the extra threads we don't need any more.
2649  //
2650  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
2651  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2652  __kmp_free_thread( hot_team->t.t_threads[f] );
2653  hot_team->t.t_threads[f] = NULL;
2654  }
2655  hot_team->t.t_nproc = new_nth;
2656 #if KMP_NESTED_HOT_TEAMS
2657  if( thread->th.th_hot_teams ) {
2658  KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2659  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2660  }
2661 #endif
2662 
2663 
2664  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2665 
2666  //
2667  // Update the t_nproc field in the threads that are still active.
2668  //
2669  for( f=0 ; f < new_nth; f++ ) {
2670  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2671  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2672  }
2673  // Special flag in case omp_set_num_threads() call
2674  hot_team->t.t_size_changed = -1;
2675  }
2676 
2677 }
2678 
2679 /* Changes max_active_levels */
2680 void
2681 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2682 {
2683  kmp_info_t *thread;
2684 
2685  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2686  KMP_DEBUG_ASSERT( __kmp_init_serial );
2687 
2688  // validate max_active_levels
2689  if( max_active_levels < 0 ) {
2690  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2691  // We ignore this call if the user has specified a negative value.
2692  // The current setting won't be changed. The last valid setting will be used.
2693  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2694  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2695  return;
2696  }
2697  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2698  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2699  // We allow a zero value. (implementation defined behavior)
2700  } else {
2701  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
2702  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2703  // Current upper limit is MAX_INT. (implementation defined behavior)
2704  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
2705  // Actually, the flow should never get here until we use MAX_INT limit.
2706  }
2707  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2708 
2709  thread = __kmp_threads[ gtid ];
2710 
2711  __kmp_save_internal_controls( thread );
2712 
2713  set__max_active_levels( thread, max_active_levels );
2714 
2715 }
2716 
2717 /* Gets max_active_levels */
2718 int
2719 __kmp_get_max_active_levels( int gtid )
2720 {
2721  kmp_info_t *thread;
2722 
2723  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2724  KMP_DEBUG_ASSERT( __kmp_init_serial );
2725 
2726  thread = __kmp_threads[ gtid ];
2727  KMP_DEBUG_ASSERT( thread->th.th_current_task );
2728  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2729  gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2730  return thread->th.th_current_task->td_icvs.max_active_levels;
2731 }
2732 
2733 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2734 void
2735 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2736 {
2737  kmp_info_t *thread;
2738 // kmp_team_t *team;
2739 
2740  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2741  KMP_DEBUG_ASSERT( __kmp_init_serial );
2742 
2743  // Check if the kind parameter is valid, correct if needed.
2744  // Valid parameters should fit in one of two intervals - standard or extended:
2745  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2746  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2747  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2748  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2749  {
2750  // TODO: Hint needs attention in case we change the default schedule.
2751  __kmp_msg(
2752  kmp_ms_warning,
2753  KMP_MSG( ScheduleKindOutOfRange, kind ),
2754  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2755  __kmp_msg_null
2756  );
2757  kind = kmp_sched_default;
2758  chunk = 0; // ignore chunk value in case of bad kind
2759  }
2760 
2761  thread = __kmp_threads[ gtid ];
2762 
2763  __kmp_save_internal_controls( thread );
2764 
2765  if ( kind < kmp_sched_upper_std ) {
2766  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
2767  // differ static chunked vs. unchunked:
2768  // chunk should be invalid to indicate unchunked schedule (which is the default)
2769  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2770  } else {
2771  thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2772  }
2773  } else {
2774  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2775  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2776  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2777  }
2778  if ( kind == kmp_sched_auto ) {
2779  // ignore parameter chunk for schedule auto
2780  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2781  } else {
2782  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2783  }
2784 }
2785 
2786 /* Gets def_sched_var ICV values */
2787 void
2788 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2789 {
2790  kmp_info_t *thread;
2791  enum sched_type th_type;
2792  int i;
2793 
2794  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2795  KMP_DEBUG_ASSERT( __kmp_init_serial );
2796 
2797  thread = __kmp_threads[ gtid ];
2798 
2799  //th_type = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type;
2800  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2801 
2802  switch ( th_type ) {
2803  case kmp_sch_static:
2804  case kmp_sch_static_greedy:
2805  case kmp_sch_static_balanced:
2806  *kind = kmp_sched_static;
2807  *chunk = 0; // chunk was not set, try to show this fact via zero value
2808  return;
2809  case kmp_sch_static_chunked:
2810  *kind = kmp_sched_static;
2811  break;
2812  case kmp_sch_dynamic_chunked:
2813  *kind = kmp_sched_dynamic;
2814  break;
2816  case kmp_sch_guided_iterative_chunked:
2817  case kmp_sch_guided_analytical_chunked:
2818  *kind = kmp_sched_guided;
2819  break;
2820  case kmp_sch_auto:
2821  *kind = kmp_sched_auto;
2822  break;
2823  case kmp_sch_trapezoidal:
2824  *kind = kmp_sched_trapezoidal;
2825  break;
2826 /*
2827  case kmp_sch_static_steal:
2828  *kind = kmp_sched_static_steal;
2829  break;
2830 */
2831  default:
2832  KMP_FATAL( UnknownSchedulingType, th_type );
2833  }
2834 
2835  //*chunk = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk;
2836  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2837 }
2838 
2839 int
2840 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2841 
2842  int ii, dd;
2843  kmp_team_t *team;
2844  kmp_info_t *thr;
2845 
2846  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2847  KMP_DEBUG_ASSERT( __kmp_init_serial );
2848 
2849  // validate level
2850  if( level == 0 ) return 0;
2851  if( level < 0 ) return -1;
2852  thr = __kmp_threads[ gtid ];
2853  team = thr->th.th_team;
2854  ii = team->t.t_level;
2855  if( level > ii ) return -1;
2856 
2857 #if OMP_40_ENABLED
2858  if( thr->th.th_teams_microtask ) {
2859  // AC: we are in teams region where multiple nested teams have same level
2860  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2861  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2862  KMP_DEBUG_ASSERT( ii >= tlevel );
2863  // AC: As we need to pass by the teams league, we need to artificially increase ii
2864  if ( ii == tlevel ) {
2865  ii += 2; // three teams have same level
2866  } else {
2867  ii ++; // two teams have same level
2868  }
2869  }
2870  }
2871 #endif
2872 
2873  if( ii == level ) return __kmp_tid_from_gtid( gtid );
2874 
2875  dd = team->t.t_serialized;
2876  level++;
2877  while( ii > level )
2878  {
2879  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2880  {
2881  }
2882  if( ( team->t.t_serialized ) && ( !dd ) ) {
2883  team = team->t.t_parent;
2884  continue;
2885  }
2886  if( ii > level ) {
2887  team = team->t.t_parent;
2888  dd = team->t.t_serialized;
2889  ii--;
2890  }
2891  }
2892 
2893  return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2894 }
2895 
2896 int
2897 __kmp_get_team_size( int gtid, int level ) {
2898 
2899  int ii, dd;
2900  kmp_team_t *team;
2901  kmp_info_t *thr;
2902 
2903  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2904  KMP_DEBUG_ASSERT( __kmp_init_serial );
2905 
2906  // validate level
2907  if( level == 0 ) return 1;
2908  if( level < 0 ) return -1;
2909  thr = __kmp_threads[ gtid ];
2910  team = thr->th.th_team;
2911  ii = team->t.t_level;
2912  if( level > ii ) return -1;
2913 
2914 #if OMP_40_ENABLED
2915  if( thr->th.th_teams_microtask ) {
2916  // AC: we are in teams region where multiple nested teams have same level
2917  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2918  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2919  KMP_DEBUG_ASSERT( ii >= tlevel );
2920  // AC: As we need to pass by the teams league, we need to artificially increase ii
2921  if ( ii == tlevel ) {
2922  ii += 2; // three teams have same level
2923  } else {
2924  ii ++; // two teams have same level
2925  }
2926  }
2927  }
2928 #endif
2929 
2930  while( ii > level )
2931  {
2932  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2933  {
2934  }
2935  if( team->t.t_serialized && ( !dd ) ) {
2936  team = team->t.t_parent;
2937  continue;
2938  }
2939  if( ii > level ) {
2940  team = team->t.t_parent;
2941  ii--;
2942  }
2943  }
2944 
2945  return team->t.t_nproc;
2946 }
2947 
2948 kmp_r_sched_t
2949 __kmp_get_schedule_global() {
2950 // This routine created because pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
2951 // may be changed by kmp_set_defaults independently. So one can get the updated schedule here.
2952 
2953  kmp_r_sched_t r_sched;
2954 
2955  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2956  // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
2957  // and thus have different run-time schedules in different roots (even in OMP 2.5)
2958  if ( __kmp_sched == kmp_sch_static ) {
2959  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2960  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2961  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2962  } else {
2963  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2964  }
2965 
2966  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was not ever set)
2967  r_sched.chunk = KMP_DEFAULT_CHUNK;
2968  } else {
2969  r_sched.chunk = __kmp_chunk;
2970  }
2971 
2972  return r_sched;
2973 }
2974 
2975 /* ------------------------------------------------------------------------ */
2976 /* ------------------------------------------------------------------------ */
2977 
2978 
2979 /*
2980  * Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
2981  * at least argc number of *t_argv entries for the requested team.
2982  */
2983 static void
2984 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2985 {
2986 
2987  KMP_DEBUG_ASSERT( team );
2988  if( !realloc || argc > team->t.t_max_argc ) {
2989 
2990  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2991  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2992  /* if previously allocated heap space for args, free them */
2993  if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2994  __kmp_free( (void *) team->t.t_argv );
2995 
2996  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2997  /* use unused space in the cache line for arguments */
2998  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2999  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
3000  team->t.t_id, team->t.t_max_argc ));
3001  team->t.t_argv = &team->t.t_inline_argv[0];
3002  if ( __kmp_storage_map ) {
3003  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
3004  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3005  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
3006  "team_%d.t_inline_argv",
3007  team->t.t_id );
3008  }
3009  } else {
3010  /* allocate space for arguments in the heap */
3011  team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
3012  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
3013  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
3014  team->t.t_id, team->t.t_max_argc ));
3015  team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
3016  if ( __kmp_storage_map ) {
3017  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
3018  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
3019  team->t.t_id );
3020  }
3021  }
3022  }
3023 }
3024 
3025 static void
3026 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
3027 {
3028  int i;
3029  int num_disp_buff = max_nth > 1 ? KMP_MAX_DISP_BUF : 2;
3030 #if KMP_USE_POOLED_ALLOC
3031  // AC: TODO: fix bug here: size of t_disp_buffer should not be multiplied by max_nth!
3032  char *ptr = __kmp_allocate(max_nth *
3033  ( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*num_disp_buf
3034  + sizeof(kmp_disp_t) + sizeof(int)*6
3035  //+ sizeof(int)
3036  + sizeof(kmp_r_sched_t)
3037  + sizeof(kmp_taskdata_t) ) );
3038 
3039  team->t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth;
3040  team->t.t_disp_buffer = (dispatch_shared_info_t*) ptr;
3041  ptr += sizeof(dispatch_shared_info_t) * num_disp_buff;
3042  team->t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth;
3043  team->t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth;
3044  team->t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth;
3045  team->t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth;
3046  team->t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth;
3047  team->t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth;
3048  team->t.t_set_bt_set = (int*) ptr;
3049  ptr += sizeof(int) * max_nth;
3050  //team->t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth;
3051  team->t.t_set_sched = (kmp_r_sched_t*) ptr;
3052  ptr += sizeof(kmp_r_sched_t) * max_nth;
3053  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr;
3054  ptr += sizeof(kmp_taskdata_t) * max_nth;
3055 #else
3056 
3057  team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
3058  team->t.t_disp_buffer = (dispatch_shared_info_t*)
3059  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
3060  team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
3061  //team->t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth );
3062  //team->t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth );
3063  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
3064 #endif
3065  team->t.t_max_nproc = max_nth;
3066 
3067  /* setup dispatch buffers */
3068  for(i = 0 ; i < num_disp_buff; ++i)
3069  team->t.t_disp_buffer[i].buffer_index = i;
3070 }
3071 
3072 static void
3073 __kmp_free_team_arrays(kmp_team_t *team) {
3074  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3075  int i;
3076  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
3077  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
3078  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
3079  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
3080  }; // if
3081  }; // for
3082  __kmp_free(team->t.t_threads);
3083  #if !KMP_USE_POOLED_ALLOC
3084  __kmp_free(team->t.t_disp_buffer);
3085  __kmp_free(team->t.t_dispatch);
3086  //__kmp_free(team->t.t_set_max_active_levels);
3087  //__kmp_free(team->t.t_set_sched);
3088  __kmp_free(team->t.t_implicit_task_taskdata);
3089  #endif
3090  team->t.t_threads = NULL;
3091  team->t.t_disp_buffer = NULL;
3092  team->t.t_dispatch = NULL;
3093  //team->t.t_set_sched = 0;
3094  //team->t.t_set_max_active_levels = 0;
3095  team->t.t_implicit_task_taskdata = 0;
3096 }
3097 
3098 static void
3099 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3100  kmp_info_t **oldThreads = team->t.t_threads;
3101 
3102  #if !KMP_USE_POOLED_ALLOC
3103  __kmp_free(team->t.t_disp_buffer);
3104  __kmp_free(team->t.t_dispatch);
3105  //__kmp_free(team->t.t_set_max_active_levels);
3106  //__kmp_free(team->t.t_set_sched);
3107  __kmp_free(team->t.t_implicit_task_taskdata);
3108  #endif
3109  __kmp_allocate_team_arrays(team, max_nth);
3110 
3111  KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3112 
3113  __kmp_free(oldThreads);
3114 }
3115 
3116 static kmp_internal_control_t
3117 __kmp_get_global_icvs( void ) {
3118 
3119  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3120 
3121 #if OMP_40_ENABLED
3122  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3123 #endif /* OMP_40_ENABLED */
3124 
3125  kmp_internal_control_t g_icvs = {
3126  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3127  (kmp_int8)__kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
3128  (kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
3129  (kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
3130  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
3131  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
3132  __kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread)
3133  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3134  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
3135  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
3136 #if OMP_40_ENABLED
3137  __kmp_nested_proc_bind.bind_types[0],
3138 #endif /* OMP_40_ENABLED */
3139  NULL //struct kmp_internal_control *next;
3140  };
3141 
3142  return g_icvs;
3143 }
3144 
3145 static kmp_internal_control_t
3146 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3147 
3148  kmp_internal_control_t gx_icvs;
3149  gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
3150  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3151  gx_icvs.next = NULL;
3152 
3153  return gx_icvs;
3154 }
3155 
3156 static void
3157 __kmp_initialize_root( kmp_root_t *root )
3158 {
3159  int f;
3160  kmp_team_t *root_team;
3161  kmp_team_t *hot_team;
3162  size_t disp_size, dispatch_size, bar_size;
3163  int hot_team_max_nth;
3164  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3165  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3166  KMP_DEBUG_ASSERT( root );
3167  KMP_ASSERT( ! root->r.r_begin );
3168 
3169  /* setup the root state structure */
3170  __kmp_init_lock( &root->r.r_begin_lock );
3171  root->r.r_begin = FALSE;
3172  root->r.r_active = FALSE;
3173  root->r.r_in_parallel = 0;
3174  root->r.r_blocktime = __kmp_dflt_blocktime;
3175  root->r.r_nested = __kmp_dflt_nested;
3176 
3177  /* setup the root team for this task */
3178  /* allocate the root team structure */
3179  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3180 
3181  root_team =
3182  __kmp_allocate_team(
3183  root,
3184  1, // new_nproc
3185  1, // max_nproc
3186 #if OMPT_SUPPORT
3187  0, // root parallel id
3188 #endif
3189 #if OMP_40_ENABLED
3190  __kmp_nested_proc_bind.bind_types[0],
3191 #endif
3192  &r_icvs,
3193  0 // argc
3194  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3195  );
3196 #if USE_DEBUGGER
3197  // Non-NULL value should be assigned to make the debugger display the root team.
3198  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3199 #endif
3200 
3201  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3202 
3203  root->r.r_root_team = root_team;
3204  root_team->t.t_control_stack_top = NULL;
3205 
3206  /* initialize root team */
3207  root_team->t.t_threads[0] = NULL;
3208  root_team->t.t_nproc = 1;
3209  root_team->t.t_serialized = 1;
3210  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3211  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3212  root_team->t.t_sched.chunk = r_sched.chunk;
3213  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3214  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3215 
3216  /* setup the hot team for this task */
3217  /* allocate the hot team structure */
3218  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3219 
3220  hot_team =
3221  __kmp_allocate_team(
3222  root,
3223  1, // new_nproc
3224  __kmp_dflt_team_nth_ub * 2, // max_nproc
3225 #if OMPT_SUPPORT
3226  0, // root parallel id
3227 #endif
3228 #if OMP_40_ENABLED
3229  __kmp_nested_proc_bind.bind_types[0],
3230 #endif
3231  &r_icvs,
3232  0 // argc
3233  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3234  );
3235  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3236 
3237  root->r.r_hot_team = hot_team;
3238  root_team->t.t_control_stack_top = NULL;
3239 
3240  /* first-time initialization */
3241  hot_team->t.t_parent = root_team;
3242 
3243  /* initialize hot team */
3244  hot_team_max_nth = hot_team->t.t_max_nproc;
3245  for ( f = 0; f < hot_team_max_nth; ++ f ) {
3246  hot_team->t.t_threads[ f ] = NULL;
3247  }; // for
3248  hot_team->t.t_nproc = 1;
3249  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3250  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3251  hot_team->t.t_sched.chunk = r_sched.chunk;
3252  hot_team->t.t_size_changed = 0;
3253 
3254 }
3255 
3256 #ifdef KMP_DEBUG
3257 
3258 
3259 typedef struct kmp_team_list_item {
3260  kmp_team_p const * entry;
3261  struct kmp_team_list_item * next;
3262 } kmp_team_list_item_t;
3263 typedef kmp_team_list_item_t * kmp_team_list_t;
3264 
3265 
3266 static void
3267 __kmp_print_structure_team_accum( // Add team to list of teams.
3268  kmp_team_list_t list, // List of teams.
3269  kmp_team_p const * team // Team to add.
3270 ) {
3271 
3272  // List must terminate with item where both entry and next are NULL.
3273  // Team is added to the list only once.
3274  // List is sorted in ascending order by team id.
3275  // Team id is *not* a key.
3276 
3277  kmp_team_list_t l;
3278 
3279  KMP_DEBUG_ASSERT( list != NULL );
3280  if ( team == NULL ) {
3281  return;
3282  }; // if
3283 
3284  __kmp_print_structure_team_accum( list, team->t.t_parent );
3285  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3286 
3287  // Search list for the team.
3288  l = list;
3289  while ( l->next != NULL && l->entry != team ) {
3290  l = l->next;
3291  }; // while
3292  if ( l->next != NULL ) {
3293  return; // Team has been added before, exit.
3294  }; // if
3295 
3296  // Team is not found. Search list again for insertion point.
3297  l = list;
3298  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3299  l = l->next;
3300  }; // while
3301 
3302  // Insert team.
3303  {
3304  kmp_team_list_item_t * item =
3305  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3306  * item = * l;
3307  l->entry = team;
3308  l->next = item;
3309  }
3310 
3311 }
3312 
3313 static void
3314 __kmp_print_structure_team(
3315  char const * title,
3316  kmp_team_p const * team
3317 
3318 ) {
3319  __kmp_printf( "%s", title );
3320  if ( team != NULL ) {
3321  __kmp_printf( "%2x %p\n", team->t.t_id, team );
3322  } else {
3323  __kmp_printf( " - (nil)\n" );
3324  }; // if
3325 }
3326 
3327 static void
3328 __kmp_print_structure_thread(
3329  char const * title,
3330  kmp_info_p const * thread
3331 
3332 ) {
3333  __kmp_printf( "%s", title );
3334  if ( thread != NULL ) {
3335  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3336  } else {
3337  __kmp_printf( " - (nil)\n" );
3338  }; // if
3339 }
3340 
3341 void
3342 __kmp_print_structure(
3343  void
3344 ) {
3345 
3346  kmp_team_list_t list;
3347 
3348  // Initialize list of teams.
3349  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3350  list->entry = NULL;
3351  list->next = NULL;
3352 
3353  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3354  {
3355  int gtid;
3356  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3357  __kmp_printf( "%2d", gtid );
3358  if ( __kmp_threads != NULL ) {
3359  __kmp_printf( " %p", __kmp_threads[ gtid ] );
3360  }; // if
3361  if ( __kmp_root != NULL ) {
3362  __kmp_printf( " %p", __kmp_root[ gtid ] );
3363  }; // if
3364  __kmp_printf( "\n" );
3365  }; // for gtid
3366  }
3367 
3368  // Print out __kmp_threads array.
3369  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3370  if ( __kmp_threads != NULL ) {
3371  int gtid;
3372  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3373  kmp_info_t const * thread = __kmp_threads[ gtid ];
3374  if ( thread != NULL ) {
3375  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3376  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
3377  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
3378  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
3379  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
3380  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
3381  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
3382  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
3383 #if OMP_40_ENABLED
3384  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
3385 #endif
3386  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
3387  __kmp_printf( "\n" );
3388  __kmp_print_structure_team_accum( list, thread->th.th_team );
3389  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3390  }; // if
3391  }; // for gtid
3392  } else {
3393  __kmp_printf( "Threads array is not allocated.\n" );
3394  }; // if
3395 
3396  // Print out __kmp_root array.
3397  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3398  if ( __kmp_root != NULL ) {
3399  int gtid;
3400  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3401  kmp_root_t const * root = __kmp_root[ gtid ];
3402  if ( root != NULL ) {
3403  __kmp_printf( "GTID %2d %p:\n", gtid, root );
3404  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
3405  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
3406  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
3407  __kmp_printf( " Active?: %2d\n", root->r.r_active );
3408  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
3409  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
3410  __kmp_printf( "\n" );
3411  __kmp_print_structure_team_accum( list, root->r.r_root_team );
3412  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3413  }; // if
3414  }; // for gtid
3415  } else {
3416  __kmp_printf( "Ubers array is not allocated.\n" );
3417  }; // if
3418 
3419  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3420  while ( list->next != NULL ) {
3421  kmp_team_p const * team = list->entry;
3422  int i;
3423  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3424  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
3425  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
3426  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
3427  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
3428  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
3429  for ( i = 0; i < team->t.t_nproc; ++ i ) {
3430  __kmp_printf( " Thread %2d: ", i );
3431  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3432  }; // for i
3433  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
3434  __kmp_printf( "\n" );
3435  list = list->next;
3436  }; // while
3437 
3438  // Print out __kmp_thread_pool and __kmp_team_pool.
3439  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3440  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
3441  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
3442  __kmp_printf( "\n" );
3443 
3444  // Free team list.
3445  while ( list != NULL ) {
3446  kmp_team_list_item_t * item = list;
3447  list = list->next;
3448  KMP_INTERNAL_FREE( item );
3449  }; // while
3450 
3451 }
3452 
3453 #endif
3454 
3455 
3456 //---------------------------------------------------------------------------
3457 // Stuff for per-thread fast random number generator
3458 // Table of primes
3459 
3460 static const unsigned __kmp_primes[] = {
3461  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3462  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3463  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3464  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3465  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3466  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3467  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3468  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3469  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3470  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3471  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3472  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3473  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3474  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3475  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3476  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3477 };
3478 
3479 //---------------------------------------------------------------------------
3480 // __kmp_get_random: Get a random number using a linear congruential method.
3481 
3482 unsigned short
3483 __kmp_get_random( kmp_info_t * thread )
3484 {
3485  unsigned x = thread->th.th_x;
3486  unsigned short r = x>>16;
3487 
3488  thread->th.th_x = x*thread->th.th_a+1;
3489 
3490  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3491  thread->th.th_info.ds.ds_tid, r) );
3492 
3493  return r;
3494 }
3495 //--------------------------------------------------------
3496 // __kmp_init_random: Initialize a random number generator
3497 
3498 void
3499 __kmp_init_random( kmp_info_t * thread )
3500 {
3501  unsigned seed = thread->th.th_info.ds.ds_tid;
3502 
3503  thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3504  thread->th.th_x = (seed+1)*thread->th.th_a+1;
3505  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3506 }
3507 
3508 
3509 #if KMP_OS_WINDOWS
3510 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3511 static int
3512 __kmp_reclaim_dead_roots(void) {
3513  int i, r = 0;
3514 
3515  for(i = 0; i < __kmp_threads_capacity; ++i) {
3516  if( KMP_UBER_GTID( i ) &&
3517  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3518  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
3519  r += __kmp_unregister_root_other_thread(i);
3520  }
3521  }
3522  return r;
3523 }
3524 #endif
3525 
3526 /*
3527  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3528  free entries generated.
3529 
3530  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3531  already dead.
3532 
3533  On all platforms, expansion is attempted on the arrays __kmp_threads_ and __kmp_root, with appropriate
3534  update to __kmp_threads_capacity. Array capacity is increased by doubling with clipping to
3535  __kmp_tp_capacity, if threadprivate cache array has been created.
3536  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3537 
3538  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3539  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
3540  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3541  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
3542  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3543  as many free slots as possible up to nWish.
3544 
3545  If any argument is negative, the behavior is undefined.
3546 */
3547 static int
3548 __kmp_expand_threads(int nWish, int nNeed) {
3549  int added = 0;
3550  int old_tp_cached;
3551  int __kmp_actual_max_nth;
3552 
3553  if(nNeed > nWish) /* normalize the arguments */
3554  nWish = nNeed;
3555 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3556 /* only for Windows static library */
3557  /* reclaim array entries for root threads that are already dead */
3558  added = __kmp_reclaim_dead_roots();
3559 
3560  if(nNeed) {
3561  nNeed -= added;
3562  if(nNeed < 0)
3563  nNeed = 0;
3564  }
3565  if(nWish) {
3566  nWish -= added;
3567  if(nWish < 0)
3568  nWish = 0;
3569  }
3570 #endif
3571  if(nWish <= 0)
3572  return added;
3573 
3574  while(1) {
3575  int nTarget;
3576  int minimumRequiredCapacity;
3577  int newCapacity;
3578  kmp_info_t **newThreads;
3579  kmp_root_t **newRoot;
3580 
3581  //
3582  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3583  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3584  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3585  // become > __kmp_max_nth in one of two ways:
3586  //
3587  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3588  // may not be resused by another thread, so we may need to increase
3589  // __kmp_threads_capacity to __kmp_max_threads + 1.
3590  //
3591  // 2) New foreign root(s) are encountered. We always register new
3592  // foreign roots. This may cause a smaller # of threads to be
3593  // allocated at subsequent parallel regions, but the worker threads
3594  // hang around (and eventually go to sleep) and need slots in the
3595  // __kmp_threads[] array.
3596  //
3597  // Anyway, that is the reason for moving the check to see if
3598  // __kmp_max_threads was exceeded into __kmp_reseerve_threads()
3599  // instead of having it performed here. -BB
3600  //
3601  old_tp_cached = __kmp_tp_cached;
3602  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3603  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3604 
3605  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3606  nTarget = nWish;
3607  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3608  /* can't fulfil nWish, so try nNeed */
3609  if(nNeed) {
3610  nTarget = nNeed;
3611  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3612  /* possible expansion too small -- give up */
3613  break;
3614  }
3615  } else {
3616  /* best-effort */
3617  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3618  if(!nTarget) {
3619  /* can expand at all -- give up */
3620  break;
3621  }
3622  }
3623  }
3624  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3625 
3626  newCapacity = __kmp_threads_capacity;
3627  do{
3628  newCapacity =
3629  newCapacity <= (__kmp_actual_max_nth >> 1) ?
3630  (newCapacity << 1) :
3631  __kmp_actual_max_nth;
3632  } while(newCapacity < minimumRequiredCapacity);
3633  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3634  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3635  KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3636  KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3637  memset(newThreads + __kmp_threads_capacity, 0,
3638  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3639  memset(newRoot + __kmp_threads_capacity, 0,
3640  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3641 
3642  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3643  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3644  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3645  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
3646  of a double-check pair.
3647  */
3648  __kmp_free(newThreads);
3649  continue; /* start over and try again */
3650  }
3651  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3652  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3653  /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3654  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3655  __kmp_free(newThreads);
3656  continue; /* start over and try again */
3657  } else {
3658  /* success */
3659  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
3660  //
3661  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3662  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3663  added += newCapacity - __kmp_threads_capacity;
3664  *(volatile int*)&__kmp_threads_capacity = newCapacity;
3665  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3666  break; /* succeeded, so we can exit the loop */
3667  }
3668  }
3669  return added;
3670 }
3671 
3672 /* register the current thread as a root thread and obtain our gtid */
3673 /* we must have the __kmp_initz_lock held at this point */
3674 /* Argument TRUE only if are the thread that calls from __kmp_do_serial_initialize() */
3675 int
3676 __kmp_register_root( int initial_thread )
3677 {
3678  kmp_info_t *root_thread;
3679  kmp_root_t *root;
3680  int gtid;
3681  int capacity;
3682  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3683  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3684  KMP_MB();
3685 
3686 
3687  /*
3688  2007-03-02:
3689 
3690  If initial thread did not invoke OpenMP RTL yet, and this thread is not an initial one,
3691  "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
3692  return false (that means there is at least one empty slot in __kmp_threads array), but it
3693  is possible the only free slot is #0, which is reserved for initial thread and so cannot be
3694  used for this one. Following code workarounds this bug.
3695 
3696  However, right solution seems to be not reserving slot #0 for initial thread because:
3697  (1) there is no magic in slot #0,
3698  (2) we cannot detect initial thread reliably (the first thread which does serial
3699  initialization may be not a real initial thread).
3700  */
3701  capacity = __kmp_threads_capacity;
3702  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3703  -- capacity;
3704  }; // if
3705 
3706  /* see if there are too many threads */
3707  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3708  if ( __kmp_tp_cached ) {
3709  __kmp_msg(
3710  kmp_ms_fatal,
3711  KMP_MSG( CantRegisterNewThread ),
3712  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3713  KMP_HNT( PossibleSystemLimitOnThreads ),
3714  __kmp_msg_null
3715  );
3716  }
3717  else {
3718  __kmp_msg(
3719  kmp_ms_fatal,
3720  KMP_MSG( CantRegisterNewThread ),
3721  KMP_HNT( SystemLimitOnThreads ),
3722  __kmp_msg_null
3723  );
3724  }
3725  }; // if
3726 
3727  /* find an available thread slot */
3728  /* Don't reassign the zero slot since we need that to only be used by initial
3729  thread */
3730  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3731  ;
3732  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3733  KMP_ASSERT( gtid < __kmp_threads_capacity );
3734 
3735  /* update global accounting */
3736  __kmp_all_nth ++;
3737  TCW_4(__kmp_nth, __kmp_nth + 1);
3738 
3739  //
3740  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3741  // for low numbers of procs, and method #2 (keyed API call) for higher
3742  // numbers of procs.
3743  //
3744  if ( __kmp_adjust_gtid_mode ) {
3745  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3746  if ( TCR_4(__kmp_gtid_mode) != 2) {
3747  TCW_4(__kmp_gtid_mode, 2);
3748  }
3749  }
3750  else {
3751  if (TCR_4(__kmp_gtid_mode) != 1 ) {
3752  TCW_4(__kmp_gtid_mode, 1);
3753  }
3754  }
3755  }
3756 
3757 #ifdef KMP_ADJUST_BLOCKTIME
3758  /* Adjust blocktime to zero if necessary */
3759  /* Middle initialization might not have occurred yet */
3760  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3761  if ( __kmp_nth > __kmp_avail_proc ) {
3762  __kmp_zero_bt = TRUE;
3763  }
3764  }
3765 #endif /* KMP_ADJUST_BLOCKTIME */
3766 
3767  /* setup this new hierarchy */
3768  if( ! ( root = __kmp_root[gtid] )) {
3769  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3770  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3771  }
3772 
3773  __kmp_initialize_root( root );
3774 
3775  /* setup new root thread structure */
3776  if( root->r.r_uber_thread ) {
3777  root_thread = root->r.r_uber_thread;
3778  } else {
3779  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3780  if ( __kmp_storage_map ) {
3781  __kmp_print_thread_storage_map( root_thread, gtid );
3782  }
3783  root_thread->th.th_info .ds.ds_gtid = gtid;
3784  root_thread->th.th_root = root;
3785  if( __kmp_env_consistency_check ) {
3786  root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3787  }
3788  #if USE_FAST_MEMORY
3789  __kmp_initialize_fast_memory( root_thread );
3790  #endif /* USE_FAST_MEMORY */
3791 
3792  #if KMP_USE_BGET
3793  KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3794  __kmp_initialize_bget( root_thread );
3795  #endif
3796  __kmp_init_random( root_thread ); // Initialize random number generator
3797  }
3798 
3799  /* setup the serial team held in reserve by the root thread */
3800  if( ! root_thread->th.th_serial_team ) {
3801  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3802  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3803  root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3804 #if OMPT_SUPPORT
3805  0, // root parallel id
3806 #endif
3807 #if OMP_40_ENABLED
3808  proc_bind_default,
3809 #endif
3810  &r_icvs,
3811  0 USE_NESTED_HOT_ARG(NULL) );
3812  }
3813  KMP_ASSERT( root_thread->th.th_serial_team );
3814  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3815  root_thread->th.th_serial_team ) );
3816 
3817  /* drop root_thread into place */
3818  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3819 
3820  root->r.r_root_team->t.t_threads[0] = root_thread;
3821  root->r.r_hot_team ->t.t_threads[0] = root_thread;
3822  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3823  root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
3824  root->r.r_uber_thread = root_thread;
3825 
3826  /* initialize the thread, get it ready to go */
3827  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3828 
3829  /* prepare the master thread for get_gtid() */
3830  __kmp_gtid_set_specific( gtid );
3831 
3832  __kmp_itt_thread_name( gtid );
3833 
3834  #ifdef KMP_TDATA_GTID
3835  __kmp_gtid = gtid;
3836  #endif
3837  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3838  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3839  TCW_4(__kmp_init_gtid, TRUE);
3840 
3841  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3842  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3843  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3844  KMP_INIT_BARRIER_STATE ) );
3845  { // Initialize barrier data.
3846  int b;
3847  for ( b = 0; b < bs_last_barrier; ++ b ) {
3848  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3849 #if USE_DEBUGGER
3850  root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3851 #endif
3852  }; // for
3853  }
3854  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3855 
3856 
3857 #if KMP_AFFINITY_SUPPORTED
3858  if ( TCR_4(__kmp_init_middle) ) {
3859  __kmp_affinity_set_init_mask( gtid, TRUE );
3860  }
3861 #endif /* KMP_AFFINITY_SUPPORTED */
3862 
3863  __kmp_root_counter ++;
3864 
3865  KMP_MB();
3866  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3867 
3868  return gtid;
3869 }
3870 
3871 #if KMP_NESTED_HOT_TEAMS
3872 static int
3873 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3874 {
3875  int i, n, nth;
3876  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3877  if( !hot_teams || !hot_teams[level].hot_team ) {
3878  return 0;
3879  }
3880  KMP_DEBUG_ASSERT( level < max_level );
3881  kmp_team_t *team = hot_teams[level].hot_team;
3882  nth = hot_teams[level].hot_team_nth;
3883  n = nth - 1; // master is not freed
3884  if( level < max_level - 1 ) {
3885  for( i = 0; i < nth; ++i ) {
3886  kmp_info_t *th = team->t.t_threads[i];
3887  n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3888  if( i > 0 && th->th.th_hot_teams ) {
3889  __kmp_free( th->th.th_hot_teams );
3890  th->th.th_hot_teams = NULL;
3891  }
3892  }
3893  }
3894  __kmp_free_team( root, team, NULL );
3895  return n;
3896 }
3897 #endif
3898 
3899 /* Resets a root thread and clear its root and hot teams.
3900  Returns the number of __kmp_threads entries directly and indirectly freed.
3901 */
3902 static int
3903 __kmp_reset_root(int gtid, kmp_root_t *root)
3904 {
3905  kmp_team_t * root_team = root->r.r_root_team;
3906  kmp_team_t * hot_team = root->r.r_hot_team;
3907  int n = hot_team->t.t_nproc;
3908  int i;
3909 
3910  KMP_DEBUG_ASSERT( ! root->r.r_active );
3911 
3912  root->r.r_root_team = NULL;
3913  root->r.r_hot_team = NULL;
3914  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call
3915  // to __kmp_free_team().
3916  __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3917 #if KMP_NESTED_HOT_TEAMS
3918  if( __kmp_hot_teams_max_level > 1 ) { // need to free nested hot teams and their threads if any
3919  for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3920  kmp_info_t *th = hot_team->t.t_threads[i];
3921  n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3922  if( th->th.th_hot_teams ) {
3923  __kmp_free( th->th.th_hot_teams );
3924  th->th.th_hot_teams = NULL;
3925  }
3926  }
3927  }
3928 #endif
3929  __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3930 
3931  //
3932  // Before we can reap the thread, we need to make certain that all
3933  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3934  //
3935  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3936  __kmp_wait_to_unref_task_teams();
3937  }
3938 
3939  #if KMP_OS_WINDOWS
3940  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3941  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3942  (LPVOID)&(root->r.r_uber_thread->th),
3943  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3944  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3945  #endif /* KMP_OS_WINDOWS */
3946 
3947 #if OMPT_SUPPORT
3948  if ((ompt_status == ompt_status_track_callback) &&
3949  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3950  int gtid = __kmp_get_gtid();
3951  __ompt_thread_end(ompt_thread_initial, gtid);
3952  }
3953 #endif
3954 
3955  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3956  __kmp_reap_thread( root->r.r_uber_thread, 1 );
3957 
3958  // We canot put root thread to __kmp_thread_pool, so we have to reap it istead of freeing.
3959  root->r.r_uber_thread = NULL;
3960  /* mark root as no longer in use */
3961  root->r.r_begin = FALSE;
3962 
3963  return n;
3964 }
3965 
3966 void
3967 __kmp_unregister_root_current_thread( int gtid )
3968 {
3969  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3970  /* this lock should be ok, since unregister_root_current_thread is never called during
3971  * and abort, only during a normal close. furthermore, if you have the
3972  * forkjoin lock, you should never try to get the initz lock */
3973 
3974  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3975  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3976  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3977  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3978  return;
3979  }
3980  kmp_root_t *root = __kmp_root[gtid];
3981 
3982  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3983  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3984  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3985  KMP_ASSERT( root->r.r_active == FALSE );
3986 
3987 
3988  KMP_MB();
3989 
3990 #if OMP_41_ENABLED
3991  kmp_info_t * thread = __kmp_threads[gtid];
3992  kmp_team_t * team = thread->th.th_team;
3993  kmp_task_team_t * task_team = thread->th.th_task_team;
3994 
3995  // we need to wait for the proxy tasks before finishing the thread
3996  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
3997  __kmp_task_team_wait(thread, team, NULL );
3998 #endif
3999 
4000  __kmp_reset_root(gtid, root);
4001 
4002  /* free up this thread slot */
4003  __kmp_gtid_set_specific( KMP_GTID_DNE );
4004 #ifdef KMP_TDATA_GTID
4005  __kmp_gtid = KMP_GTID_DNE;
4006 #endif
4007 
4008  KMP_MB();
4009  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
4010 
4011  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
4012 }
4013 
4014 /* __kmp_forkjoin_lock must be already held
4015  Unregisters a root thread that is not the current thread. Returns the number of
4016  __kmp_threads entries freed as a result.
4017  */
4018 static int
4019 __kmp_unregister_root_other_thread( int gtid )
4020 {
4021  kmp_root_t *root = __kmp_root[gtid];
4022  int r;
4023 
4024  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
4025  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
4026  KMP_ASSERT( KMP_UBER_GTID( gtid ));
4027  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
4028  KMP_ASSERT( root->r.r_active == FALSE );
4029 
4030  r = __kmp_reset_root(gtid, root);
4031  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
4032  return r;
4033 }
4034 
4035 #if KMP_DEBUG
4036 void __kmp_task_info() {
4037 
4038  kmp_int32 gtid = __kmp_entry_gtid();
4039  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
4040  kmp_info_t *this_thr = __kmp_threads[ gtid ];
4041  kmp_team_t *steam = this_thr->th.th_serial_team;
4042  kmp_team_t *team = this_thr->th.th_team;
4043 
4044  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
4045  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
4046 }
4047 #endif // KMP_DEBUG
4048 
4049 /* TODO optimize with one big memclr, take out what isn't needed,
4050  * split responsibility to workers as much as possible, and delay
4051  * initialization of features as much as possible */
4052 static void
4053 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
4054 {
4055  /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker
4056  * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4057  kmp_info_t *master = team->t.t_threads[0];
4058  KMP_DEBUG_ASSERT( this_thr != NULL );
4059  KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
4060  KMP_DEBUG_ASSERT( team );
4061  KMP_DEBUG_ASSERT( team->t.t_threads );
4062  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4063  KMP_DEBUG_ASSERT( master );
4064  KMP_DEBUG_ASSERT( master->th.th_root );
4065 
4066  KMP_MB();
4067 
4068  TCW_SYNC_PTR(this_thr->th.th_team, team);
4069 
4070  this_thr->th.th_info.ds.ds_tid = tid;
4071  this_thr->th.th_set_nproc = 0;
4072 #if OMP_40_ENABLED
4073  this_thr->th.th_set_proc_bind = proc_bind_default;
4074 # if KMP_AFFINITY_SUPPORTED
4075  this_thr->th.th_new_place = this_thr->th.th_current_place;
4076 # endif
4077 #endif
4078  this_thr->th.th_root = master->th.th_root;
4079 
4080  /* setup the thread's cache of the team structure */
4081  this_thr->th.th_team_nproc = team->t.t_nproc;
4082  this_thr->th.th_team_master = master;
4083  this_thr->th.th_team_serialized = team->t.t_serialized;
4084  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4085 
4086  KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4087  this_thr->th.th_task_state = 0;
4088 
4089  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4090  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4091 
4092  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4093 
4094  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4095  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4096  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4097 
4098  /* TODO no worksharing in speculative threads */
4099  this_thr->th.th_dispatch = &team->t.t_dispatch[ tid ];
4100 
4101  this_thr->th.th_local.this_construct = 0;
4102 
4103 #ifdef BUILD_TV
4104  this_thr->th.th_local.tv_data = 0;
4105 #endif
4106 
4107  if ( ! this_thr->th.th_pri_common ) {
4108  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4109  if ( __kmp_storage_map ) {
4110  __kmp_print_storage_map_gtid(
4111  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4112  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4113  );
4114  }; // if
4115  this_thr->th.th_pri_head = NULL;
4116  }; // if
4117 
4118  /* Initialize dynamic dispatch */
4119  {
4120  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4121  /*
4122  * Use team max_nproc since this will never change for the team.
4123  */
4124  size_t disp_size = sizeof( dispatch_private_info_t ) *
4125  ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
4126  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4127  KMP_ASSERT( dispatch );
4128  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4129  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4130 
4131  dispatch->th_disp_index = 0;
4132 
4133  if( ! dispatch->th_disp_buffer ) {
4134  dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4135 
4136  if ( __kmp_storage_map ) {
4137  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4138  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],
4139  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4140  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4141  gtid, team->t.t_id, gtid );
4142  }
4143  } else {
4144  memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4145  }
4146 
4147  dispatch->th_dispatch_pr_current = 0;
4148  dispatch->th_dispatch_sh_current = 0;
4149 
4150  dispatch->th_deo_fcn = 0; /* ORDERED */
4151  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4152  }
4153 
4154  this_thr->th.th_next_pool = NULL;
4155 
4156  if (!this_thr->th.th_task_state_memo_stack) {
4157  this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4158  this_thr->th.th_task_state_top = 0;
4159  this_thr->th.th_task_state_stack_sz = 4;
4160  }
4161 
4162  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4163  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4164 
4165  KMP_MB();
4166 }
4167 
4168 
4169 /* allocate a new thread for the requesting team. this is only called from within a
4170  * forkjoin critical section. we will first try to get an available thread from the
4171  * thread pool. if none is available, we will fork a new one assuming we are able
4172  * to create a new one. this should be assured, as the caller should check on this
4173  * first.
4174  */
4175 kmp_info_t *
4176 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4177 {
4178  kmp_team_t *serial_team;
4179  kmp_info_t *new_thr;
4180  int new_gtid;
4181 
4182  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4183  KMP_DEBUG_ASSERT( root && team );
4184 #if !KMP_NESTED_HOT_TEAMS
4185  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4186 #endif
4187  KMP_MB();
4188 
4189  /* first, try to get one from the thread pool */
4190  if ( __kmp_thread_pool ) {
4191 
4192  new_thr = (kmp_info_t*)__kmp_thread_pool;
4193  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4194  if ( new_thr == __kmp_thread_pool_insert_pt ) {
4195  __kmp_thread_pool_insert_pt = NULL;
4196  }
4197  TCW_4(new_thr->th.th_in_pool, FALSE);
4198  //
4199  // Don't touch th_active_in_pool or th_active.
4200  // The worker thread adjusts those flags as it sleeps/awakens.
4201  //
4202 
4203  __kmp_thread_pool_nth--;
4204 
4205  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4206  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4207  KMP_ASSERT( ! new_thr->th.th_team );
4208  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4209  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4210 
4211  /* setup the thread structure */
4212  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4213  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4214 
4215  TCW_4(__kmp_nth, __kmp_nth + 1);
4216 
4217  new_thr->th.th_task_state_top = 0;
4218  new_thr->th.th_task_state_stack_sz = 4;
4219 
4220 #ifdef KMP_ADJUST_BLOCKTIME
4221  /* Adjust blocktime back to zero if necessary */
4222  /* Middle initialization might not have occurred yet */
4223  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4224  if ( __kmp_nth > __kmp_avail_proc ) {
4225  __kmp_zero_bt = TRUE;
4226  }
4227  }
4228 #endif /* KMP_ADJUST_BLOCKTIME */
4229 
4230 #if KMP_DEBUG
4231  // If thread entered pool via __kmp_free_thread, wait_flag should != KMP_BARRIER_PARENT_FLAG.
4232  int b;
4233  kmp_balign_t * balign = new_thr->th.th_bar;
4234  for( b = 0; b < bs_last_barrier; ++ b )
4235  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4236 #endif
4237 
4238  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4239  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4240 
4241  KMP_MB();
4242  return new_thr;
4243  }
4244 
4245 
4246  /* no, well fork a new one */
4247  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
4248  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4249 
4250  //
4251  // If this is the first worker thread the RTL is creating, then also
4252  // launch the monitor thread. We try to do this as early as possible.
4253  //
4254  if ( ! TCR_4( __kmp_init_monitor ) ) {
4255  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4256  if ( ! TCR_4( __kmp_init_monitor ) ) {
4257  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4258  TCW_4( __kmp_init_monitor, 1 );
4259  __kmp_create_monitor( & __kmp_monitor );
4260  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4261  #if KMP_OS_WINDOWS
4262  // AC: wait until monitor has started. This is a fix for CQ232808.
4263  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
4264  // work in between, then there is high probability that monitor thread started after
4265  // the library shutdown. At shutdown it is too late to cope with the problem, because
4266  // when the master is in DllMain (process detach) the monitor has no chances to start
4267  // (it is blocked), and master has no means to inform the monitor that the library has gone,
4268  // because all the memory which the monitor can access is going to be released/reset.
4269  while ( TCR_4(__kmp_init_monitor) < 2 ) {
4270  KMP_YIELD( TRUE );
4271  }
4272  KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4273  #endif
4274  }
4275  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4276  }
4277 
4278  KMP_MB();
4279  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4280  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4281  }
4282 
4283  /* allocate space for it. */
4284  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4285 
4286  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4287 
4288  if ( __kmp_storage_map ) {
4289  __kmp_print_thread_storage_map( new_thr, new_gtid );
4290  }
4291 
4292  /* add the reserve serialized team, initialized from the team's master thread */
4293  {
4294  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4295  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4296  new_thr->th.th_serial_team = serial_team =
4297  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4298 #if OMPT_SUPPORT
4299  0, // root parallel id
4300 #endif
4301 #if OMP_40_ENABLED
4302  proc_bind_default,
4303 #endif
4304  &r_icvs,
4305  0 USE_NESTED_HOT_ARG(NULL) );
4306  }
4307  KMP_ASSERT ( serial_team );
4308  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
4309  serial_team->t.t_threads[0] = new_thr;
4310  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4311  new_thr ) );
4312 
4313  /* setup the thread structures */
4314  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4315 
4316  #if USE_FAST_MEMORY
4317  __kmp_initialize_fast_memory( new_thr );
4318  #endif /* USE_FAST_MEMORY */
4319 
4320  #if KMP_USE_BGET
4321  KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4322  __kmp_initialize_bget( new_thr );
4323  #endif
4324 
4325  __kmp_init_random( new_thr ); // Initialize random number generator
4326 
4327  /* Initialize these only once when thread is grabbed for a team allocation */
4328  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4329  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4330 
4331  int b;
4332  kmp_balign_t * balign = new_thr->th.th_bar;
4333  for(b=0; b<bs_last_barrier; ++b) {
4334  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4335  balign[b].bb.team = NULL;
4336  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4337  balign[b].bb.use_oncore_barrier = 0;
4338  }
4339 
4340  new_thr->th.th_spin_here = FALSE;
4341  new_thr->th.th_next_waiting = 0;
4342 
4343 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4344  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4345  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4346  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4347  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4348 #endif
4349 
4350  TCW_4(new_thr->th.th_in_pool, FALSE);
4351  new_thr->th.th_active_in_pool = FALSE;
4352  TCW_4(new_thr->th.th_active, TRUE);
4353 
4354  /* adjust the global counters */
4355  __kmp_all_nth ++;
4356  __kmp_nth ++;
4357 
4358  //
4359  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4360  // for low numbers of procs, and method #2 (keyed API call) for higher
4361  // numbers of procs.
4362  //
4363  if ( __kmp_adjust_gtid_mode ) {
4364  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4365  if ( TCR_4(__kmp_gtid_mode) != 2) {
4366  TCW_4(__kmp_gtid_mode, 2);
4367  }
4368  }
4369  else {
4370  if (TCR_4(__kmp_gtid_mode) != 1 ) {
4371  TCW_4(__kmp_gtid_mode, 1);
4372  }
4373  }
4374  }
4375 
4376 #ifdef KMP_ADJUST_BLOCKTIME
4377  /* Adjust blocktime back to zero if necessary */
4378  /* Middle initialization might not have occurred yet */
4379  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4380  if ( __kmp_nth > __kmp_avail_proc ) {
4381  __kmp_zero_bt = TRUE;
4382  }
4383  }
4384 #endif /* KMP_ADJUST_BLOCKTIME */
4385 
4386  /* actually fork it and create the new worker thread */
4387  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4388  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4389  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4390 
4391 
4392  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4393  KMP_MB();
4394  return new_thr;
4395 }
4396 
4397 /*
4398  * reinitialize team for reuse.
4399  *
4400  * The hot team code calls this case at every fork barrier, so EPCC barrier
4401  * test are extremely sensitive to changes in it, esp. writes to the team
4402  * struct, which cause a cache invalidation in all threads.
4403  *
4404  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4405  */
4406 static void
4407 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4408  KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4409  team->t.t_threads[0], team ) );
4410  KMP_DEBUG_ASSERT( team && new_icvs);
4411  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4412  team->t.t_ident = loc;
4413 
4414  team->t.t_id = KMP_GEN_TEAM_ID();
4415 
4416  // Copy ICVs to the master thread's implicit taskdata
4417  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4418  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4419 
4420  KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4421  team->t.t_threads[0], team ) );
4422 }
4423 
4424 
4425 /* initialize the team data structure
4426  * this assumes the t_threads and t_max_nproc are already set
4427  * also, we don't touch the arguments */
4428 static void
4429 __kmp_initialize_team(
4430  kmp_team_t * team,
4431  int new_nproc,
4432  kmp_internal_control_t * new_icvs,
4433  ident_t * loc
4434 ) {
4435  KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4436 
4437  /* verify */
4438  KMP_DEBUG_ASSERT( team );
4439  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4440  KMP_DEBUG_ASSERT( team->t.t_threads );
4441  KMP_MB();
4442 
4443  team->t.t_master_tid = 0; /* not needed */
4444  /* team->t.t_master_bar; not needed */
4445  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4446  team->t.t_nproc = new_nproc;
4447 
4448  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4449  team->t.t_next_pool = NULL;
4450  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4451 
4452  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4453  team->t.t_invoke = NULL; /* not needed */
4454 
4455  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4456  team->t.t_sched = new_icvs->sched;
4457 
4458 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4459  team->t.t_fp_control_saved = FALSE; /* not needed */
4460  team->t.t_x87_fpu_control_word = 0; /* not needed */
4461  team->t.t_mxcsr = 0; /* not needed */
4462 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4463 
4464  team->t.t_construct = 0;
4465  __kmp_init_lock( & team->t.t_single_lock );
4466 
4467  team->t.t_ordered .dt.t_value = 0;
4468  team->t.t_master_active = FALSE;
4469 
4470  memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4471 
4472 #ifdef KMP_DEBUG
4473  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4474 #endif
4475  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4476 
4477  team->t.t_control_stack_top = NULL;
4478 
4479  __kmp_reinitialize_team( team, new_icvs, loc );
4480 
4481  KMP_MB();
4482  KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4483 }
4484 
4485 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4486 /* Sets full mask for thread and returns old mask, no changes to structures. */
4487 static void
4488 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4489 {
4490  if ( KMP_AFFINITY_CAPABLE() ) {
4491  int status;
4492  if ( old_mask != NULL ) {
4493  status = __kmp_get_system_affinity( old_mask, TRUE );
4494  int error = errno;
4495  if ( status != 0 ) {
4496  __kmp_msg(
4497  kmp_ms_fatal,
4498  KMP_MSG( ChangeThreadAffMaskError ),
4499  KMP_ERR( error ),
4500  __kmp_msg_null
4501  );
4502  }
4503  }
4504  __kmp_set_system_affinity( __kmp_affinity_get_fullMask(), TRUE );
4505  }
4506 }
4507 #endif
4508 
4509 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4510 
4511 //
4512 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4513 // It calculats the worker + master thread's partition based upon the parent
4514 // thread's partition, and binds each worker to a thread in their partition.
4515 // The master thread's partition should already include its current binding.
4516 //
4517 static void
4518 __kmp_partition_places( kmp_team_t *team )
4519 {
4520  //
4521  // Copy the master thread's place partion to the team struct
4522  //
4523  kmp_info_t *master_th = team->t.t_threads[0];
4524  KMP_DEBUG_ASSERT( master_th != NULL );
4525  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4526  int first_place = master_th->th.th_first_place;
4527  int last_place = master_th->th.th_last_place;
4528  int masters_place = master_th->th.th_current_place;
4529  team->t.t_first_place = first_place;
4530  team->t.t_last_place = last_place;
4531 
4532  KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4533  proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4534  masters_place, first_place, last_place ) );
4535 
4536  switch ( proc_bind ) {
4537 
4538  case proc_bind_default:
4539  //
4540  // serial teams might have the proc_bind policy set to
4541  // proc_bind_default. It doesn't matter, as we don't
4542  // rebind the master thread for any proc_bind policy.
4543  //
4544  KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4545  break;
4546 
4547  case proc_bind_master:
4548  {
4549  int f;
4550  int n_th = team->t.t_nproc;
4551  for ( f = 1; f < n_th; f++ ) {
4552  kmp_info_t *th = team->t.t_threads[f];
4553  KMP_DEBUG_ASSERT( th != NULL );
4554  th->th.th_first_place = first_place;
4555  th->th.th_last_place = last_place;
4556  th->th.th_new_place = masters_place;
4557 
4558  KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4559  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4560  team->t.t_id, f, masters_place, first_place, last_place ) );
4561  }
4562  }
4563  break;
4564 
4565  case proc_bind_close:
4566  {
4567  int f;
4568  int n_th = team->t.t_nproc;
4569  int n_places;
4570  if ( first_place <= last_place ) {
4571  n_places = last_place - first_place + 1;
4572  }
4573  else {
4574  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4575  }
4576  if ( n_th <= n_places ) {
4577  int place = masters_place;
4578  for ( f = 1; f < n_th; f++ ) {
4579  kmp_info_t *th = team->t.t_threads[f];
4580  KMP_DEBUG_ASSERT( th != NULL );
4581 
4582  if ( place == last_place ) {
4583  place = first_place;
4584  }
4585  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4586  place = 0;
4587  }
4588  else {
4589  place++;
4590  }
4591  th->th.th_first_place = first_place;
4592  th->th.th_last_place = last_place;
4593  th->th.th_new_place = place;
4594 
4595  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4596  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4597  team->t.t_id, f, place, first_place, last_place ) );
4598  }
4599  }
4600  else {
4601  int S, rem, gap, s_count;
4602  S = n_th / n_places;
4603  s_count = 0;
4604  rem = n_th - ( S * n_places );
4605  gap = rem > 0 ? n_places/rem : n_places;
4606  int place = masters_place;
4607  int gap_ct = gap;
4608  for ( f = 0; f < n_th; f++ ) {
4609  kmp_info_t *th = team->t.t_threads[f];
4610  KMP_DEBUG_ASSERT( th != NULL );
4611 
4612  th->th.th_first_place = first_place;
4613  th->th.th_last_place = last_place;
4614  th->th.th_new_place = place;
4615  s_count++;
4616 
4617  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4618  // do nothing, add an extra thread to place on next iteration
4619  }
4620  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4621  // we added an extra thread to this place; move to next place
4622  if ( place == last_place ) {
4623  place = first_place;
4624  }
4625  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4626  place = 0;
4627  }
4628  else {
4629  place++;
4630  }
4631  s_count = 0;
4632  gap_ct = 1;
4633  rem--;
4634  }
4635  else if (s_count == S) { // place full; don't add extra
4636  if ( place == last_place ) {
4637  place = first_place;
4638  }
4639  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4640  place = 0;
4641  }
4642  else {
4643  place++;
4644  }
4645  gap_ct++;
4646  s_count = 0;
4647  }
4648 
4649  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4650  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4651  team->t.t_id, f, th->th.th_new_place, first_place,
4652  last_place ) );
4653  }
4654  KMP_DEBUG_ASSERT( place == masters_place );
4655  }
4656  }
4657  break;
4658 
4659  case proc_bind_spread:
4660  {
4661  int f;
4662  int n_th = team->t.t_nproc;
4663  int n_places;
4664  if ( first_place <= last_place ) {
4665  n_places = last_place - first_place + 1;
4666  }
4667  else {
4668  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4669  }
4670  if ( n_th <= n_places ) {
4671  int place = masters_place;
4672  int S = n_places/n_th;
4673  int s_count, rem, gap, gap_ct;
4674  rem = n_places - n_th*S;
4675  gap = rem ? n_th/rem : 1;
4676  gap_ct = gap;
4677  for ( f = 0; f < n_th; f++ ) {
4678  kmp_info_t *th = team->t.t_threads[f];
4679  KMP_DEBUG_ASSERT( th != NULL );
4680 
4681  th->th.th_first_place = place;
4682  th->th.th_new_place = place;
4683  s_count = 1;
4684  while (s_count < S) {
4685  if ( place == last_place ) {
4686  place = first_place;
4687  }
4688  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4689  place = 0;
4690  }
4691  else {
4692  place++;
4693  }
4694  s_count++;
4695  }
4696  if (rem && (gap_ct == gap)) {
4697  if ( place == last_place ) {
4698  place = first_place;
4699  }
4700  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4701  place = 0;
4702  }
4703  else {
4704  place++;
4705  }
4706  rem--;
4707  gap_ct = 0;
4708  }
4709  th->th.th_last_place = place;
4710  gap_ct++;
4711 
4712  if ( place == last_place ) {
4713  place = first_place;
4714  }
4715  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4716  place = 0;
4717  }
4718  else {
4719  place++;
4720  }
4721 
4722  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4723  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4724  team->t.t_id, f, th->th.th_new_place,
4725  th->th.th_first_place, th->th.th_last_place ) );
4726  }
4727  KMP_DEBUG_ASSERT( place == masters_place );
4728  }
4729  else {
4730  int S, rem, gap, s_count;
4731  S = n_th / n_places;
4732  s_count = 0;
4733  rem = n_th - ( S * n_places );
4734  gap = rem > 0 ? n_places/rem : n_places;
4735  int place = masters_place;
4736  int gap_ct = gap;
4737  for ( f = 0; f < n_th; f++ ) {
4738  kmp_info_t *th = team->t.t_threads[f];
4739  KMP_DEBUG_ASSERT( th != NULL );
4740 
4741  th->th.th_first_place = place;
4742  th->th.th_last_place = place;
4743  th->th.th_new_place = place;
4744  s_count++;
4745 
4746  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4747  // do nothing, add an extra thread to place on next iteration
4748  }
4749  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4750  // we added an extra thread to this place; move on to next place
4751  if ( place == last_place ) {
4752  place = first_place;
4753  }
4754  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4755  place = 0;
4756  }
4757  else {
4758  place++;
4759  }
4760  s_count = 0;
4761  gap_ct = 1;
4762  rem--;
4763  }
4764  else if (s_count == S) { // place is full; don't add extra thread
4765  if ( place == last_place ) {
4766  place = first_place;
4767  }
4768  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4769  place = 0;
4770  }
4771  else {
4772  place++;
4773  }
4774  gap_ct++;
4775  s_count = 0;
4776  }
4777 
4778  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4779  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4780  team->t.t_id, f, th->th.th_new_place,
4781  th->th.th_first_place, th->th.th_last_place) );
4782  }
4783  KMP_DEBUG_ASSERT( place == masters_place );
4784  }
4785  }
4786  break;
4787 
4788  default:
4789  break;
4790  }
4791 
4792  KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4793 }
4794 
4795 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4796 
4797 /* allocate a new team data structure to use. take one off of the free pool if available */
4798 kmp_team_t *
4799 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4800 #if OMPT_SUPPORT
4801  ompt_parallel_id_t ompt_parallel_id,
4802 #endif
4803 #if OMP_40_ENABLED
4804  kmp_proc_bind_t new_proc_bind,
4805 #endif
4806  kmp_internal_control_t *new_icvs,
4807  int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4808 {
4809  KMP_TIME_BLOCK(KMP_allocate_team);
4810  int f;
4811  kmp_team_t *team;
4812  char *ptr;
4813  size_t size;
4814  int use_hot_team = ! root->r.r_active;
4815  int level = 0;
4816 
4817  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4818  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4819  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4820  KMP_MB();
4821 
4822 #if KMP_NESTED_HOT_TEAMS
4823  kmp_hot_team_ptr_t *hot_teams;
4824  if( master ) {
4825  team = master->th.th_team;
4826  level = team->t.t_active_level;
4827  if( master->th.th_teams_microtask ) { // in teams construct?
4828  if( master->th.th_teams_size.nteams > 1 && ( // #teams > 1
4829  team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4830  master->th.th_teams_level < team->t.t_level ) ) { // or nested parallel inside the teams
4831  ++level; // not increment if #teams==1, or for outer fork of the teams; increment otherwise
4832  }
4833  }
4834  hot_teams = master->th.th_hot_teams;
4835  if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4836  { // hot team has already been allocated for given level
4837  use_hot_team = 1;
4838  } else {
4839  use_hot_team = 0;
4840  }
4841  }
4842 #endif
4843  // Optimization to use a "hot" team
4844  if( use_hot_team && new_nproc > 1 ) {
4845  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4846 #if KMP_NESTED_HOT_TEAMS
4847  team = hot_teams[level].hot_team;
4848 #else
4849  team = root->r.r_hot_team;
4850 #endif
4851 #if KMP_DEBUG
4852  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4853  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4854  team->t.t_task_team[0], team->t.t_task_team[1] ));
4855  }
4856 #endif
4857 
4858  // Has the number of threads changed?
4859  /* Let's assume the most common case is that the number of threads is unchanged, and
4860  put that case first. */
4861  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4862  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4863  // This case can mean that omp_set_num_threads() was called and the hot team size
4864  // was already reduced, so we check the special flag
4865  if ( team->t.t_size_changed == -1 ) {
4866  team->t.t_size_changed = 1;
4867  } else {
4868  team->t.t_size_changed = 0;
4869  }
4870 
4871  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4872  team->t.t_sched = new_icvs->sched;
4873 
4874  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4875 
4876  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4877  0, team->t.t_threads[0], team ) );
4878  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4879 
4880 #if OMP_40_ENABLED
4881 # if KMP_AFFINITY_SUPPORTED
4882  if ( ( team->t.t_size_changed == 0 )
4883  && ( team->t.t_proc_bind == new_proc_bind ) ) {
4884  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4885  team->t.t_id, new_proc_bind, team->t.t_first_place,
4886  team->t.t_last_place ) );
4887  }
4888  else {
4889  team->t.t_proc_bind = new_proc_bind;
4890  __kmp_partition_places( team );
4891  }
4892 # else
4893  if ( team->t.t_proc_bind != new_proc_bind ) {
4894  team->t.t_proc_bind = new_proc_bind;
4895  }
4896 # endif /* KMP_AFFINITY_SUPPORTED */
4897 #endif /* OMP_40_ENABLED */
4898  }
4899  else if( team->t.t_nproc > new_nproc ) {
4900  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4901 
4902  team->t.t_size_changed = 1;
4903  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4904  // Signal the worker threads (esp. extra ones) to stop looking for tasks while spin waiting.
4905  // The task teams are reference counted and will be deallocated by the last worker thread.
4906  int tt_idx;
4907  for (tt_idx=0; tt_idx<2; ++tt_idx) {
4908  // We don't know which of the two task teams workers are waiting on, so deactivate both.
4909  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
4910  if ( (task_team != NULL) && TCR_SYNC_4(task_team->tt.tt_active) ) {
4911  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
4912  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
4913  KMP_MB();
4914  KA_TRACE(20, ("__kmp_allocate_team: setting task_team %p to NULL\n",
4915  &team->t.t_task_team[tt_idx]));
4916  team->t.t_task_team[tt_idx] = NULL;
4917  }
4918  else {
4919  KMP_DEBUG_ASSERT( task_team == NULL );
4920  }
4921  }
4922  }
4923 #if KMP_NESTED_HOT_TEAMS
4924  if( __kmp_hot_teams_mode == 0 ) {
4925  // AC: saved number of threads should correspond to team's value in this mode,
4926  // can be bigger in mode 1, when hot team has some threads in reserve
4927  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4928  hot_teams[level].hot_team_nth = new_nproc;
4929 #endif // KMP_NESTED_HOT_TEAMS
4930  /* release the extra threads we don't need any more */
4931  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
4932  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4933  __kmp_free_thread( team->t.t_threads[ f ] );
4934  team->t.t_threads[ f ] = NULL;
4935  }
4936 #if KMP_NESTED_HOT_TEAMS
4937  } // (__kmp_hot_teams_mode == 0)
4938 #endif // KMP_NESTED_HOT_TEAMS
4939  team->t.t_nproc = new_nproc;
4940  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4941  team->t.t_sched = new_icvs->sched;
4942  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4943 
4944  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4945  // Init both task teams
4946  int tt_idx;
4947  for (tt_idx=0; tt_idx<2; ++tt_idx) {
4948  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
4949  if ( task_team != NULL ) {
4950  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
4951  task_team->tt.tt_nproc = new_nproc;
4952  task_team->tt.tt_unfinished_threads = new_nproc;
4953  task_team->tt.tt_ref_ct = new_nproc - 1;
4954  }
4955  }
4956  }
4957 
4958  /* update the remaining threads */
4959  if (level) {
4960  team->t.t_threads[0]->th.th_team_nproc = new_nproc;
4961  for(f = 1; f < new_nproc; ++f) {
4962  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4963  team->t.t_threads[f]->th.th_task_state = 0;
4964  }
4965  }
4966  else {
4967  for(f = 0; f < new_nproc; ++f) {
4968  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4969  }
4970  }
4971  // restore the current task state of the master thread: should be the implicit task
4972  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4973  0, team->t.t_threads[0], team ) );
4974 
4975  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4976 
4977 #ifdef KMP_DEBUG
4978  for ( f = 0; f < team->t.t_nproc; f++ ) {
4979  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4980  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4981  }
4982 #endif
4983 
4984 #if OMP_40_ENABLED
4985  team->t.t_proc_bind = new_proc_bind;
4986 # if KMP_AFFINITY_SUPPORTED
4987  __kmp_partition_places( team );
4988 # endif
4989 #endif
4990  }
4991  else { // team->t.t_nproc < new_nproc
4992 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4993  kmp_affin_mask_t *old_mask;
4994  if ( KMP_AFFINITY_CAPABLE() ) {
4995  KMP_CPU_ALLOC(old_mask);
4996  }
4997 #endif
4998 
4999  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
5000 
5001  team->t.t_size_changed = 1;
5002 
5003 
5004 #if KMP_NESTED_HOT_TEAMS
5005  int avail_threads = hot_teams[level].hot_team_nth;
5006  if( new_nproc < avail_threads )
5007  avail_threads = new_nproc;
5008  kmp_info_t **other_threads = team->t.t_threads;
5009  for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
5010  // Adjust barrier data of reserved threads (if any) of the team
5011  // Other data will be set in __kmp_initialize_info() below.
5012  int b;
5013  kmp_balign_t * balign = other_threads[f]->th.th_bar;
5014  for ( b = 0; b < bs_last_barrier; ++ b ) {
5015  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5016  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5017 #if USE_DEBUGGER
5018  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5019 #endif
5020  }
5021  }
5022  if( hot_teams[level].hot_team_nth >= new_nproc ) {
5023  // we have all needed threads in reserve, no need to allocate any
5024  // this only possible in mode 1, cannot have reserved threads in mode 0
5025  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5026  team->t.t_nproc = new_nproc; // just get reserved threads involved
5027  } else {
5028  // we may have some threads in reserve, but not enough
5029  team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
5030  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5031 #endif // KMP_NESTED_HOT_TEAMS
5032  if(team->t.t_max_nproc < new_nproc) {
5033  /* reallocate larger arrays */
5034  __kmp_reallocate_team_arrays(team, new_nproc);
5035  __kmp_reinitialize_team( team, new_icvs, NULL );
5036  }
5037 
5038 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5039  /* Temporarily set full mask for master thread before
5040  creation of workers. The reason is that workers inherit
5041  the affinity from master, so if a lot of workers are
5042  created on the single core quickly, they don't get
5043  a chance to set their own affinity for a long time.
5044  */
5045  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
5046 #endif
5047 
5048  /* allocate new threads for the hot team */
5049  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
5050  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
5051  KMP_DEBUG_ASSERT( new_worker );
5052  team->t.t_threads[ f ] = new_worker;
5053 
5054  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%llu, plain=%llu\n",
5055  team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
5056  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5057  team->t.t_bar[bs_plain_barrier].b_arrived ) );
5058 
5059  { // Initialize barrier data for new threads.
5060  int b;
5061  kmp_balign_t * balign = new_worker->th.th_bar;
5062  for( b = 0; b < bs_last_barrier; ++ b ) {
5063  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5064  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5065 #if USE_DEBUGGER
5066  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5067 #endif
5068  }
5069  }
5070  }
5071 
5072 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5073  if ( KMP_AFFINITY_CAPABLE() ) {
5074  /* Restore initial master thread's affinity mask */
5075  __kmp_set_system_affinity( old_mask, TRUE );
5076  KMP_CPU_FREE(old_mask);
5077  }
5078 #endif
5079 #if KMP_NESTED_HOT_TEAMS
5080  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5081 #endif // KMP_NESTED_HOT_TEAMS
5082  /* make sure everyone is syncronized */
5083  __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
5084 
5085  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5086  // Signal the worker threads to stop looking for tasks while spin waiting.
5087  // The task teams are reference counted and will be deallocated by the last worker thread.
5088  int tt_idx;
5089  for (tt_idx=0; tt_idx<2; ++tt_idx) {
5090  // We don't know which of the two task teams workers are waiting on, so deactivate both.
5091  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5092  if ( (task_team != NULL) && TCR_SYNC_4(task_team->tt.tt_active) ) {
5093  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
5094  team->t.t_task_team[tt_idx] = NULL;
5095  }
5096  }
5097  }
5098 
5099  /* reinitialize the threads */
5100  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5101  if (level) {
5102  int old_state = team->t.t_threads[0]->th.th_task_state;
5103  for (f=0; f < team->t.t_nproc; ++f)
5104  __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
5105  // th_task_state for master thread will be put in stack of states in __kmp_fork_call()
5106  // before zeroing, for workers it was just zeroed in __kmp_initialize_info()
5107  team->t.t_threads[0]->th.th_task_state = old_state;
5108  }
5109  else {
5110  int old_state = team->t.t_threads[0]->th.th_task_state;
5111  for (f=0; f<team->t.t_nproc; ++f) {
5112  __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
5113  team->t.t_threads[f]->th.th_task_state = old_state;
5114  team->t.t_threads[f]->th.th_task_team = team->t.t_task_team[old_state];
5115  }
5116  }
5117 
5118 #ifdef KMP_DEBUG
5119  for ( f = 0; f < team->t.t_nproc; ++ f ) {
5120  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5121  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5122  }
5123 #endif
5124 
5125 #if OMP_40_ENABLED
5126  team->t.t_proc_bind = new_proc_bind;
5127 # if KMP_AFFINITY_SUPPORTED
5128  __kmp_partition_places( team );
5129 # endif
5130 #endif
5131  } // Check changes in number of threads
5132 
5133 #if OMP_40_ENABLED
5134  kmp_info_t *master = team->t.t_threads[0];
5135  if( master->th.th_teams_microtask ) {
5136  for( f = 1; f < new_nproc; ++f ) {
5137  // propagate teams construct specific info to workers
5138  kmp_info_t *thr = team->t.t_threads[f];
5139  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5140  thr->th.th_teams_level = master->th.th_teams_level;
5141  thr->th.th_teams_size = master->th.th_teams_size;
5142  }
5143  }
5144 #endif /* OMP_40_ENABLED */
5145 #if KMP_NESTED_HOT_TEAMS
5146  if( level ) {
5147  // Sync barrier state for nested hot teams, not needed for outermost hot team.
5148  for( f = 1; f < new_nproc; ++f ) {
5149  kmp_info_t *thr = team->t.t_threads[f];
5150  int b;
5151  kmp_balign_t * balign = thr->th.th_bar;
5152  for( b = 0; b < bs_last_barrier; ++ b ) {
5153  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5154  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5155 #if USE_DEBUGGER
5156  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5157 #endif
5158  }
5159  }
5160  }
5161 #endif // KMP_NESTED_HOT_TEAMS
5162 
5163  /* reallocate space for arguments if necessary */
5164  __kmp_alloc_argv_entries( argc, team, TRUE );
5165  team->t.t_argc = argc;
5166  //
5167  // The hot team re-uses the previous task team,
5168  // if untouched during the previous release->gather phase.
5169  //
5170 
5171  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5172 
5173 #if KMP_DEBUG
5174  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5175  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5176  team->t.t_task_team[0], team->t.t_task_team[1] ));
5177  }
5178 #endif
5179 
5180 #if OMPT_SUPPORT
5181  __ompt_team_assign_id(team, ompt_parallel_id);
5182 #endif
5183 
5184  KMP_MB();
5185 
5186  return team;
5187  }
5188 
5189  /* next, let's try to take one from the team pool */
5190  KMP_MB();
5191  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5192  {
5193  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5194  if ( team->t.t_max_nproc >= max_nproc ) {
5195  /* take this team from the team pool */
5196  __kmp_team_pool = team->t.t_next_pool;
5197 
5198  /* setup the team for fresh use */
5199  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5200 
5201  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5202  &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5203  team->t.t_task_team[0] = NULL;
5204  team->t.t_task_team[1] = NULL;
5205 
5206  /* reallocate space for arguments if necessary */
5207  __kmp_alloc_argv_entries( argc, team, TRUE );
5208  team->t.t_argc = argc;
5209 
5210  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5211  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5212  { // Initialize barrier data.
5213  int b;
5214  for ( b = 0; b < bs_last_barrier; ++ b) {
5215  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5216 #if USE_DEBUGGER
5217  team->t.t_bar[ b ].b_master_arrived = 0;
5218  team->t.t_bar[ b ].b_team_arrived = 0;
5219 #endif
5220  }
5221  }
5222 
5223 #if OMP_40_ENABLED
5224  team->t.t_proc_bind = new_proc_bind;
5225 #endif
5226 
5227  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5228 
5229 #if OMPT_SUPPORT
5230  __ompt_team_assign_id(team, ompt_parallel_id);
5231 #endif
5232 
5233  KMP_MB();
5234 
5235  return team;
5236  }
5237 
5238  /* reap team if it is too small, then loop back and check the next one */
5239  /* not sure if this is wise, but, will be redone during the hot-teams rewrite. */
5240  /* TODO: Use technique to find the right size hot-team, don't reap them */
5241  team = __kmp_reap_team( team );
5242  __kmp_team_pool = team;
5243  }
5244 
5245  /* nothing available in the pool, no matter, make a new team! */
5246  KMP_MB();
5247  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5248 
5249  /* and set it up */
5250  team->t.t_max_nproc = max_nproc;
5251  /* NOTE well, for some reason allocating one big buffer and dividing it
5252  * up seems to really hurt performance a lot on the P4, so, let's not use
5253  * this... */
5254  __kmp_allocate_team_arrays( team, max_nproc );
5255 
5256  KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5257  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5258 
5259  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5260  &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5261  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5262  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5263 
5264  if ( __kmp_storage_map ) {
5265  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5266  }
5267 
5268  /* allocate space for arguments */
5269  __kmp_alloc_argv_entries( argc, team, FALSE );
5270  team->t.t_argc = argc;
5271 
5272  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5273  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5274  { // Initialize barrier data.
5275  int b;
5276  for ( b = 0; b < bs_last_barrier; ++ b ) {
5277  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5278 #if USE_DEBUGGER
5279  team->t.t_bar[ b ].b_master_arrived = 0;
5280  team->t.t_bar[ b ].b_team_arrived = 0;
5281 #endif
5282  }
5283  }
5284 
5285 #if OMP_40_ENABLED
5286  team->t.t_proc_bind = new_proc_bind;
5287 #endif
5288 
5289 #if OMPT_SUPPORT
5290  __ompt_team_assign_id(team, ompt_parallel_id);
5291  team->t.ompt_serialized_team_info = NULL;
5292 #endif
5293 
5294  KMP_MB();
5295 
5296  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5297 
5298  return team;
5299 }
5300 
5301 /* TODO implement hot-teams at all levels */
5302 /* TODO implement lazy thread release on demand (disband request) */
5303 
5304 /* free the team. return it to the team pool. release all the threads
5305  * associated with it */
5306 void
5307 __kmp_free_team( kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master) )
5308 {
5309  int f;
5310  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5311 
5312  /* verify state */
5313  KMP_DEBUG_ASSERT( root );
5314  KMP_DEBUG_ASSERT( team );
5315  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5316  KMP_DEBUG_ASSERT( team->t.t_threads );
5317 
5318  int use_hot_team = team == root->r.r_hot_team;
5319 #if KMP_NESTED_HOT_TEAMS
5320  int level;
5321  kmp_hot_team_ptr_t *hot_teams;
5322  if( master ) {
5323  level = team->t.t_active_level - 1;
5324  if( master->th.th_teams_microtask ) { // in teams construct?
5325  if( master->th.th_teams_size.nteams > 1 ) {
5326  ++level; // level was not increased in teams construct for team_of_masters
5327  }
5328  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5329  master->th.th_teams_level == team->t.t_level ) {
5330  ++level; // level was not increased in teams construct for team_of_workers before the parallel
5331  } // team->t.t_level will be increased inside parallel
5332  }
5333  hot_teams = master->th.th_hot_teams;
5334  if( level < __kmp_hot_teams_max_level ) {
5335  KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5336  use_hot_team = 1;
5337  }
5338  }
5339 #endif // KMP_NESTED_HOT_TEAMS
5340 
5341  /* team is done working */
5342  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5343  team->t.t_copyin_counter = 0; // init counter for possible reuse
5344  // Do not reset pointer to parent team to NULL for hot teams.
5345 
5346  /* if we are non-hot team, release our threads */
5347  if( ! use_hot_team ) {
5348  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5349  int tt_idx;
5350  for (tt_idx=0; tt_idx<2; ++tt_idx) {
5351  // We don't know which of the two task teams workers are waiting on, so deactivate both.
5352  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5353  if ( task_team != NULL ) {
5354  // Signal the worker threads to stop looking for tasks while spin waiting. The task
5355  // teams are reference counted and will be deallocated by the last worker thread via the
5356  // thread's pointer to the task team.
5357  KA_TRACE( 20, ( "__kmp_free_team: deactivating task_team %p\n", task_team ) );
5358  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
5359  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
5360  KMP_MB();
5361  team->t.t_task_team[tt_idx] = NULL;
5362  }
5363  }
5364  }
5365 
5366  // Reset pointer to parent team only for non-hot teams.
5367  team->t.t_parent = NULL;
5368 
5369 
5370  /* free the worker threads */
5371  for ( f = 1; f < team->t.t_nproc; ++ f ) {
5372  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5373  __kmp_free_thread( team->t.t_threads[ f ] );
5374  team->t.t_threads[ f ] = NULL;
5375  }
5376 
5377 
5378  /* put the team back in the team pool */
5379  /* TODO limit size of team pool, call reap_team if pool too large */
5380  team->t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
5381  __kmp_team_pool = (volatile kmp_team_t*) team;
5382  }
5383 
5384  KMP_MB();
5385 }
5386 
5387 
5388 /* reap the team. destroy it, reclaim all its resources and free its memory */
5389 kmp_team_t *
5390 __kmp_reap_team( kmp_team_t *team )
5391 {
5392  kmp_team_t *next_pool = team->t.t_next_pool;
5393 
5394  KMP_DEBUG_ASSERT( team );
5395  KMP_DEBUG_ASSERT( team->t.t_dispatch );
5396  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5397  KMP_DEBUG_ASSERT( team->t.t_threads );
5398  KMP_DEBUG_ASSERT( team->t.t_argv );
5399 
5400  /* TODO clean the threads that are a part of this? */
5401 
5402  /* free stuff */
5403 
5404  __kmp_free_team_arrays( team );
5405  if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5406  __kmp_free( (void*) team->t.t_argv );
5407  __kmp_free( team );
5408 
5409  KMP_MB();
5410  return next_pool;
5411 }
5412 
5413 //
5414 // Free the thread. Don't reap it, just place it on the pool of available
5415 // threads.
5416 //
5417 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5418 // binding for the affinity mechanism to be useful.
5419 //
5420 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5421 // However, we want to avoid a potential performance problem by always
5422 // scanning through the list to find the correct point at which to insert
5423 // the thread (potential N**2 behavior). To do this we keep track of the
5424 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5425 // With single-level parallelism, threads will always be added to the tail
5426 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5427 // parallelism, all bets are off and we may need to scan through the entire
5428 // free list.
5429 //
5430 // This change also has a potentially large performance benefit, for some
5431 // applications. Previously, as threads were freed from the hot team, they
5432 // would be placed back on the free list in inverse order. If the hot team
5433 // grew back to it's original size, then the freed thread would be placed
5434 // back on the hot team in reverse order. This could cause bad cache
5435 // locality problems on programs where the size of the hot team regularly
5436 // grew and shrunk.
5437 //
5438 // Now, for single-level parallelism, the OMP tid is alway == gtid.
5439 //
5440 void
5441 __kmp_free_thread( kmp_info_t *this_th )
5442 {
5443  int gtid;
5444  kmp_info_t **scan;
5445 
5446  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5447  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5448 
5449  KMP_DEBUG_ASSERT( this_th );
5450 
5451  // When moving thread to pool, switch thread to wait on own b_go flag, and uninitialized (NULL team).
5452  int b;
5453  kmp_balign_t *balign = this_th->th.th_bar;
5454  for (b=0; b<bs_last_barrier; ++b) {
5455  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5456  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5457  balign[b].bb.team = NULL;
5458  }
5459 
5460 
5461  /* put thread back on the free pool */
5462  TCW_PTR(this_th->th.th_team, NULL);
5463  TCW_PTR(this_th->th.th_root, NULL);
5464  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5465 
5466  //
5467  // If the __kmp_thread_pool_insert_pt is already past the new insert
5468  // point, then we need to re-scan the entire list.
5469  //
5470  gtid = this_th->th.th_info.ds.ds_gtid;
5471  if ( __kmp_thread_pool_insert_pt != NULL ) {
5472  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5473  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5474  __kmp_thread_pool_insert_pt = NULL;
5475  }
5476  }
5477 
5478  //
5479  // Scan down the list to find the place to insert the thread.
5480  // scan is the address of a link in the list, possibly the address of
5481  // __kmp_thread_pool itself.
5482  //
5483  // In the absence of nested parallism, the for loop will have 0 iterations.
5484  //
5485  if ( __kmp_thread_pool_insert_pt != NULL ) {
5486  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5487  }
5488  else {
5489  scan = (kmp_info_t **)&__kmp_thread_pool;
5490  }
5491  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5492  scan = &( (*scan)->th.th_next_pool ) );
5493 
5494  //
5495  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5496  // to its address.
5497  //
5498  TCW_PTR(this_th->th.th_next_pool, *scan);
5499  __kmp_thread_pool_insert_pt = *scan = this_th;
5500  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5501  || ( this_th->th.th_info.ds.ds_gtid
5502  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5503  TCW_4(this_th->th.th_in_pool, TRUE);
5504  __kmp_thread_pool_nth++;
5505 
5506  TCW_4(__kmp_nth, __kmp_nth - 1);
5507 
5508 #ifdef KMP_ADJUST_BLOCKTIME
5509  /* Adjust blocktime back to user setting or default if necessary */
5510  /* Middle initialization might never have occurred */
5511  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5512  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5513  if ( __kmp_nth <= __kmp_avail_proc ) {
5514  __kmp_zero_bt = FALSE;
5515  }
5516  }
5517 #endif /* KMP_ADJUST_BLOCKTIME */
5518 
5519  KMP_MB();
5520 }
5521 
5522 
5523 /* ------------------------------------------------------------------------ */
5524 
5525 void *
5526 __kmp_launch_thread( kmp_info_t *this_thr )
5527 {
5528  int gtid = this_thr->th.th_info.ds.ds_gtid;
5529 /* void *stack_data;*/
5530  kmp_team_t *(*volatile pteam);
5531 
5532  KMP_MB();
5533  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5534 
5535  if( __kmp_env_consistency_check ) {
5536  this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
5537  }
5538 
5539 #if OMPT_SUPPORT
5540  if (ompt_status & ompt_status_track) {
5541  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5542  this_thr->th.ompt_thread_info.wait_id = 0;
5543  this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5544  if ((ompt_status == ompt_status_track_callback) &&
5545  ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5546  __ompt_thread_begin(ompt_thread_worker, gtid);
5547  }
5548  }
5549 #endif
5550 
5551  /* This is the place where threads wait for work */
5552  while( ! TCR_4(__kmp_global.g.g_done) ) {
5553  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5554  KMP_MB();
5555 
5556  /* wait for work to do */
5557  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5558 
5559 #if OMPT_SUPPORT
5560  if (ompt_status & ompt_status_track) {
5561  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5562  }
5563 #endif
5564 
5565  /* No tid yet since not part of a team */
5566  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5567 
5568 #if OMPT_SUPPORT
5569  if (ompt_status & ompt_status_track) {
5570  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5571  }
5572 #endif
5573 
5574  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
5575 
5576  /* have we been allocated? */
5577  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5578  /* we were just woken up, so run our new task */
5579  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5580  int rc;
5581  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5582  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5583 
5584  updateHWFPControl (*pteam);
5585 
5586 #if OMPT_SUPPORT
5587  if (ompt_status & ompt_status_track) {
5588  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5589  }
5590 #endif
5591 
5592  KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
5593  {
5594  KMP_TIME_BLOCK(USER_worker_invoke);
5595  rc = (*pteam)->t.t_invoke( gtid );
5596  }
5597  KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
5598  KMP_ASSERT( rc );
5599 
5600 #if OMPT_SUPPORT
5601  if (ompt_status & ompt_status_track) {
5602  /* no frame set while outside task */
5603  int tid = __kmp_tid_from_gtid(gtid);
5604  (*pteam)->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
5605 
5606  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5607  }
5608 #endif
5609  KMP_MB();
5610  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5611  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5612  }
5613  /* join barrier after parallel region */
5614  __kmp_join_barrier( gtid );
5615  }
5616  }
5617  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5618 
5619 #if OMPT_SUPPORT
5620  if ((ompt_status == ompt_status_track_callback) &&
5621  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5622  __ompt_thread_end(ompt_thread_worker, gtid);
5623  }
5624 #endif
5625 
5626  if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) {
5627  __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
5628  }
5629  /* run the destructors for the threadprivate data for this thread */
5630  __kmp_common_destroy_gtid( gtid );
5631 
5632  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5633  KMP_MB();
5634  return this_thr;
5635 }
5636 
5637 /* ------------------------------------------------------------------------ */
5638 /* ------------------------------------------------------------------------ */
5639 
5640 void
5641 __kmp_internal_end_dest( void *specific_gtid )
5642 {
5643  #if KMP_COMPILER_ICC
5644  #pragma warning( push )
5645  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
5646  #endif
5647  // Make sure no significant bits are lost
5648  int gtid = (kmp_intptr_t)specific_gtid - 1;
5649  #if KMP_COMPILER_ICC
5650  #pragma warning( pop )
5651  #endif
5652 
5653  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5654  /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5655  * this is because 0 is reserved for the nothing-stored case */
5656 
5657  /* josh: One reason for setting the gtid specific data even when it is being
5658  destroyed by pthread is to allow gtid lookup through thread specific data
5659  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5660  that gets executed in the call to __kmp_internal_end_thread, actually
5661  gets the gtid through the thread specific data. Setting it here seems
5662  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5663  to run smoothly.
5664  todo: get rid of this after we remove the dependence on
5665  __kmp_gtid_get_specific
5666  */
5667  if(gtid >= 0 && KMP_UBER_GTID(gtid))
5668  __kmp_gtid_set_specific( gtid );
5669  #ifdef KMP_TDATA_GTID
5670  __kmp_gtid = gtid;
5671  #endif
5672  __kmp_internal_end_thread( gtid );
5673 }
5674 
5675 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5676 
5677 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases destructors work
5678 // perfectly, but in real libiomp5.so I have no evidence it is ever called. However, -fini linker
5679 // option in makefile.mk works fine.
5680 
5681 __attribute__(( destructor ))
5682 void
5683 __kmp_internal_end_dtor( void )
5684 {
5685  __kmp_internal_end_atexit();
5686 }
5687 
5688 void
5689 __kmp_internal_end_fini( void )
5690 {
5691  __kmp_internal_end_atexit();
5692 }
5693 
5694 #endif
5695 
5696 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5697 void
5698 __kmp_internal_end_atexit( void )
5699 {
5700  KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5701  /* [Windows]
5702  josh: ideally, we want to completely shutdown the library in this atexit handler, but
5703  stat code that depends on thread specific data for gtid fails because that data becomes
5704  unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5705  instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the
5706  stat code and use __kmp_internal_end_library to cleanly shutdown the library.
5707 
5708 // TODO: Can some of this comment about GVS be removed?
5709  I suspect that the offending stat code is executed when the calling thread tries to
5710  clean up a dead root thread's data structures, resulting in GVS code trying to close
5711  the GVS structures for that thread, but since the stat code uses
5712  __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5713  cleaning up itself instead of another thread, it gets confused. This happens because
5714  allowing a thread to unregister and cleanup another thread is a recent modification for
5715  addressing an issue with Maxon Cinema4D. Based on the current design (20050722), a
5716  thread may end up trying to unregister another thread only if thread death does not
5717  trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread
5718  specific data destructor function to detect thread death. For Windows dynamic, there
5719  is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the
5720  workaround is applicable only for Windows static stat library.
5721  */
5722  __kmp_internal_end_library( -1 );
5723  #if KMP_OS_WINDOWS
5724  __kmp_close_console();
5725  #endif
5726 }
5727 
5728 static void
5729 __kmp_reap_thread(
5730  kmp_info_t * thread,
5731  int is_root
5732 ) {
5733 
5734  // It is assumed __kmp_forkjoin_lock is acquired.
5735 
5736  int gtid;
5737 
5738  KMP_DEBUG_ASSERT( thread != NULL );
5739 
5740  gtid = thread->th.th_info.ds.ds_gtid;
5741 
5742  if ( ! is_root ) {
5743 
5744  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5745  /* Assume the threads are at the fork barrier here */
5746  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5747  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5748  kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5749  __kmp_release_64(&flag);
5750  }; // if
5751 
5752 
5753  // Terminate OS thread.
5754  __kmp_reap_worker( thread );
5755 
5756  //
5757  // The thread was killed asynchronously. If it was actively
5758  // spinning in the in the thread pool, decrement the global count.
5759  //
5760  // There is a small timing hole here - if the worker thread was
5761  // just waking up after sleeping in the pool, had reset it's
5762  // th_active_in_pool flag but not decremented the global counter
5763  // __kmp_thread_pool_active_nth yet, then the global counter
5764  // might not get updated.
5765  //
5766  // Currently, this can only happen as the library is unloaded,
5767  // so there are no harmful side effects.
5768  //
5769  if ( thread->th.th_active_in_pool ) {
5770  thread->th.th_active_in_pool = FALSE;
5771  KMP_TEST_THEN_DEC32(
5772  (kmp_int32 *) &__kmp_thread_pool_active_nth );
5773  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5774  }
5775 
5776  // Decrement # of [worker] threads in the pool.
5777  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5778  --__kmp_thread_pool_nth;
5779  }; // if
5780 
5781  // Free the fast memory for tasking
5782  #if USE_FAST_MEMORY
5783  __kmp_free_fast_memory( thread );
5784  #endif /* USE_FAST_MEMORY */
5785 
5786  __kmp_suspend_uninitialize_thread( thread );
5787 
5788  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5789  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5790 
5791  -- __kmp_all_nth;
5792  // __kmp_nth was decremented when thread is added to the pool.
5793 
5794 #ifdef KMP_ADJUST_BLOCKTIME
5795  /* Adjust blocktime back to user setting or default if necessary */
5796  /* Middle initialization might never have occurred */
5797  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5798  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5799  if ( __kmp_nth <= __kmp_avail_proc ) {
5800  __kmp_zero_bt = FALSE;
5801  }
5802  }
5803 #endif /* KMP_ADJUST_BLOCKTIME */
5804 
5805  /* free the memory being used */
5806  if( __kmp_env_consistency_check ) {
5807  if ( thread->th.th_cons ) {
5808  __kmp_free_cons_stack( thread->th.th_cons );
5809  thread->th.th_cons = NULL;
5810  }; // if
5811  }
5812 
5813  if ( thread->th.th_pri_common != NULL ) {
5814  __kmp_free( thread->th.th_pri_common );
5815  thread->th.th_pri_common = NULL;
5816  }; // if
5817 
5818  if (thread->th.th_task_state_memo_stack != NULL) {
5819  __kmp_free(thread->th.th_task_state_memo_stack);
5820  thread->th.th_task_state_memo_stack = NULL;
5821  }
5822 
5823  #if KMP_USE_BGET
5824  if ( thread->th.th_local.bget_data != NULL ) {
5825  __kmp_finalize_bget( thread );
5826  }; // if
5827  #endif
5828 
5829 #if KMP_AFFINITY_SUPPORTED
5830  if ( thread->th.th_affin_mask != NULL ) {
5831  KMP_CPU_FREE( thread->th.th_affin_mask );
5832  thread->th.th_affin_mask = NULL;
5833  }; // if
5834 #endif /* KMP_AFFINITY_SUPPORTED */
5835 
5836  __kmp_reap_team( thread->th.th_serial_team );
5837  thread->th.th_serial_team = NULL;
5838  __kmp_free( thread );
5839 
5840  KMP_MB();
5841 
5842 } // __kmp_reap_thread
5843 
5844 static void
5845 __kmp_internal_end(void)
5846 {
5847  int i;
5848 
5849  /* First, unregister the library */
5850  __kmp_unregister_library();
5851 
5852  #if KMP_OS_WINDOWS
5853  /* In Win static library, we can't tell when a root actually dies, so we
5854  reclaim the data structures for any root threads that have died but not
5855  unregistered themselves, in order to shut down cleanly.
5856  In Win dynamic library we also can't tell when a thread dies.
5857  */
5858  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5859  #endif
5860 
5861  for( i=0 ; i<__kmp_threads_capacity ; i++ )
5862  if( __kmp_root[i] )
5863  if( __kmp_root[i]->r.r_active )
5864  break;
5865  KMP_MB(); /* Flush all pending memory write invalidates. */
5866  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5867 
5868  if ( i < __kmp_threads_capacity ) {
5869  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5870  KMP_MB(); /* Flush all pending memory write invalidates. */
5871 
5872  //
5873  // Need to check that monitor was initialized before reaping it.
5874  // If we are called form __kmp_atfork_child (which sets
5875  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5876  // contain valid data, but it is only valid in the parent process,
5877  // not the child.
5878  //
5879  // One of the possible fixes for CQ138434 / CQ140126
5880  // (used in 20091103_dreamworks patch)
5881  //
5882  // New behavior (201008): instead of keying off of the flag
5883  // __kmp_init_parallel, the monitor thread creation is keyed off
5884  // of the new flag __kmp_init_monitor.
5885  //
5886  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5887  if ( TCR_4( __kmp_init_monitor ) ) {
5888  __kmp_reap_monitor( & __kmp_monitor );
5889  TCW_4( __kmp_init_monitor, 0 );
5890  }
5891  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5892  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5893  } else {
5894  /* TODO move this to cleanup code */
5895  #ifdef KMP_DEBUG
5896  /* make sure that everything has properly ended */
5897  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5898  if( __kmp_root[i] ) {
5899 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here
5900  KMP_ASSERT( ! __kmp_root[i]->r.r_active ); // TODO: can they be active?
5901  }
5902  }
5903  #endif
5904 
5905  KMP_MB();
5906 
5907  // Reap the worker threads.
5908  // This is valid for now, but be careful if threads are reaped sooner.
5909  while ( __kmp_thread_pool != NULL ) { // Loop thru all the thread in the pool.
5910  // Get the next thread from the pool.
5911  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5912  __kmp_thread_pool = thread->th.th_next_pool;
5913  // Reap it.
5914  thread->th.th_next_pool = NULL;
5915  thread->th.th_in_pool = FALSE;
5916  __kmp_reap_thread( thread, 0 );
5917  }; // while
5918  __kmp_thread_pool_insert_pt = NULL;
5919 
5920  // Reap teams.
5921  while ( __kmp_team_pool != NULL ) { // Loop thru all the teams in the pool.
5922  // Get the next team from the pool.
5923  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5924  __kmp_team_pool = team->t.t_next_pool;
5925  // Reap it.
5926  team->t.t_next_pool = NULL;
5927  __kmp_reap_team( team );
5928  }; // while
5929 
5930  __kmp_reap_task_teams( );
5931 
5932  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5933  // TBD: Add some checking...
5934  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5935  }
5936 
5937  /* Make sure all threadprivate destructors get run by joining with all worker
5938  threads before resetting this flag */
5939  TCW_SYNC_4(__kmp_init_common, FALSE);
5940 
5941  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5942  KMP_MB();
5943 
5944  //
5945  // See note above: One of the possible fixes for CQ138434 / CQ140126
5946  //
5947  // FIXME: push both code fragments down and CSE them?
5948  // push them into __kmp_cleanup() ?
5949  //
5950  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5951  if ( TCR_4( __kmp_init_monitor ) ) {
5952  __kmp_reap_monitor( & __kmp_monitor );
5953  TCW_4( __kmp_init_monitor, 0 );
5954  }
5955  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5956  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5957 
5958  } /* else !__kmp_global.t_active */
5959  TCW_4(__kmp_init_gtid, FALSE);
5960  KMP_MB(); /* Flush all pending memory write invalidates. */
5961 
5962 
5963  __kmp_cleanup();
5964 #if OMPT_SUPPORT
5965  ompt_fini();
5966 #endif
5967 }
5968 
5969 void
5970 __kmp_internal_end_library( int gtid_req )
5971 {
5972  int i;
5973 
5974  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5975  /* this shouldn't be a race condition because __kmp_internal_end() is the
5976  * only place to clear __kmp_serial_init */
5977  /* we'll check this later too, after we get the lock */
5978  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundaant,
5979  // because the next check will work in any case.
5980  if( __kmp_global.g.g_abort ) {
5981  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5982  /* TODO abort? */
5983  return;
5984  }
5985  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5986  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5987  return;
5988  }
5989 
5990 
5991  KMP_MB(); /* Flush all pending memory write invalidates. */
5992 
5993  /* find out who we are and what we should do */
5994  {
5995  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5996  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
5997  if( gtid == KMP_GTID_SHUTDOWN ) {
5998  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5999  return;
6000  } else if( gtid == KMP_GTID_MONITOR ) {
6001  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
6002  return;
6003  } else if( gtid == KMP_GTID_DNE ) {
6004  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
6005  /* we don't know who we are, but we may still shutdown the library */
6006  } else if( KMP_UBER_GTID( gtid )) {
6007  /* unregister ourselves as an uber thread. gtid is no longer valid */
6008  if( __kmp_root[gtid]->r.r_active ) {
6009  __kmp_global.g.g_abort = -1;
6010  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6011  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
6012  return;
6013  } else {
6014  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
6015  __kmp_unregister_root_current_thread( gtid );
6016  }
6017  } else {
6018  /* worker threads may call this function through the atexit handler, if they call exit() */
6019  /* For now, skip the usual subsequent processing and just dump the debug buffer.
6020  TODO: do a thorough shutdown instead
6021  */
6022  #ifdef DUMP_DEBUG_ON_EXIT
6023  if ( __kmp_debug_buf )
6024  __kmp_dump_debug_buffer( );
6025  #endif
6026  return;
6027  }
6028  }
6029  /* synchronize the termination process */
6030  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6031 
6032  /* have we already finished */
6033  if( __kmp_global.g.g_abort ) {
6034  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
6035  /* TODO abort? */
6036  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6037  return;
6038  }
6039  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6040  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6041  return;
6042  }
6043 
6044  /* We need this lock to enforce mutex between this reading of
6045  __kmp_threads_capacity and the writing by __kmp_register_root.
6046  Alternatively, we can use a counter of roots that is
6047  atomically updated by __kmp_get_global_thread_id_reg,
6048  __kmp_do_serial_initialize and __kmp_internal_end_*.
6049  */
6050  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6051 
6052  /* now we can safely conduct the actual termination */
6053  __kmp_internal_end();
6054 
6055  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6056  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6057 
6058  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
6059 
6060  #ifdef DUMP_DEBUG_ON_EXIT
6061  if ( __kmp_debug_buf )
6062  __kmp_dump_debug_buffer();
6063  #endif
6064 
6065  #if KMP_OS_WINDOWS
6066  __kmp_close_console();
6067  #endif
6068 
6069  __kmp_fini_allocator();
6070 
6071 } // __kmp_internal_end_library
6072 
6073 void
6074 __kmp_internal_end_thread( int gtid_req )
6075 {
6076  int i;
6077 
6078  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6079  /* this shouldn't be a race condition because __kmp_internal_end() is the
6080  * only place to clear __kmp_serial_init */
6081  /* we'll check this later too, after we get the lock */
6082  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
6083  // because the next check will work in any case.
6084  if( __kmp_global.g.g_abort ) {
6085  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
6086  /* TODO abort? */
6087  return;
6088  }
6089  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6090  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
6091  return;
6092  }
6093 
6094  KMP_MB(); /* Flush all pending memory write invalidates. */
6095 
6096  /* find out who we are and what we should do */
6097  {
6098  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6099  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
6100  if( gtid == KMP_GTID_SHUTDOWN ) {
6101  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
6102  return;
6103  } else if( gtid == KMP_GTID_MONITOR ) {
6104  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
6105  return;
6106  } else if( gtid == KMP_GTID_DNE ) {
6107  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
6108  return;
6109  /* we don't know who we are */
6110  } else if( KMP_UBER_GTID( gtid )) {
6111  /* unregister ourselves as an uber thread. gtid is no longer valid */
6112  if( __kmp_root[gtid]->r.r_active ) {
6113  __kmp_global.g.g_abort = -1;
6114  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6115  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6116  return;
6117  } else {
6118  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6119  __kmp_unregister_root_current_thread( gtid );
6120  }
6121  } else {
6122  /* just a worker thread, let's leave */
6123  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6124 
6125  if ( gtid >= 0 ) {
6126  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6127  if (TCR_PTR(this_thr->th.th_task_team) != NULL) {
6128  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
6129  }
6130  }
6131 
6132  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6133  return;
6134  }
6135  }
6136  #if defined KMP_DYNAMIC_LIB
6137  // AC: lets not shutdown the Linux* OS dynamic library at the exit of uber thread,
6138  // because we will better shutdown later in the library destructor.
6139  // The reason of this change is performance problem when non-openmp thread
6140  // in a loop forks and joins many openmp threads. We can save a lot of time
6141  // keeping worker threads alive until the program shutdown.
6142  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6143  // Windows(DPD200287443) that occurs when using critical sections from foreign threads.
6144  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6145  return;
6146  #endif
6147  /* synchronize the termination process */
6148  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6149 
6150  /* have we already finished */
6151  if( __kmp_global.g.g_abort ) {
6152  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6153  /* TODO abort? */
6154  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6155  return;
6156  }
6157  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6158  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6159  return;
6160  }
6161 
6162  /* We need this lock to enforce mutex between this reading of
6163  __kmp_threads_capacity and the writing by __kmp_register_root.
6164  Alternatively, we can use a counter of roots that is
6165  atomically updated by __kmp_get_global_thread_id_reg,
6166  __kmp_do_serial_initialize and __kmp_internal_end_*.
6167  */
6168 
6169  /* should we finish the run-time? are all siblings done? */
6170  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6171 
6172  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6173  if ( KMP_UBER_GTID( i ) ) {
6174  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6175  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6176  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6177  return;
6178  };
6179  }
6180 
6181  /* now we can safely conduct the actual termination */
6182 
6183  __kmp_internal_end();
6184 
6185  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6186  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6187 
6188  KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6189 
6190  #ifdef DUMP_DEBUG_ON_EXIT
6191  if ( __kmp_debug_buf )
6192  __kmp_dump_debug_buffer();
6193  #endif
6194 } // __kmp_internal_end_thread
6195 
6196 // -------------------------------------------------------------------------------------------------
6197 // Library registration stuff.
6198 
6199 static long __kmp_registration_flag = 0;
6200  // Random value used to indicate library initialization.
6201 static char * __kmp_registration_str = NULL;
6202  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6203 
6204 
6205 static inline
6206 char *
6207 __kmp_reg_status_name() {
6208  /*
6209  On RHEL 3u5 if linked statically, getpid() returns different values in each thread.
6210  If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp test case),
6211  the name of registered_lib_env env var can not be found, because the name will contain different pid.
6212  */
6213  return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
6214 } // __kmp_reg_status_get
6215 
6216 
6217 void
6218 __kmp_register_library_startup(
6219  void
6220 ) {
6221 
6222  char * name = __kmp_reg_status_name(); // Name of the environment variable.
6223  int done = 0;
6224  union {
6225  double dtime;
6226  long ltime;
6227  } time;
6228  #if KMP_OS_WINDOWS
6229  __kmp_initialize_system_tick();
6230  #endif
6231  __kmp_read_system_time( & time.dtime );
6232  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6233  __kmp_registration_str =
6234  __kmp_str_format(
6235  "%p-%lx-%s",
6236  & __kmp_registration_flag,
6237  __kmp_registration_flag,
6238  KMP_LIBRARY_FILE
6239  );
6240 
6241  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6242 
6243  while ( ! done ) {
6244 
6245  char * value = NULL; // Actual value of the environment variable.
6246 
6247  // Set environment variable, but do not overwrite if it is exist.
6248  __kmp_env_set( name, __kmp_registration_str, 0 );
6249  // Check the variable is written.
6250  value = __kmp_env_get( name );
6251  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6252 
6253  done = 1; // Ok, environment variable set successfully, exit the loop.
6254 
6255  } else {
6256 
6257  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6258  // Check whether it alive or dead.
6259  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6260  char * tail = value;
6261  char * flag_addr_str = NULL;
6262  char * flag_val_str = NULL;
6263  char const * file_name = NULL;
6264  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6265  __kmp_str_split( tail, '-', & flag_val_str, & tail );
6266  file_name = tail;
6267  if ( tail != NULL ) {
6268  long * flag_addr = 0;
6269  long flag_val = 0;
6270  KMP_SSCANF( flag_addr_str, "%p", & flag_addr );
6271  KMP_SSCANF( flag_val_str, "%lx", & flag_val );
6272  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
6273  // First, check whether environment-encoded address is mapped into addr space.
6274  // If so, dereference it to see if it still has the right value.
6275 
6276  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6277  neighbor = 1;
6278  } else {
6279  // If not, then we know the other copy of the library is no longer running.
6280  neighbor = 2;
6281  }; // if
6282  }; // if
6283  }; // if
6284  switch ( neighbor ) {
6285  case 0 : // Cannot parse environment variable -- neighbor status unknown.
6286  // Assume it is the incompatible format of future version of the library.
6287  // Assume the other library is alive.
6288  // WARN( ... ); // TODO: Issue a warning.
6289  file_name = "unknown library";
6290  // Attention! Falling to the next case. That's intentional.
6291  case 1 : { // Neighbor is alive.
6292  // Check it is allowed.
6293  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6294  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6295  // That's not allowed. Issue fatal error.
6296  __kmp_msg(
6297  kmp_ms_fatal,
6298  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6299  KMP_HNT( DuplicateLibrary ),
6300  __kmp_msg_null
6301  );
6302  }; // if
6303  KMP_INTERNAL_FREE( duplicate_ok );
6304  __kmp_duplicate_library_ok = 1;
6305  done = 1; // Exit the loop.
6306  } break;
6307  case 2 : { // Neighbor is dead.
6308  // Clear the variable and try to register library again.
6309  __kmp_env_unset( name );
6310  } break;
6311  default : {
6312  KMP_DEBUG_ASSERT( 0 );
6313  } break;
6314  }; // switch
6315 
6316  }; // if
6317  KMP_INTERNAL_FREE( (void *) value );
6318 
6319  }; // while
6320  KMP_INTERNAL_FREE( (void *) name );
6321 
6322 } // func __kmp_register_library_startup
6323 
6324 
6325 void
6326 __kmp_unregister_library( void ) {
6327 
6328  char * name = __kmp_reg_status_name();
6329  char * value = __kmp_env_get( name );
6330 
6331  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6332  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
6333  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6334  // Ok, this is our variable. Delete it.
6335  __kmp_env_unset( name );
6336  }; // if
6337 
6338  KMP_INTERNAL_FREE( __kmp_registration_str );
6339  KMP_INTERNAL_FREE( value );
6340  KMP_INTERNAL_FREE( name );
6341 
6342  __kmp_registration_flag = 0;
6343  __kmp_registration_str = NULL;
6344 
6345 } // __kmp_unregister_library
6346 
6347 
6348 // End of Library registration stuff.
6349 // -------------------------------------------------------------------------------------------------
6350 
6351 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6352 
6353 static void __kmp_check_mic_type()
6354 {
6355  kmp_cpuid_t cpuid_state = {0};
6356  kmp_cpuid_t * cs_p = &cpuid_state;
6357  __kmp_x86_cpuid(1, 0, cs_p);
6358  // We don't support mic1 at the moment
6359  if( (cs_p->eax & 0xff0) == 0xB10 ) {
6360  __kmp_mic_type = mic2;
6361  } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6362  __kmp_mic_type = mic3;
6363  } else {
6364  __kmp_mic_type = non_mic;
6365  }
6366 }
6367 
6368 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6369 
6370 
6371 
6372 static void
6373 __kmp_do_serial_initialize( void )
6374 {
6375  int i, gtid;
6376  int size;
6377 
6378  KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6379 
6380  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6381  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6382  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6383  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6384  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6385 
6386  __kmp_validate_locks();
6387 
6388  /* Initialize internal memory allocator */
6389  __kmp_init_allocator();
6390 
6391  /* Register the library startup via an environment variable
6392  and check to see whether another copy of the library is already
6393  registered. */
6394 
6395  __kmp_register_library_startup( );
6396 
6397  /* TODO reinitialization of library */
6398  if( TCR_4(__kmp_global.g.g_done) ) {
6399  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6400  }
6401 
6402  __kmp_global.g.g_abort = 0;
6403  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6404 
6405  /* initialize the locks */
6406 #if KMP_USE_ADAPTIVE_LOCKS
6407 #if KMP_DEBUG_ADAPTIVE_LOCKS
6408  __kmp_init_speculative_stats();
6409 #endif
6410 #endif
6411  __kmp_init_lock( & __kmp_global_lock );
6412  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6413  __kmp_init_lock( & __kmp_debug_lock );
6414  __kmp_init_atomic_lock( & __kmp_atomic_lock );
6415  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
6416  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
6417  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
6418  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
6419  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
6420  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
6421  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
6422  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6423  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6424  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6425  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6426  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6427  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
6428  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
6429  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
6430  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6431 
6432  /* conduct initialization and initial setup of configuration */
6433 
6434  __kmp_runtime_initialize();
6435 
6436 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6437  __kmp_check_mic_type();
6438 #endif
6439 
6440  // Some global variable initialization moved here from kmp_env_initialize()
6441 #ifdef KMP_DEBUG
6442  kmp_diag = 0;
6443 #endif
6444  __kmp_abort_delay = 0;
6445 
6446  // From __kmp_init_dflt_team_nth()
6447  /* assume the entire machine will be used */
6448  __kmp_dflt_team_nth_ub = __kmp_xproc;
6449  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6450  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6451  }
6452  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6453  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6454  }
6455  __kmp_max_nth = __kmp_sys_max_nth;
6456 
6457  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6458  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6459  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6460  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6461  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6462  __kmp_library = library_throughput;
6463  // From KMP_SCHEDULE initialization
6464  __kmp_static = kmp_sch_static_balanced;
6465  // AC: do not use analytical here, because it is non-monotonous
6466  //__kmp_guided = kmp_sch_guided_iterative_chunked;
6467  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeate assignment
6468  // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch bit control and barrier method
6469  // control parts
6470  #if KMP_FAST_REDUCTION_BARRIER
6471  #define kmp_reduction_barrier_gather_bb ((int)1)
6472  #define kmp_reduction_barrier_release_bb ((int)1)
6473  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6474  #define kmp_reduction_barrier_release_pat bp_hyper_bar
6475  #endif // KMP_FAST_REDUCTION_BARRIER
6476  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6477  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6478  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6479  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6480  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6481  #if KMP_FAST_REDUCTION_BARRIER
6482  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6483  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6484  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6485  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6486  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6487  }
6488  #endif // KMP_FAST_REDUCTION_BARRIER
6489  }
6490  #if KMP_FAST_REDUCTION_BARRIER
6491  #undef kmp_reduction_barrier_release_pat
6492  #undef kmp_reduction_barrier_gather_pat
6493  #undef kmp_reduction_barrier_release_bb
6494  #undef kmp_reduction_barrier_gather_bb
6495  #endif // KMP_FAST_REDUCTION_BARRIER
6496 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6497  if( __kmp_mic_type != non_mic ) {
6498  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6499  __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3; // plain gather
6500  __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1; // forkjoin release
6501  __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6502  __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6503  }
6504 #if KMP_FAST_REDUCTION_BARRIER
6505  if( __kmp_mic_type != non_mic ) {
6506  __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6507  __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6508  }
6509 #endif
6510 #endif
6511 
6512  // From KMP_CHECKS initialization
6513 #ifdef KMP_DEBUG
6514  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6515 #else
6516  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6517 #endif
6518 
6519  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6520  __kmp_foreign_tp = TRUE;
6521 
6522  __kmp_global.g.g_dynamic = FALSE;
6523  __kmp_global.g.g_dynamic_mode = dynamic_default;
6524 
6525  __kmp_env_initialize( NULL );
6526 
6527 
6528  // Print all messages in message catalog for testing purposes.
6529  #ifdef KMP_DEBUG
6530  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6531  if ( __kmp_str_match_true( val ) ) {
6532  kmp_str_buf_t buffer;
6533  __kmp_str_buf_init( & buffer );
6534  __kmp_i18n_dump_catalog( & buffer );
6535  __kmp_printf( "%s", buffer.str );
6536  __kmp_str_buf_free( & buffer );
6537  }; // if
6538  __kmp_env_free( & val );
6539  #endif
6540 
6541  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6542  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6543  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6544 
6545 
6546  // If the library is shut down properly, both pools must be NULL. Just in case, set them
6547  // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6548  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6549  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6550  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
6551  __kmp_thread_pool = NULL;
6552  __kmp_thread_pool_insert_pt = NULL;
6553  __kmp_team_pool = NULL;
6554 
6555  /* Allocate all of the variable sized records */
6556  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6557  /* Since allocation is cache-aligned, just add extra padding at the end */
6558  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6559  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6560  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
6561 
6562  /* init thread counts */
6563  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
6564  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something was wrong in termination.
6565  __kmp_all_nth = 0;
6566  __kmp_nth = 0;
6567 
6568  /* setup the uber master thread and hierarchy */
6569  gtid = __kmp_register_root( TRUE );
6570  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
6571  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6572  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6573 
6574  KMP_MB(); /* Flush all pending memory write invalidates. */
6575 
6576  __kmp_common_initialize();
6577 
6578  #if KMP_OS_UNIX
6579  /* invoke the child fork handler */
6580  __kmp_register_atfork();
6581  #endif
6582 
6583  #if ! defined KMP_DYNAMIC_LIB
6584  {
6585  /* Invoke the exit handler when the program finishes, only for static library.
6586  For dynamic library, we already have _fini and DllMain.
6587  */
6588  int rc = atexit( __kmp_internal_end_atexit );
6589  if ( rc != 0 ) {
6590  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6591  }; // if
6592  }
6593  #endif
6594 
6595  #if KMP_HANDLE_SIGNALS
6596  #if KMP_OS_UNIX
6597  /* NOTE: make sure that this is called before the user installs
6598  * their own signal handlers so that the user handlers
6599  * are called first. this way they can return false,
6600  * not call our handler, avoid terminating the library,
6601  * and continue execution where they left off. */
6602  __kmp_install_signals( FALSE );
6603  #endif /* KMP_OS_UNIX */
6604  #if KMP_OS_WINDOWS
6605  __kmp_install_signals( TRUE );
6606  #endif /* KMP_OS_WINDOWS */
6607  #endif
6608 
6609  /* we have finished the serial initialization */
6610  __kmp_init_counter ++;
6611 
6612  __kmp_init_serial = TRUE;
6613 
6614  if (__kmp_settings) {
6615  __kmp_env_print();
6616  }
6617 
6618 #if OMP_40_ENABLED
6619  if (__kmp_display_env || __kmp_display_env_verbose) {
6620  __kmp_env_print_2();
6621  }
6622 #endif // OMP_40_ENABLED
6623 
6624  KMP_MB();
6625 
6626  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6627 #if OMPT_SUPPORT
6628  ompt_init();
6629 #endif
6630 }
6631 
6632 void
6633 __kmp_serial_initialize( void )
6634 {
6635  if ( __kmp_init_serial ) {
6636  return;
6637  }
6638  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6639  if ( __kmp_init_serial ) {
6640  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6641  return;
6642  }
6643  __kmp_do_serial_initialize();
6644  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6645 }
6646 
6647 static void
6648 __kmp_do_middle_initialize( void )
6649 {
6650  int i, j;
6651  int prev_dflt_team_nth;
6652 
6653  if( !__kmp_init_serial ) {
6654  __kmp_do_serial_initialize();
6655  }
6656 
6657  KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6658 
6659  //
6660  // Save the previous value for the __kmp_dflt_team_nth so that
6661  // we can avoid some reinitialization if it hasn't changed.
6662  //
6663  prev_dflt_team_nth = __kmp_dflt_team_nth;
6664 
6665 #if KMP_AFFINITY_SUPPORTED
6666  //
6667  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6668  // number of cores on the machine.
6669  //
6670  __kmp_affinity_initialize();
6671 
6672  //
6673  // Run through the __kmp_threads array and set the affinity mask
6674  // for each root thread that is currently registered with the RTL.
6675  //
6676  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6677  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6678  __kmp_affinity_set_init_mask( i, TRUE );
6679  }
6680  }
6681 #endif /* KMP_AFFINITY_SUPPORTED */
6682 
6683  KMP_ASSERT( __kmp_xproc > 0 );
6684  if ( __kmp_avail_proc == 0 ) {
6685  __kmp_avail_proc = __kmp_xproc;
6686  }
6687 
6688  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6689  j = 0;
6690  while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6691  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6692  j++;
6693  }
6694 
6695  if ( __kmp_dflt_team_nth == 0 ) {
6696 #ifdef KMP_DFLT_NTH_CORES
6697  //
6698  // Default #threads = #cores
6699  //
6700  __kmp_dflt_team_nth = __kmp_ncores;
6701  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6702  __kmp_dflt_team_nth ) );
6703 #else
6704  //
6705  // Default #threads = #available OS procs
6706  //
6707  __kmp_dflt_team_nth = __kmp_avail_proc;
6708  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6709  __kmp_dflt_team_nth ) );
6710 #endif /* KMP_DFLT_NTH_CORES */
6711  }
6712 
6713  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6714  __kmp_dflt_team_nth = KMP_MIN_NTH;
6715  }
6716  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6717  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6718  }
6719 
6720  //
6721  // There's no harm in continuing if the following check fails,
6722  // but it indicates an error in the previous logic.
6723  //
6724  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6725 
6726  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6727  //
6728  // Run through the __kmp_threads array and set the num threads icv
6729  // for each root thread that is currently registered with the RTL
6730  // (which has not already explicitly set its nthreads-var with a
6731  // call to omp_set_num_threads()).
6732  //
6733  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6734  kmp_info_t *thread = __kmp_threads[ i ];
6735  if ( thread == NULL ) continue;
6736  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6737 
6738  set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6739  }
6740  }
6741  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6742  __kmp_dflt_team_nth) );
6743 
6744 #ifdef KMP_ADJUST_BLOCKTIME
6745  /* Adjust blocktime to zero if necessary */
6746  /* now that __kmp_avail_proc is set */
6747  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6748  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6749  if ( __kmp_nth > __kmp_avail_proc ) {
6750  __kmp_zero_bt = TRUE;
6751  }
6752  }
6753 #endif /* KMP_ADJUST_BLOCKTIME */
6754 
6755  /* we have finished middle initialization */
6756  TCW_SYNC_4(__kmp_init_middle, TRUE);
6757 
6758  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6759 }
6760 
6761 void
6762 __kmp_middle_initialize( void )
6763 {
6764  if ( __kmp_init_middle ) {
6765  return;
6766  }
6767  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6768  if ( __kmp_init_middle ) {
6769  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6770  return;
6771  }
6772  __kmp_do_middle_initialize();
6773  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6774 #if OMPT_SUPPORT
6775  ompt_init();
6776 #endif
6777 }
6778 
6779 void
6780 __kmp_parallel_initialize( void )
6781 {
6782  int gtid = __kmp_entry_gtid(); // this might be a new root
6783 
6784  /* syncronize parallel initialization (for sibling) */
6785  if( TCR_4(__kmp_init_parallel) ) return;
6786  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6787  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6788 
6789  /* TODO reinitialization after we have already shut down */
6790  if( TCR_4(__kmp_global.g.g_done) ) {
6791  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6792  __kmp_infinite_loop();
6793  }
6794 
6795  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
6796  would cause a deadlock. So we call __kmp_do_serial_initialize directly.
6797  */
6798  if( !__kmp_init_middle ) {
6799  __kmp_do_middle_initialize();
6800  }
6801 
6802  /* begin initialization */
6803  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6804  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6805 
6806 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6807  //
6808  // Save the FP control regs.
6809  // Worker threads will set theirs to these values at thread startup.
6810  //
6811  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6812  __kmp_store_mxcsr( &__kmp_init_mxcsr );
6813  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6814 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6815 
6816 #if KMP_OS_UNIX
6817 # if KMP_HANDLE_SIGNALS
6818  /* must be after __kmp_serial_initialize */
6819  __kmp_install_signals( TRUE );
6820 # endif
6821 #endif
6822 
6823  __kmp_suspend_initialize();
6824 
6825 # if defined(USE_LOAD_BALANCE)
6826  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6827  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6828  }
6829 #else
6830  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6831  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6832  }
6833 #endif
6834 
6835  if ( __kmp_version ) {
6836  __kmp_print_version_2();
6837  }
6838 
6839  /* we have finished parallel initialization */
6840  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6841 
6842  KMP_MB();
6843  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6844 
6845  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6846 #if OMPT_SUPPORT
6847  ompt_init();
6848 #endif
6849 }
6850 
6851 
6852 /* ------------------------------------------------------------------------ */
6853 
6854 void
6855 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6856  kmp_team_t *team )
6857 {
6858  kmp_disp_t *dispatch;
6859 
6860  KMP_MB();
6861 
6862  /* none of the threads have encountered any constructs, yet. */
6863  this_thr->th.th_local.this_construct = 0;
6864 #if KMP_CACHE_MANAGE
6865  KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6866 #endif /* KMP_CACHE_MANAGE */
6867  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6868  KMP_DEBUG_ASSERT( dispatch );
6869  KMP_DEBUG_ASSERT( team->t.t_dispatch );
6870  //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6871 
6872  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6873 
6874  if( __kmp_env_consistency_check )
6875  __kmp_push_parallel( gtid, team->t.t_ident );
6876 
6877  KMP_MB(); /* Flush all pending memory write invalidates. */
6878 }
6879 
6880 void
6881 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6882  kmp_team_t *team )
6883 {
6884  if( __kmp_env_consistency_check )
6885  __kmp_pop_parallel( gtid, team->t.t_ident );
6886 }
6887 
6888 int
6889 __kmp_invoke_task_func( int gtid )
6890 {
6891  int rc;
6892  int tid = __kmp_tid_from_gtid( gtid );
6893  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6894  kmp_team_t *team = this_thr->th.th_team;
6895 
6896  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6897 #if USE_ITT_BUILD
6898  if ( __itt_stack_caller_create_ptr ) {
6899  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6900  }
6901 #endif /* USE_ITT_BUILD */
6902 #if INCLUDE_SSC_MARKS
6903  SSC_MARK_INVOKING();
6904 #endif
6905 
6906 #if OMPT_SUPPORT
6907  void *dummy;
6908  void **exit_runtime_p;
6909  ompt_task_id_t my_task_id;
6910  ompt_parallel_id_t my_parallel_id;
6911 
6912  if (ompt_status & ompt_status_track) {
6913  exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6914  ompt_task_info.frame.exit_runtime_frame);
6915  } else {
6916  exit_runtime_p = &dummy;
6917  }
6918 
6919 #if OMPT_TRACE
6920  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6921  my_parallel_id = team->t.ompt_team_info.parallel_id;
6922  if ((ompt_status == ompt_status_track_callback) &&
6923  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6924  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6925  my_parallel_id, my_task_id);
6926  }
6927 #endif
6928 #endif
6929 
6930  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6931  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6932 #if OMPT_SUPPORT
6933  , exit_runtime_p
6934 #endif
6935  );
6936 
6937 #if OMPT_SUPPORT && OMPT_TRACE
6938  if (ompt_status & ompt_status_track) {
6939  if ((ompt_status == ompt_status_track_callback) &&
6940  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
6941  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
6942  my_parallel_id, my_task_id);
6943  }
6944  // the implicit task is not dead yet, so we can't clear its task id here
6945  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
6946  }
6947 #endif
6948 
6949 #if USE_ITT_BUILD
6950  if ( __itt_stack_caller_create_ptr ) {
6951  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6952  }
6953 #endif /* USE_ITT_BUILD */
6954  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6955 
6956  return rc;
6957 }
6958 
6959 #if OMP_40_ENABLED
6960 void
6961 __kmp_teams_master( int gtid )
6962 {
6963  // This routine is called by all master threads in teams construct
6964  kmp_info_t *thr = __kmp_threads[ gtid ];
6965  kmp_team_t *team = thr->th.th_team;
6966  ident_t *loc = team->t.t_ident;
6967  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6968  KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6969  KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6970  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6971  gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6972  // Launch league of teams now, but not let workers execute
6973  // (they hang on fork barrier until next parallel)
6974 #if INCLUDE_SSC_MARKS
6975  SSC_MARK_FORKING();
6976 #endif
6977  __kmp_fork_call( loc, gtid, fork_context_intel,
6978  team->t.t_argc,
6979 #if OMPT_SUPPORT
6980  (void *)thr->th.th_teams_microtask, // "unwrapped" task
6981 #endif
6982  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6983  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6984  NULL );
6985 #if INCLUDE_SSC_MARKS
6986  SSC_MARK_JOINING();
6987 #endif
6988  __kmp_join_call( loc, gtid, 1 ); // AC: last parameter "1" eliminates join barrier which won't work because
6989  // worker threads are in a fork barrier waiting for more parallel regions
6990 }
6991 
6992 int
6993 __kmp_invoke_teams_master( int gtid )
6994 {
6995  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6996  kmp_team_t *team = this_thr->th.th_team;
6997  #if KMP_DEBUG
6998  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6999  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
7000  #endif
7001  __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
7002  __kmp_teams_master( gtid );
7003  __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
7004  return 1;
7005 }
7006 #endif /* OMP_40_ENABLED */
7007 
7008 /* this sets the requested number of threads for the next parallel region
7009  * encountered by this team */
7010 /* since this should be enclosed in the forkjoin critical section it
7011  * should avoid race conditions with assymmetrical nested parallelism */
7012 
7013 void
7014 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
7015 {
7016  kmp_info_t *thr = __kmp_threads[gtid];
7017 
7018  if( num_threads > 0 )
7019  thr->th.th_set_nproc = num_threads;
7020 }
7021 
7022 #if OMP_40_ENABLED
7023 
7024 /* this sets the requested number of teams for the teams region and/or
7025  * the number of threads for the next parallel region encountered */
7026 void
7027 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
7028 {
7029  kmp_info_t *thr = __kmp_threads[gtid];
7030  KMP_DEBUG_ASSERT(num_teams >= 0);
7031  KMP_DEBUG_ASSERT(num_threads >= 0);
7032  if( num_teams == 0 ) {
7033  num_teams = 1; // default number of teams is 1.
7034  }
7035  // Set number of teams (number of threads in the outer "parallel" of the teams)
7036  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7037 
7038  // Remember the number of threads for inner parallel regions
7039  if( num_threads > 0 ) {
7040  thr->th.th_teams_size.nth = num_threads;
7041  } else {
7042  if( !TCR_4(__kmp_init_middle) )
7043  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7044  thr->th.th_teams_size.nth = __kmp_avail_proc / num_teams;
7045  }
7046 }
7047 
7048 
7049 //
7050 // Set the proc_bind var to use in the following parallel region.
7051 //
7052 void
7053 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
7054 {
7055  kmp_info_t *thr = __kmp_threads[gtid];
7056  thr->th.th_set_proc_bind = proc_bind;
7057 }
7058 
7059 #endif /* OMP_40_ENABLED */
7060 
7061 /* Launch the worker threads into the microtask. */
7062 
7063 void
7064 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
7065 {
7066  kmp_info_t *this_thr = __kmp_threads[gtid];
7067 
7068 #ifdef KMP_DEBUG
7069  int f;
7070 #endif /* KMP_DEBUG */
7071 
7072  KMP_DEBUG_ASSERT( team );
7073  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7074  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7075  KMP_MB(); /* Flush all pending memory write invalidates. */
7076 
7077  team->t.t_construct = 0; /* no single directives seen yet */
7078  team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7079 
7080  /* Reset the identifiers on the dispatch buffer */
7081  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
7082  if ( team->t.t_max_nproc > 1 ) {
7083  int i;
7084  for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
7085  team->t.t_disp_buffer[ i ].buffer_index = i;
7086  } else {
7087  team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7088  }
7089 
7090  KMP_MB(); /* Flush all pending memory write invalidates. */
7091  KMP_ASSERT( this_thr->th.th_team == team );
7092 
7093 #ifdef KMP_DEBUG
7094  for( f=0 ; f<team->t.t_nproc ; f++ ) {
7095  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7096  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7097  }
7098 #endif /* KMP_DEBUG */
7099 
7100  /* release the worker threads so they may begin working */
7101  __kmp_fork_barrier( gtid, 0 );
7102 }
7103 
7104 
7105 void
7106 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7107 {
7108  kmp_info_t *this_thr = __kmp_threads[gtid];
7109 
7110  KMP_DEBUG_ASSERT( team );
7111  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7112  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7113  KMP_MB(); /* Flush all pending memory write invalidates. */
7114 
7115  /* Join barrier after fork */
7116 
7117 #ifdef KMP_DEBUG
7118  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7119  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7120  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7121  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7122  __kmp_print_structure();
7123  }
7124  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7125  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7126 #endif /* KMP_DEBUG */
7127 
7128  __kmp_join_barrier( gtid ); /* wait for everyone */
7129 
7130  KMP_MB(); /* Flush all pending memory write invalidates. */
7131  KMP_ASSERT( this_thr->th.th_team == team );
7132 }
7133 
7134 
7135 /* ------------------------------------------------------------------------ */
7136 /* ------------------------------------------------------------------------ */
7137 
7138 #ifdef USE_LOAD_BALANCE
7139 
7140 //
7141 // Return the worker threads actively spinning in the hot team, if we
7142 // are at the outermost level of parallelism. Otherwise, return 0.
7143 //
7144 static int
7145 __kmp_active_hot_team_nproc( kmp_root_t *root )
7146 {
7147  int i;
7148  int retval;
7149  kmp_team_t *hot_team;
7150 
7151  if ( root->r.r_active ) {
7152  return 0;
7153  }
7154  hot_team = root->r.r_hot_team;
7155  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7156  return hot_team->t.t_nproc - 1; // Don't count master thread
7157  }
7158 
7159  //
7160  // Skip the master thread - it is accounted for elsewhere.
7161  //
7162  retval = 0;
7163  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7164  if ( hot_team->t.t_threads[i]->th.th_active ) {
7165  retval++;
7166  }
7167  }
7168  return retval;
7169 }
7170 
7171 //
7172 // Perform an automatic adjustment to the number of
7173 // threads used by the next parallel region.
7174 //
7175 static int
7176 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7177 {
7178  int retval;
7179  int pool_active;
7180  int hot_team_active;
7181  int team_curr_active;
7182  int system_active;
7183 
7184  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7185  root, set_nproc ) );
7186  KMP_DEBUG_ASSERT( root );
7187  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7188  KMP_DEBUG_ASSERT( set_nproc > 1 );
7189 
7190  if ( set_nproc == 1) {
7191  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7192  return 1;
7193  }
7194 
7195  //
7196  // Threads that are active in the thread pool, active in the hot team
7197  // for this particular root (if we are at the outer par level), and
7198  // the currently executing thread (to become the master) are available
7199  // to add to the new team, but are currently contributing to the system
7200  // load, and must be accounted for.
7201  //
7202  pool_active = TCR_4(__kmp_thread_pool_active_nth);
7203  hot_team_active = __kmp_active_hot_team_nproc( root );
7204  team_curr_active = pool_active + hot_team_active + 1;
7205 
7206  //
7207  // Check the system load.
7208  //
7209  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7210  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7211  system_active, pool_active, hot_team_active ) );
7212 
7213  if ( system_active < 0 ) {
7214  //
7215  // There was an error reading the necessary info from /proc,
7216  // so use the thread limit algorithm instead. Once we set
7217  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7218  // we shouldn't wind up getting back here.
7219  //
7220  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7221  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7222 
7223  //
7224  // Make this call behave like the thread limit algorithm.
7225  //
7226  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7227  : root->r.r_hot_team->t.t_nproc);
7228  if ( retval > set_nproc ) {
7229  retval = set_nproc;
7230  }
7231  if ( retval < KMP_MIN_NTH ) {
7232  retval = KMP_MIN_NTH;
7233  }
7234 
7235  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7236  return retval;
7237  }
7238 
7239  //
7240  // There is a slight delay in the load balance algorithm in detecting
7241  // new running procs. The real system load at this instant should be
7242  // at least as large as the #active omp thread that are available to
7243  // add to the team.
7244  //
7245  if ( system_active < team_curr_active ) {
7246  system_active = team_curr_active;
7247  }
7248  retval = __kmp_avail_proc - system_active + team_curr_active;
7249  if ( retval > set_nproc ) {
7250  retval = set_nproc;
7251  }
7252  if ( retval < KMP_MIN_NTH ) {
7253  retval = KMP_MIN_NTH;
7254  }
7255 
7256  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7257  return retval;
7258 } // __kmp_load_balance_nproc()
7259 
7260 #endif /* USE_LOAD_BALANCE */
7261 
7262 
7263 /* ------------------------------------------------------------------------ */
7264 /* ------------------------------------------------------------------------ */
7265 
7266 /* NOTE: this is called with the __kmp_init_lock held */
7267 void
7268 __kmp_cleanup( void )
7269 {
7270  int f;
7271 
7272  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7273 
7274  if (TCR_4(__kmp_init_parallel)) {
7275 #if KMP_HANDLE_SIGNALS
7276  __kmp_remove_signals();
7277 #endif
7278  TCW_4(__kmp_init_parallel, FALSE);
7279  }
7280 
7281  if (TCR_4(__kmp_init_middle)) {
7282 #if KMP_AFFINITY_SUPPORTED
7283  __kmp_affinity_uninitialize();
7284 #endif /* KMP_AFFINITY_SUPPORTED */
7285  __kmp_cleanup_hierarchy();
7286  TCW_4(__kmp_init_middle, FALSE);
7287  }
7288 
7289  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7290 
7291  if (__kmp_init_serial) {
7292 
7293  __kmp_runtime_destroy();
7294 
7295  __kmp_init_serial = FALSE;
7296  }
7297 
7298  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7299  if ( __kmp_root[ f ] != NULL ) {
7300  __kmp_free( __kmp_root[ f ] );
7301  __kmp_root[ f ] = NULL;
7302  }
7303  }
7304  __kmp_free( __kmp_threads );
7305  // __kmp_threads and __kmp_root were allocated at once, as single block, so there is no need in
7306  // freeing __kmp_root.
7307  __kmp_threads = NULL;
7308  __kmp_root = NULL;
7309  __kmp_threads_capacity = 0;
7310 
7311 #if KMP_USE_DYNAMIC_LOCK
7312  __kmp_cleanup_indirect_user_locks();
7313 #else
7314  __kmp_cleanup_user_locks();
7315 #endif
7316 
7317  #if KMP_AFFINITY_SUPPORTED
7318  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7319  __kmp_cpuinfo_file = NULL;
7320  #endif /* KMP_AFFINITY_SUPPORTED */
7321 
7322  #if KMP_USE_ADAPTIVE_LOCKS
7323  #if KMP_DEBUG_ADAPTIVE_LOCKS
7324  __kmp_print_speculative_stats();
7325  #endif
7326  #endif
7327  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7328  __kmp_nested_nth.nth = NULL;
7329  __kmp_nested_nth.size = 0;
7330  __kmp_nested_nth.used = 0;
7331 
7332  __kmp_i18n_catclose();
7333 
7334 #if KMP_STATS_ENABLED
7335  __kmp_accumulate_stats_at_exit();
7336  __kmp_stats_list.deallocate();
7337 #endif
7338 
7339  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7340 }
7341 
7342 /* ------------------------------------------------------------------------ */
7343 /* ------------------------------------------------------------------------ */
7344 
7345 int
7346 __kmp_ignore_mppbeg( void )
7347 {
7348  char *env;
7349 
7350  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7351  if (__kmp_str_match_false( env ))
7352  return FALSE;
7353  }
7354  // By default __kmpc_begin() is no-op.
7355  return TRUE;
7356 }
7357 
7358 int
7359 __kmp_ignore_mppend( void )
7360 {
7361  char *env;
7362 
7363  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7364  if (__kmp_str_match_false( env ))
7365  return FALSE;
7366  }
7367  // By default __kmpc_end() is no-op.
7368  return TRUE;
7369 }
7370 
7371 void
7372 __kmp_internal_begin( void )
7373 {
7374  int gtid;
7375  kmp_root_t *root;
7376 
7377  /* this is a very important step as it will register new sibling threads
7378  * and assign these new uber threads a new gtid */
7379  gtid = __kmp_entry_gtid();
7380  root = __kmp_threads[ gtid ]->th.th_root;
7381  KMP_ASSERT( KMP_UBER_GTID( gtid ));
7382 
7383  if( root->r.r_begin ) return;
7384  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7385  if( root->r.r_begin ) {
7386  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7387  return;
7388  }
7389 
7390  root->r.r_begin = TRUE;
7391 
7392  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7393 }
7394 
7395 
7396 /* ------------------------------------------------------------------------ */
7397 /* ------------------------------------------------------------------------ */
7398 
7399 void
7400 __kmp_user_set_library (enum library_type arg)
7401 {
7402  int gtid;
7403  kmp_root_t *root;
7404  kmp_info_t *thread;
7405 
7406  /* first, make sure we are initialized so we can get our gtid */
7407 
7408  gtid = __kmp_entry_gtid();
7409  thread = __kmp_threads[ gtid ];
7410 
7411  root = thread->th.th_root;
7412 
7413  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7414  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7415  KMP_WARNING( SetLibraryIncorrectCall );
7416  return;
7417  }
7418 
7419  switch ( arg ) {
7420  case library_serial :
7421  thread->th.th_set_nproc = 0;
7422  set__nproc( thread, 1 );
7423  break;
7424  case library_turnaround :
7425  thread->th.th_set_nproc = 0;
7426  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7427  break;
7428  case library_throughput :
7429  thread->th.th_set_nproc = 0;
7430  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7431  break;
7432  default:
7433  KMP_FATAL( UnknownLibraryType, arg );
7434  }
7435 
7436  __kmp_aux_set_library ( arg );
7437 }
7438 
7439 void
7440 __kmp_aux_set_stacksize( size_t arg )
7441 {
7442  if (! __kmp_init_serial)
7443  __kmp_serial_initialize();
7444 
7445 #if KMP_OS_DARWIN
7446  if (arg & (0x1000 - 1)) {
7447  arg &= ~(0x1000 - 1);
7448  if(arg + 0x1000) /* check for overflow if we round up */
7449  arg += 0x1000;
7450  }
7451 #endif
7452  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7453 
7454  /* only change the default stacksize before the first parallel region */
7455  if (! TCR_4(__kmp_init_parallel)) {
7456  size_t value = arg; /* argument is in bytes */
7457 
7458  if (value < __kmp_sys_min_stksize )
7459  value = __kmp_sys_min_stksize ;
7460  else if (value > KMP_MAX_STKSIZE)
7461  value = KMP_MAX_STKSIZE;
7462 
7463  __kmp_stksize = value;
7464 
7465  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7466  }
7467 
7468  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7469 }
7470 
7471 /* set the behaviour of the runtime library */
7472 /* TODO this can cause some odd behaviour with sibling parallelism... */
7473 void
7474 __kmp_aux_set_library (enum library_type arg)
7475 {
7476  __kmp_library = arg;
7477 
7478  switch ( __kmp_library ) {
7479  case library_serial :
7480  {
7481  KMP_INFORM( LibraryIsSerial );
7482  (void) __kmp_change_library( TRUE );
7483  }
7484  break;
7485  case library_turnaround :
7486  (void) __kmp_change_library( TRUE );
7487  break;
7488  case library_throughput :
7489  (void) __kmp_change_library( FALSE );
7490  break;
7491  default:
7492  KMP_FATAL( UnknownLibraryType, arg );
7493  }
7494 }
7495 
7496 /* ------------------------------------------------------------------------ */
7497 /* ------------------------------------------------------------------------ */
7498 
7499 void
7500 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7501 {
7502  int blocktime = arg; /* argument is in milliseconds */
7503  int bt_intervals;
7504  int bt_set;
7505 
7506  __kmp_save_internal_controls( thread );
7507 
7508  /* Normalize and set blocktime for the teams */
7509  if (blocktime < KMP_MIN_BLOCKTIME)
7510  blocktime = KMP_MIN_BLOCKTIME;
7511  else if (blocktime > KMP_MAX_BLOCKTIME)
7512  blocktime = KMP_MAX_BLOCKTIME;
7513 
7514  set__blocktime_team( thread->th.th_team, tid, blocktime );
7515  set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7516 
7517  /* Calculate and set blocktime intervals for the teams */
7518  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7519 
7520  set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7521  set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7522 
7523  /* Set whether blocktime has been set to "TRUE" */
7524  bt_set = TRUE;
7525 
7526  set__bt_set_team( thread->th.th_team, tid, bt_set );
7527  set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7528  KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
7529  __kmp_gtid_from_tid(tid, thread->th.th_team),
7530  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
7531 }
7532 
7533 void
7534 __kmp_aux_set_defaults(
7535  char const * str,
7536  int len
7537 ) {
7538  if ( ! __kmp_init_serial ) {
7539  __kmp_serial_initialize();
7540  };
7541  __kmp_env_initialize( str );
7542 
7543  if (__kmp_settings
7544 #if OMP_40_ENABLED
7545  || __kmp_display_env || __kmp_display_env_verbose
7546 #endif // OMP_40_ENABLED
7547  ) {
7548  __kmp_env_print();
7549  }
7550 } // __kmp_aux_set_defaults
7551 
7552 /* ------------------------------------------------------------------------ */
7553 
7554 /*
7555  * internal fast reduction routines
7556  */
7557 
7558 PACKED_REDUCTION_METHOD_T
7559 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7560  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7561  kmp_critical_name *lck )
7562 {
7563 
7564  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7565  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7566  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
7567  // Finally, it's up to OpenMP RTL to make a decision on which method to select among generated by PAROPT.
7568 
7569  PACKED_REDUCTION_METHOD_T retval;
7570 
7571  int team_size;
7572 
7573  int teamsize_cutoff = 4;
7574 
7575  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
7576  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
7577 
7578  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7579  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
7580 
7581  retval = critical_reduce_block;
7582 
7583  team_size = __kmp_get_team_num_threads( global_tid ); // another choice of getting a team size ( with 1 dynamic deference ) is slower
7584 
7585  if( team_size == 1 ) {
7586 
7587  retval = empty_reduce_block;
7588 
7589  } else {
7590 
7591  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7592  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7593 
7594  #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64
7595 
7596  #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7597 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7598  if( __kmp_mic_type != non_mic ) {
7599  teamsize_cutoff = 8;
7600  }
7601 #endif
7602  if( tree_available ) {
7603  if( team_size <= teamsize_cutoff ) {
7604  if ( atomic_available ) {
7605  retval = atomic_reduce_block;
7606  }
7607  } else {
7608  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7609  }
7610  } else if ( atomic_available ) {
7611  retval = atomic_reduce_block;
7612  }
7613  #else
7614  #error "Unknown or unsupported OS"
7615  #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7616 
7617  #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH
7618 
7619  #if KMP_OS_LINUX || KMP_OS_WINDOWS
7620 
7621  // basic tuning
7622 
7623  if( atomic_available ) {
7624  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7625  retval = atomic_reduce_block;
7626  }
7627  } // otherwise: use critical section
7628 
7629  #elif KMP_OS_DARWIN
7630 
7631  if( atomic_available && ( num_vars <= 3 ) ) {
7632  retval = atomic_reduce_block;
7633  } else if( tree_available ) {
7634  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7635  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7636  }
7637  } // otherwise: use critical section
7638 
7639  #else
7640  #error "Unknown or unsupported OS"
7641  #endif
7642 
7643  #else
7644  #error "Unknown or unsupported architecture"
7645  #endif
7646 
7647  }
7648 
7649  // KMP_FORCE_REDUCTION
7650 
7651  // If the team is serialized (team_size == 1), ignore the forced reduction
7652  // method and stay with the unsynchronized method (empty_reduce_block)
7653  if( __kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) {
7654 
7655  PACKED_REDUCTION_METHOD_T forced_retval;
7656 
7657  int atomic_available, tree_available;
7658 
7659  switch( ( forced_retval = __kmp_force_reduction_method ) )
7660  {
7661  case critical_reduce_block:
7662  KMP_ASSERT( lck ); // lck should be != 0
7663  break;
7664 
7665  case atomic_reduce_block:
7666  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7667  KMP_ASSERT( atomic_available ); // atomic_available should be != 0
7668  break;
7669 
7670  case tree_reduce_block:
7671  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7672  KMP_ASSERT( tree_available ); // tree_available should be != 0
7673  #if KMP_FAST_REDUCTION_BARRIER
7674  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7675  #endif
7676  break;
7677 
7678  default:
7679  KMP_ASSERT( 0 ); // "unsupported method specified"
7680  }
7681 
7682  retval = forced_retval;
7683  }
7684 
7685  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7686 
7687  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7688  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7689 
7690  return ( retval );
7691 }
7692 
7693 // this function is for testing set/get/determine reduce method
7694 kmp_int32
7695 __kmp_get_reduce_method( void ) {
7696  return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7697 }
7698 
7699 /* ------------------------------------------------------------------------ */
#define KMP_START_EXPLICIT_TIMER(name)
"Starts" an explicit timer which will need a corresponding KMP_STOP_EXPLICIT_TIMER() macro...
Definition: kmp_stats.h:668
#define KMP_STOP_EXPLICIT_TIMER(name)
"Stops" an explicit timer.
Definition: kmp_stats.h:682
#define KMP_TIME_BLOCK(name)
Uses specified timer (name) to time code block.
Definition: kmp_stats.h:629
KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid)
Definition: kmp_csupport.c:464
#define KMP_IDENT_AUTOPAR
Definition: kmp.h:201
Definition: kmp.h:218
KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid)
Definition: kmp_csupport.c:449
sched_type
Definition: kmp.h:320
kmp_int32 flags
Definition: kmp.h:220