Intel® OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 /*
36  * Dynamic scheduling initialization and dispatch.
37  *
38  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
39  * it may change values between parallel regions. __kmp_max_nth
40  * is the largest value __kmp_nth may take, 1 is the smallest.
41  *
42  */
43 
44 /* ------------------------------------------------------------------------ */
45 /* ------------------------------------------------------------------------ */
46 
47 #include "kmp.h"
48 #include "kmp_i18n.h"
49 #include "kmp_itt.h"
50 #include "kmp_str.h"
51 #include "kmp_error.h"
52 #include "kmp_stats.h"
53 #if KMP_OS_WINDOWS && KMP_ARCH_X86
54  #include <float.h>
55 #endif
56 
57 #if OMPT_SUPPORT
58 #include "ompt-internal.h"
59 #include "ompt-specific.h"
60 #endif
61 
62 /* ------------------------------------------------------------------------ */
63 /* ------------------------------------------------------------------------ */
64 
65 // template for type limits
66 template< typename T >
67 struct i_maxmin {
68  static const T mx;
69  static const T mn;
70 };
71 template<>
72 struct i_maxmin< int > {
73  static const int mx = 0x7fffffff;
74  static const int mn = 0x80000000;
75 };
76 template<>
77 struct i_maxmin< unsigned int > {
78  static const unsigned int mx = 0xffffffff;
79  static const unsigned int mn = 0x00000000;
80 };
81 template<>
82 struct i_maxmin< long long > {
83  static const long long mx = 0x7fffffffffffffffLL;
84  static const long long mn = 0x8000000000000000LL;
85 };
86 template<>
87 struct i_maxmin< unsigned long long > {
 88  static const unsigned long long mx = 0xffffffffffffffffULL;
 89  static const unsigned long long mn = 0x0000000000000000ULL;
90 };
91 //-------------------------------------------------------------------------
92 
93 #ifdef KMP_STATIC_STEAL_ENABLED
94 
95  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
96  template< typename T >
97  struct dispatch_private_infoXX_template {
98  typedef typename traits_t< T >::unsigned_t UT;
99  typedef typename traits_t< T >::signed_t ST;
100  UT count; // unsigned
101  T ub;
102  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
103  T lb;
104  ST st; // signed
105  UT tc; // unsigned
106  T static_steal_counter; // for static_steal only; maybe better to put after ub
107 
108  /* parm[1-4] are used in different ways by different scheduling algorithms */
109 
110  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
111  // a) parm3 is properly aligned and
112  // b) all parm1-4 are in the same cache line.
 113  // Because parm1-4 are used together, performance seems to be better
114  // if they are in the same line (not measured though).
115 
116  struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
117  T parm1;
118  T parm2;
119  T parm3;
120  T parm4;
121  };
122 
123  UT ordered_lower; // unsigned
124  UT ordered_upper; // unsigned
125  #if KMP_OS_WINDOWS
126  T last_upper;
127  #endif /* KMP_OS_WINDOWS */
128  };
129 
130 #else /* KMP_STATIC_STEAL_ENABLED */
131 
132  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
133  template< typename T >
134  struct dispatch_private_infoXX_template {
135  typedef typename traits_t< T >::unsigned_t UT;
136  typedef typename traits_t< T >::signed_t ST;
137  T lb;
138  T ub;
139  ST st; // signed
140  UT tc; // unsigned
141 
142  T parm1;
143  T parm2;
144  T parm3;
145  T parm4;
146 
147  UT count; // unsigned
148 
149  UT ordered_lower; // unsigned
150  UT ordered_upper; // unsigned
151  #if KMP_OS_WINDOWS
152  T last_upper;
153  #endif /* KMP_OS_WINDOWS */
154  };
155 
156 #endif /* KMP_STATIC_STEAL_ENABLED */
157 
158 // replaces dispatch_private_info structure and dispatch_private_info_t type
159 template< typename T >
160 struct KMP_ALIGN_CACHE dispatch_private_info_template {
161  // duplicate alignment here, otherwise size of structure is not correct in our compiler
162  union KMP_ALIGN_CACHE private_info_tmpl {
163  dispatch_private_infoXX_template< T > p;
164  dispatch_private_info64_t p64;
165  } u;
166  enum sched_type schedule; /* scheduling algorithm */
167  kmp_uint32 ordered; /* ordered clause specified */
168  kmp_uint32 ordered_bumped;
 169  kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
170  dispatch_private_info * next; /* stack of buffers for nest of serial regions */
171  kmp_uint32 nomerge; /* don't merge iters if serialized */
172  kmp_uint32 type_size;
173  enum cons_type pushed_ws;
174 };
175 
176 
177 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
178 template< typename UT >
179 struct dispatch_shared_infoXX_template {
180  /* chunk index under dynamic, number of idle threads under static-steal;
181  iteration index otherwise */
182  volatile UT iteration;
183  volatile UT num_done;
184  volatile UT ordered_iteration;
 185  UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
186 };
187 
188 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
189 template< typename UT >
190 struct dispatch_shared_info_template {
191  // we need union here to keep the structure size
192  union shared_info_tmpl {
193  dispatch_shared_infoXX_template< UT > s;
194  dispatch_shared_info64_t s64;
195  } u;
196  volatile kmp_uint32 buffer_index;
197 };
198 
199 /* ------------------------------------------------------------------------ */
200 /* ------------------------------------------------------------------------ */
201 
202 #undef USE_TEST_LOCKS
203 
204 // test_then_add template (general template should NOT be used)
205 template< typename T >
206 static __forceinline T
207 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
208 
209 template<>
210 __forceinline kmp_int32
211 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
212 {
213  kmp_int32 r;
214  r = KMP_TEST_THEN_ADD32( p, d );
215  return r;
216 }
217 
218 template<>
219 __forceinline kmp_int64
220 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
221 {
222  kmp_int64 r;
223  r = KMP_TEST_THEN_ADD64( p, d );
224  return r;
225 }
226 
227 // test_then_inc_acq template (general template should NOT be used)
228 template< typename T >
229 static __forceinline T
230 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
231 
232 template<>
233 __forceinline kmp_int32
234 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
235 {
236  kmp_int32 r;
237  r = KMP_TEST_THEN_INC_ACQ32( p );
238  return r;
239 }
240 
241 template<>
242 __forceinline kmp_int64
243 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
244 {
245  kmp_int64 r;
246  r = KMP_TEST_THEN_INC_ACQ64( p );
247  return r;
248 }
249 
250 // test_then_inc template (general template should NOT be used)
251 template< typename T >
252 static __forceinline T
253 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
254 
255 template<>
256 __forceinline kmp_int32
257 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
258 {
259  kmp_int32 r;
260  r = KMP_TEST_THEN_INC32( p );
261  return r;
262 }
263 
264 template<>
265 __forceinline kmp_int64
266 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
267 {
268  kmp_int64 r;
269  r = KMP_TEST_THEN_INC64( p );
270  return r;
271 }
272 
273 // compare_and_swap template (general template should NOT be used)
274 template< typename T >
275 static __forceinline kmp_int32
276 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
277 
278 template<>
279 __forceinline kmp_int32
280 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
281 {
282  return KMP_COMPARE_AND_STORE_REL32( p, c, s );
283 }
284 
285 template<>
286 __forceinline kmp_int32
287 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
288 {
289  return KMP_COMPARE_AND_STORE_REL64( p, c, s );
290 }
291 
292 /*
293  Spin wait loop that first does pause, then yield.
294  Waits until function returns non-zero when called with *spinner and check.
295  Does NOT put threads to sleep.
296 #if USE_ITT_BUILD
297  Arguments:
 298  obj -- the higher-level synchronization object to report to ittnotify. It is used to report
 299  locks consistently. For example, if a lock is acquired immediately, its address is
 300  reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
 301  immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
 302  address, not the address of the low-level spinner.
303 #endif // USE_ITT_BUILD
304 */
305 template< typename UT >
306 // ToDo: make inline function (move to header file for icl)
307 static UT // unsigned 4- or 8-byte type
308 __kmp_wait_yield( volatile UT * spinner,
309  UT checker,
310  kmp_uint32 (* pred)( UT, UT )
311  USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
312  )
313 {
314  // note: we may not belong to a team at this point
315  register volatile UT * spin = spinner;
316  register UT check = checker;
317  register kmp_uint32 spins;
318  register kmp_uint32 (*f) ( UT, UT ) = pred;
319  register UT r;
320 
321  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
322  KMP_INIT_YIELD( spins );
323  // main wait spin loop
324  while(!f(r = *spin, check))
325  {
326  KMP_FSYNC_SPIN_PREPARE( obj );
327  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
328  It causes problems with infinite recursion because of exit lock */
329  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
330  __kmp_abort_thread(); */
331 
332  // if we are oversubscribed,
 333  // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
 334  // the pause is in the following code
335  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
336  KMP_YIELD_SPIN( spins );
337  }
338  KMP_FSYNC_SPIN_ACQUIRED( obj );
339  return r;
340 }
341 
342 template< typename UT >
343 static kmp_uint32 __kmp_eq( UT value, UT checker) {
344  return value == checker;
345 }
346 
347 template< typename UT >
348 static kmp_uint32 __kmp_neq( UT value, UT checker) {
349  return value != checker;
350 }
351 
352 template< typename UT >
353 static kmp_uint32 __kmp_lt( UT value, UT checker) {
354  return value < checker;
355 }
356 
357 template< typename UT >
358 static kmp_uint32 __kmp_ge( UT value, UT checker) {
359  return value >= checker;
360 }
361 
362 template< typename UT >
363 static kmp_uint32 __kmp_le( UT value, UT checker) {
364  return value <= checker;
365 }
366 
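// Editorial sketch (not part of the original source): the predicates above are meant to be
// passed to __kmp_wait_yield(); e.g. the ordered-section code later in this file waits with
//     __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
//                             USE_ITT_BUILD_ARG( NULL ) );
// i.e. it spins (pause, then yield) until ordered_iteration >= lower.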
367 
368 /* ------------------------------------------------------------------------ */
369 /* ------------------------------------------------------------------------ */
370 
371 static void
372 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
373 {
374  kmp_info_t *th;
375 
376  KMP_DEBUG_ASSERT( gtid_ref );
377 
378  if ( __kmp_env_consistency_check ) {
379  th = __kmp_threads[*gtid_ref];
380  if ( th -> th.th_root -> r.r_active
381  && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
382 #if KMP_USE_DYNAMIC_LOCK
383  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
384 #else
385  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
386 #endif
387  }
388  }
389 }
390 
391 template< typename UT >
392 static void
393 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
394 {
395  typedef typename traits_t< UT >::signed_t ST;
396  dispatch_private_info_template< UT > * pr;
397 
398  int gtid = *gtid_ref;
399 // int cid = *cid_ref;
400  kmp_info_t *th = __kmp_threads[ gtid ];
401  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
402 
403  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
404  if ( __kmp_env_consistency_check ) {
405  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
406  ( th -> th.th_dispatch -> th_dispatch_pr_current );
407  if ( pr -> pushed_ws != ct_none ) {
408 #if KMP_USE_DYNAMIC_LOCK
409  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
410 #else
411  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
412 #endif
413  }
414  }
415 
416  if ( ! th -> th.th_team -> t.t_serialized ) {
417  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
418  ( th -> th.th_dispatch -> th_dispatch_sh_current );
419  UT lower;
420 
421  if ( ! __kmp_env_consistency_check ) {
422  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
423  ( th -> th.th_dispatch -> th_dispatch_pr_current );
424  }
425  lower = pr->u.p.ordered_lower;
426 
427  #if ! defined( KMP_GOMP_COMPAT )
428  if ( __kmp_env_consistency_check ) {
429  if ( pr->ordered_bumped ) {
430  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
431  __kmp_error_construct2(
432  kmp_i18n_msg_CnsMultipleNesting,
433  ct_ordered_in_pdo, loc_ref,
434  & p->stack_data[ p->w_top ]
435  );
436  }
437  }
438  #endif /* !defined(KMP_GOMP_COMPAT) */
439 
440  KMP_MB();
441  #ifdef KMP_DEBUG
442  {
443  const char * buff;
444  // create format specifiers before the debug output
445  buff = __kmp_str_format(
446  "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
447  traits_t< UT >::spec, traits_t< UT >::spec );
448  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
449  __kmp_str_free( &buff );
450  }
451  #endif
452 
453  __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
454  USE_ITT_BUILD_ARG( NULL )
455  );
456  KMP_MB(); /* is this necessary? */
457  #ifdef KMP_DEBUG
458  {
459  const char * buff;
460  // create format specifiers before the debug output
461  buff = __kmp_str_format(
462  "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
463  traits_t< UT >::spec, traits_t< UT >::spec );
464  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
465  __kmp_str_free( &buff );
466  }
467  #endif
468  }
469  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
470 }
471 
472 static void
473 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
474 {
475  kmp_info_t *th;
476 
477  if ( __kmp_env_consistency_check ) {
478  th = __kmp_threads[*gtid_ref];
479  if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
480  __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
481  }
482  }
483 }
484 
485 template< typename UT >
486 static void
487 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
488 {
489  typedef typename traits_t< UT >::signed_t ST;
490  dispatch_private_info_template< UT > * pr;
491 
492  int gtid = *gtid_ref;
493 // int cid = *cid_ref;
494  kmp_info_t *th = __kmp_threads[ gtid ];
495  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
496 
497  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
498  if ( __kmp_env_consistency_check ) {
499  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
500  ( th -> th.th_dispatch -> th_dispatch_pr_current );
501  if ( pr -> pushed_ws != ct_none ) {
502  __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
503  }
504  }
505 
506  if ( ! th -> th.th_team -> t.t_serialized ) {
507  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
508  ( th -> th.th_dispatch -> th_dispatch_sh_current );
509 
510  if ( ! __kmp_env_consistency_check ) {
511  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
512  ( th -> th.th_dispatch -> th_dispatch_pr_current );
513  }
514 
515  KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
516  #if ! defined( KMP_GOMP_COMPAT )
517  if ( __kmp_env_consistency_check ) {
518  if ( pr->ordered_bumped != 0 ) {
519  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
520  /* How to test it? - OM */
521  __kmp_error_construct2(
522  kmp_i18n_msg_CnsMultipleNesting,
523  ct_ordered_in_pdo, loc_ref,
524  & p->stack_data[ p->w_top ]
525  );
526  }
527  }
528  #endif /* !defined(KMP_GOMP_COMPAT) */
529 
530  KMP_MB(); /* Flush all pending memory write invalidates. */
531 
532  pr->ordered_bumped += 1;
533 
534  KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
535  gtid, pr->ordered_bumped ) );
536 
537  KMP_MB(); /* Flush all pending memory write invalidates. */
538 
539  /* TODO use general release procedure? */
540  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
541 
542  KMP_MB(); /* Flush all pending memory write invalidates. */
543  }
544  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
545 }
546 
 547 /* Computes and returns x to the power of y, where y must be a non-negative integer */
548 template< typename UT >
549 static __forceinline long double
550 __kmp_pow(long double x, UT y) {
551  long double s=1.0L;
552 
553  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
554  //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
555  while(y) {
556  if ( y & 1 )
557  s *= x;
558  x *= x;
559  y >>= 1;
560  }
561  return s;
562 }
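// Worked example (editorial illustration): for y = 13 (binary 1101) the loop above multiplies
// s by x, x^4 and x^8 on the iterations where the low bit of y is set, giving s = x^13 after
// four squarings of x -- standard binary exponentiation in O(log y) multiplications.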
563 
564 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
565  (the total number of unassigned iterations in chunks with index greater than or equal to idx).
 566  __forceinline seems to be broken here: if we __forceinline this function, the behavior is wrong
567  (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
568 */
569 template< typename T >
570 static __inline typename traits_t< T >::unsigned_t
571 __kmp_dispatch_guided_remaining(
572  T tc,
573  typename traits_t< T >::floating_t base,
574  typename traits_t< T >::unsigned_t idx
575 ) {
576  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
577  least for ICL 8.1, long double arithmetic may not really have
578  long double precision, even with /Qlong_double. Currently, we
579  workaround that in the caller code, by manipulating the FPCW for
580  Windows* OS on IA-32 architecture. The lack of precision is not
581  expected to be a correctness issue, though.
582  */
583  typedef typename traits_t< T >::unsigned_t UT;
584 
585  long double x = tc * __kmp_pow< UT >(base, idx);
586  UT r = (UT) x;
587  if ( x == r )
588  return r;
589  return r + 1;
590 }
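// Worked example (editorial illustration): with tc = 1000 and base = 0.875 (the (2*nproc-1)/(2*nproc)
// term for nproc = 4), ten assigned chunks leave ceil(1000 * 0.875^10) = ceil(263.1) = 264
// unassigned iterations.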
591 
592 // Parameters of the guided-iterative algorithm:
593 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
594 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
 595 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
 596 // With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
597 static int guided_int_param = 2;
598 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
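// Editorial illustration (assumes the guided-iterative dispatch code later in this file): with the
// default n = 2, nproc = 4 and chunk = 1, a thread switches to plain dynamic scheduling once the
// remaining trip count drops below p2 = 2 * 4 * (1 + 1) = 16; before that point each grab takes
// roughly remaining * p3 = remaining / 8 iterations.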
599 
600 // UT - unsigned flavor of T, ST - signed flavor of T,
601 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
602 template< typename T >
603 static void
604 __kmp_dispatch_init(
605  ident_t * loc,
606  int gtid,
607  enum sched_type schedule,
608  T lb,
609  T ub,
610  typename traits_t< T >::signed_t st,
611  typename traits_t< T >::signed_t chunk,
612  int push_ws
613 ) {
614  typedef typename traits_t< T >::unsigned_t UT;
615  typedef typename traits_t< T >::signed_t ST;
616  typedef typename traits_t< T >::floating_t DBL;
617  static const int ___kmp_size_type = sizeof( UT );
618 
619  int active;
620  T tc;
621  kmp_info_t * th;
622  kmp_team_t * team;
623  kmp_uint32 my_buffer_index;
624  dispatch_private_info_template< T > * pr;
625  dispatch_shared_info_template< UT > volatile * sh;
626 
627  KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
628  KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
629 
630  if ( ! TCR_4( __kmp_init_parallel ) )
631  __kmp_parallel_initialize();
632 
633 #if INCLUDE_SSC_MARKS
634  SSC_MARK_DISPATCH_INIT();
635 #endif
636  #ifdef KMP_DEBUG
637  {
638  const char * buff;
639  // create format specifiers before the debug output
640  buff = __kmp_str_format(
641  "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
642  traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
643  KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
644  __kmp_str_free( &buff );
645  }
646  #endif
647  /* setup data */
648  th = __kmp_threads[ gtid ];
649  team = th -> th.th_team;
650  active = ! team -> t.t_serialized;
651  th->th.th_ident = loc;
652 
653 #if USE_ITT_BUILD
654  kmp_uint64 cur_chunk = chunk;
655  int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
656  KMP_MASTER_GTID(gtid) &&
657 #if OMP_40_ENABLED
658  th->th.th_teams_microtask == NULL &&
659 #endif
660  team->t.t_active_level == 1;
661 #endif
662  if ( ! active ) {
663  pr = reinterpret_cast< dispatch_private_info_template< T >* >
664  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
665  } else {
666  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
667  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
668 
669  my_buffer_index = th->th.th_dispatch->th_disp_index ++;
670 
 671  /* What happens when the number of threads changes? Do we need to resize the buffer? */
672  pr = reinterpret_cast< dispatch_private_info_template< T > * >
673  ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
674  sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
675  ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
676  }
677 
678  /* Pick up the nomerge/ordered bits from the scheduling type */
679  if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
680  pr->nomerge = TRUE;
681  schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
682  } else {
683  pr->nomerge = FALSE;
684  }
685  pr->type_size = ___kmp_size_type; // remember the size of variables
686  if ( kmp_ord_lower & schedule ) {
687  pr->ordered = TRUE;
688  schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
689  } else {
690  pr->ordered = FALSE;
691  }
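// Editorial note (assumption based on the sched_type enum layout in kmp.h): an ordered variant
// such as kmp_ord_static is mapped back to its base kind kmp_sch_static by subtracting the fixed
// offset (kmp_ord_lower - kmp_sch_lower), while pr->ordered records that the ordered clause was present.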
692  if ( schedule == kmp_sch_static ) {
693  schedule = __kmp_static;
694  } else {
695  if ( schedule == kmp_sch_runtime ) {
696  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
697  schedule = team -> t.t_sched.r_sched_type;
698  // Detail the schedule if needed (global controls are differentiated appropriately)
699  if ( schedule == kmp_sch_guided_chunked ) {
700  schedule = __kmp_guided;
701  } else if ( schedule == kmp_sch_static ) {
702  schedule = __kmp_static;
703  }
704  // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
705  chunk = team -> t.t_sched.chunk;
706 
707  #ifdef KMP_DEBUG
708  {
709  const char * buff;
710  // create format specifiers before the debug output
711  buff = __kmp_str_format(
712  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
713  traits_t< ST >::spec );
714  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
715  __kmp_str_free( &buff );
716  }
717  #endif
718  } else {
719  if ( schedule == kmp_sch_guided_chunked ) {
720  schedule = __kmp_guided;
721  }
722  if ( chunk <= 0 ) {
723  chunk = KMP_DEFAULT_CHUNK;
724  }
725  }
726 
727  if ( schedule == kmp_sch_auto ) {
728  // mapping and differentiation: in the __kmp_do_serial_initialize()
729  schedule = __kmp_auto;
730  #ifdef KMP_DEBUG
731  {
732  const char * buff;
733  // create format specifiers before the debug output
734  buff = __kmp_str_format(
735  "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
736  traits_t< ST >::spec );
737  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
738  __kmp_str_free( &buff );
739  }
740  #endif
741  }
742 
743  /* guided analytical not safe for too many threads */
744  if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
745  schedule = kmp_sch_guided_iterative_chunked;
746  KMP_WARNING( DispatchManyThreads );
747  }
748  pr->u.p.parm1 = chunk;
749  }
750  KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
751  "unknown scheduling type" );
752 
753  pr->u.p.count = 0;
754 
755  if ( __kmp_env_consistency_check ) {
756  if ( st == 0 ) {
757  __kmp_error_construct(
758  kmp_i18n_msg_CnsLoopIncrZeroProhibited,
759  ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
760  );
761  }
762  }
763 
764  tc = ( ub - lb + st );
765  if ( st != 1 ) {
766  if ( st < 0 ) {
767  if ( lb < ub ) {
768  tc = 0; // zero-trip
769  } else { // lb >= ub
770  tc = (ST)tc / st; // convert to signed division
771  }
772  } else { // st > 0
773  if ( ub < lb ) {
774  tc = 0; // zero-trip
 775  } else { // ub >= lb
776  tc /= st;
777  }
778  }
779  } else if ( ub < lb ) { // st == 1
780  tc = 0; // zero-trip
781  }
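// Worked example (editorial illustration) of the trip count computed above: lb = 0, ub = 9, st = 2
// gives tc = (9 - 0 + 2) / 2 = 5, i.e. iterations 0,2,4,6,8; lb = 10, ub = 1, st = -3 gives
// tc = (1 - 10 - 3) / -3 = 4, i.e. iterations 10,7,4,1.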
782 
783  pr->u.p.lb = lb;
784  pr->u.p.ub = ub;
785  pr->u.p.st = st;
786  pr->u.p.tc = tc;
787 
788  #if KMP_OS_WINDOWS
789  pr->u.p.last_upper = ub + st;
790  #endif /* KMP_OS_WINDOWS */
791 
 792  /* NOTE: only the active parallel region(s) have active ordered sections */
793 
794  if ( active ) {
795  if ( pr->ordered == 0 ) {
796  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
797  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
798  } else {
799  pr->ordered_bumped = 0;
800 
801  pr->u.p.ordered_lower = 1;
802  pr->u.p.ordered_upper = 0;
803 
804  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
805  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
806  }
807  }
808 
809  if ( __kmp_env_consistency_check ) {
810  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
811  if ( push_ws ) {
812  __kmp_push_workshare( gtid, ws, loc );
813  pr->pushed_ws = ws;
814  } else {
815  __kmp_check_workshare( gtid, ws, loc );
816  pr->pushed_ws = ct_none;
817  }
818  }
819 
820  switch ( schedule ) {
821  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
 822  case kmp_sch_static_steal:
 823  {
824  T nproc = team->t.t_nproc;
825  T ntc, init;
826 
827  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
828 
829  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
830  if ( nproc > 1 && ntc >= nproc ) {
831  T id = __kmp_tid_from_gtid(gtid);
832  T small_chunk, extras;
833 
834  small_chunk = ntc / nproc;
835  extras = ntc % nproc;
836 
837  init = id * small_chunk + ( id < extras ? id : extras );
838  pr->u.p.count = init;
839  pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
840 
841  pr->u.p.parm2 = lb;
842  //pr->pfields.parm3 = 0; // it's not used in static_steal
843  pr->u.p.parm4 = id;
844  pr->u.p.st = st;
845  break;
846  } else {
847  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
848  gtid ) );
849  schedule = kmp_sch_static_balanced;
850  /* too few iterations: fall-through to kmp_sch_static_balanced */
851  } // if
852  /* FALL-THROUGH to static balanced */
853  } // case
854  #endif
855  case kmp_sch_static_balanced:
856  {
857  T nproc = team->t.t_nproc;
858  T init, limit;
859 
860  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
861  gtid ) );
862 
863  if ( nproc > 1 ) {
864  T id = __kmp_tid_from_gtid(gtid);
865 
866  if ( tc < nproc ) {
867  if ( id < tc ) {
868  init = id;
869  limit = id;
870  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
871  } else {
872  pr->u.p.count = 1; /* means no more chunks to execute */
873  pr->u.p.parm1 = FALSE;
874  break;
875  }
876  } else {
877  T small_chunk = tc / nproc;
878  T extras = tc % nproc;
879  init = id * small_chunk + (id < extras ? id : extras);
880  limit = init + small_chunk - (id < extras ? 0 : 1);
881  pr->u.p.parm1 = (id == nproc - 1);
882  }
883  } else {
884  if ( tc > 0 ) {
885  init = 0;
886  limit = tc - 1;
887  pr->u.p.parm1 = TRUE;
888  } else {
889  // zero trip count
890  pr->u.p.count = 1; /* means no more chunks to execute */
891  pr->u.p.parm1 = FALSE;
892  break;
893  }
894  }
895 #if USE_ITT_BUILD
896  // Calculate chunk for metadata report
897  if ( itt_need_metadata_reporting )
898  cur_chunk = limit - init + 1;
899 #endif
900  if ( st == 1 ) {
901  pr->u.p.lb = lb + init;
902  pr->u.p.ub = lb + limit;
903  } else {
904  T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
905  pr->u.p.lb = lb + init * st;
906  // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
907  if ( st > 0 ) {
908  pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
909  } else {
910  pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
911  }
912  }
913  if ( pr->ordered ) {
914  pr->u.p.ordered_lower = init;
915  pr->u.p.ordered_upper = limit;
916  }
917  break;
918  } // case
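// Worked example (editorial illustration) of the balanced split above: tc = 10, nproc = 4 gives
// small_chunk = 2, extras = 2, so threads 0..3 receive the index ranges [0,2], [3,5], [6,7], [8,9];
// the first 'extras' threads get one extra iteration and only thread 3 sets parm1 (the lastprivate flag).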
919  case kmp_sch_guided_iterative_chunked :
920  {
921  T nproc = team->t.t_nproc;
922  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
923 
924  if ( nproc > 1 ) {
925  if ( (2L * chunk + 1 ) * nproc >= tc ) {
926  /* chunk size too large, switch to dynamic */
927  schedule = kmp_sch_dynamic_chunked;
928  } else {
929  // when remaining iters become less than parm2 - switch to dynamic
930  pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
931  *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
932  }
933  } else {
934  KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
935  schedule = kmp_sch_static_greedy;
936  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
937  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
938  pr->u.p.parm1 = tc;
939  } // if
940  } // case
941  break;
942  case kmp_sch_guided_analytical_chunked:
943  {
944  T nproc = team->t.t_nproc;
945  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
946 
947  if ( nproc > 1 ) {
948  if ( (2L * chunk + 1 ) * nproc >= tc ) {
949  /* chunk size too large, switch to dynamic */
950  schedule = kmp_sch_dynamic_chunked;
951  } else {
952  /* commonly used term: (2 nproc - 1)/(2 nproc) */
953  DBL x;
954 
955  #if KMP_OS_WINDOWS && KMP_ARCH_X86
956  /* Linux* OS already has 64-bit computation by default for
957  long double, and on Windows* OS on Intel(R) 64,
958  /Qlong_double doesn't work. On Windows* OS
959  on IA-32 architecture, we need to set precision to
960  64-bit instead of the default 53-bit. Even though long
961  double doesn't work on Windows* OS on Intel(R) 64, the
962  resulting lack of precision is not expected to impact
963  the correctness of the algorithm, but this has not been
964  mathematically proven.
965  */
966  // save original FPCW and set precision to 64-bit, as
967  // Windows* OS on IA-32 architecture defaults to 53-bit
968  unsigned int oldFpcw = _control87(0,0);
969  _control87(_PC_64,_MCW_PC); // 0,0x30000
970  #endif
971  /* value used for comparison in solver for cross-over point */
972  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
973 
974  /* crossover point--chunk indexes equal to or greater than
975  this point switch to dynamic-style scheduling */
976  UT cross;
977 
978  /* commonly used term: (2 nproc - 1)/(2 nproc) */
979  x = (long double)1.0 - (long double)0.5 / nproc;
980 
981  #ifdef KMP_DEBUG
982  { // test natural alignment
983  struct _test_a {
984  char a;
985  union {
986  char b;
987  DBL d;
988  };
989  } t;
990  ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
991  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
992  KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
993  }
994  #endif // KMP_DEBUG
995 
996  /* save the term in thread private dispatch structure */
997  *(DBL*)&pr->u.p.parm3 = x;
998 
999  /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
1000  {
1001  UT left, right, mid;
1002  long double p;
1003 
1004  /* estimate initial upper and lower bound */
1005 
1006  /* doesn't matter what value right is as long as it is positive, but
1007  it affects performance of the solver
1008  */
1009  right = 229;
1010  p = __kmp_pow< UT >(x,right);
1011  if ( p > target ) {
1012  do{
1013  p *= p;
1014  right <<= 1;
1015  } while(p>target && right < (1<<27));
1016  left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1017  } else {
1018  left = 0;
1019  }
1020 
1021  /* bisection root-finding method */
1022  while ( left + 1 < right ) {
1023  mid = (left + right) / 2;
1024  if ( __kmp_pow< UT >(x,mid) > target ) {
1025  left = mid;
1026  } else {
1027  right = mid;
1028  }
1029  } // while
1030  cross = right;
1031  }
1032  /* assert sanity of computed crossover point */
1033  KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1034 
1035  /* save the crossover point in thread private dispatch structure */
1036  pr->u.p.parm2 = cross;
1037 
1038  // C75803
1039  #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1040  #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1041  #else
1042  #define GUIDED_ANALYTICAL_WORKAROUND (x)
1043  #endif
1044  /* dynamic-style scheduling offset */
1045  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1046  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1047  // restore FPCW
1048  _control87(oldFpcw,_MCW_PC);
1049  #endif
1050  } // if
1051  } else {
1052  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1053  gtid ) );
1054  schedule = kmp_sch_static_greedy;
1055  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1056  pr->u.p.parm1 = tc;
1057  } // if
1058  } // case
1059  break;
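// Worked example (editorial illustration) of the crossover solver above: nproc = 4, chunk = 1,
// tc = 1000 gives target = 3 * 4 / 1000 = 0.012 and x = 1 - 0.5/4 = 0.875; the doubling/bisection
// finds the smallest cross with 0.875^cross <= 0.012, which is cross = 34, so chunk indexes >= 34
// use dynamic-style scheduling.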
1060  case kmp_sch_static_greedy:
1061  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1062  pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1063  ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1064  tc;
1065  break;
1066  case kmp_sch_static_chunked :
1067  case kmp_sch_dynamic_chunked :
1068  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1069  break;
1070  case kmp_sch_trapezoidal :
1071  {
1072  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1073 
1074  T parm1, parm2, parm3, parm4;
1075  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1076 
1077  parm1 = chunk;
1078 
1079  /* F : size of the first cycle */
1080  parm2 = ( tc / (2 * team->t.t_nproc) );
1081 
1082  if ( parm2 < 1 ) {
1083  parm2 = 1;
1084  }
1085 
1086  /* L : size of the last cycle. Make sure the last cycle
1087  * is not larger than the first cycle.
1088  */
1089  if ( parm1 < 1 ) {
1090  parm1 = 1;
1091  } else if ( parm1 > parm2 ) {
1092  parm1 = parm2;
1093  }
1094 
1095  /* N : number of cycles */
1096  parm3 = ( parm2 + parm1 );
1097  parm3 = ( 2 * tc + parm3 - 1) / parm3;
1098 
1099  if ( parm3 < 2 ) {
1100  parm3 = 2;
1101  }
1102 
1103  /* sigma : decreasing incr of the trapezoid */
1104  parm4 = ( parm3 - 1 );
1105  parm4 = ( parm2 - parm1 ) / parm4;
1106 
1107  // pointless check, because parm4 >= 0 always
1108  //if ( parm4 < 0 ) {
1109  // parm4 = 0;
1110  //}
1111 
1112  pr->u.p.parm1 = parm1;
1113  pr->u.p.parm2 = parm2;
1114  pr->u.p.parm3 = parm3;
1115  pr->u.p.parm4 = parm4;
1116  } // case
1117  break;
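// Worked example (editorial illustration) of the TSS parameters above: tc = 100, nproc = 2, chunk = 1
// gives F = parm2 = 100/4 = 25, L = parm1 = 1, N = parm3 = (200 + 26 - 1)/26 = 8 chunks and
// sigma = parm4 = (25 - 1)/7 = 3, so successive chunks shrink as 25, 22, 19, ... and their total (116)
// covers the 100 iterations, the final chunk being clamped to the trip end at dispatch time.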
1118 
1119  default:
1120  {
1121  __kmp_msg(
1122  kmp_ms_fatal, // Severity
1123  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1124  KMP_HNT( GetNewerLibrary ), // Hint
1125  __kmp_msg_null // Variadic argument list terminator
1126  );
1127  }
1128  break;
1129  } // switch
1130  pr->schedule = schedule;
1131  if ( active ) {
1132  /* The name of this buffer should be my_buffer_index when it's free to use it */
1133 
1134  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1135  gtid, my_buffer_index, sh->buffer_index) );
1136  __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1137  USE_ITT_BUILD_ARG( NULL )
1138  );
1139  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1140  // *always* 32-bit integers.
1141  KMP_MB(); /* is this necessary? */
1142  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1143  gtid, my_buffer_index, sh->buffer_index) );
1144 
1145  th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1146  th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1147 #if USE_ITT_BUILD
1148  if ( pr->ordered ) {
1149  __kmp_itt_ordered_init( gtid );
1150  }; // if
1151  // Report loop metadata
1152  if ( itt_need_metadata_reporting ) {
1153  // Only report metadata by master of active team at level 1
1154  kmp_uint64 schedtype = 0;
1155  switch ( schedule ) {
1156  case kmp_sch_static_chunked:
1157  case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1158  break;
1159  case kmp_sch_static_greedy:
1160  cur_chunk = pr->u.p.parm1;
1161  break;
1162  case kmp_sch_dynamic_chunked:
1163  schedtype = 1;
1164  break;
1165  case kmp_sch_guided_iterative_chunked:
1166  case kmp_sch_guided_analytical_chunked:
1167  schedtype = 2;
1168  break;
1169  default:
1170 // Should we put this case under "static"?
1171 // case kmp_sch_static_steal:
1172  schedtype = 3;
1173  break;
1174  }
1175  __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1176  }
1177 #endif /* USE_ITT_BUILD */
1178  }; // if
1179 
1180  #ifdef KMP_DEBUG
1181  {
1182  const char * buff;
1183  // create format specifiers before the debug output
1184  buff = __kmp_str_format(
1185  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1186  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1187  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1188  traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1189  traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1190  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1191  traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1192  KD_TRACE(10, ( buff,
1193  gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1194  pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1195  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1196  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1197  __kmp_str_free( &buff );
1198  }
1199  #endif
1200  #if ( KMP_STATIC_STEAL_ENABLED )
1201  if ( ___kmp_size_type < 8 ) {
1202  // It cannot be guaranteed that after execution of a loop with some other schedule kind
1203  // all the parm3 variables will contain the same value.
 1204  // Even if all parm3 values were the same, a bad case would remain, e.g. toggling between 0 and 1
 1205  // rather than using a program-lifetime increment.
 1206  // So a dedicated variable is required; 'static_steal_counter' is used.
1207  if( schedule == kmp_sch_static_steal ) {
1208  // Other threads will inspect this variable when searching for a victim.
 1209  // This is a flag showing that other threads may steal from this thread from now on.
1210  volatile T * p = &pr->u.p.static_steal_counter;
1211  *p = *p + 1;
1212  }
1213  }
 1214  #endif // ( KMP_STATIC_STEAL_ENABLED )
1215 
1216 #if OMPT_SUPPORT && OMPT_TRACE
1217  if ((ompt_status == ompt_status_track_callback) &&
1218  ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1219  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1220  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1221  ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1222  team_info->parallel_id, task_info->task_id, team_info->microtask);
1223  }
1224 #endif
1225 }
1226 
1227 /*
1228  * For ordered loops, either __kmp_dispatch_finish() should be called after
1229  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1230  * every chunk of iterations. If the ordered section(s) were not executed
1231  * for this iteration (or every iteration in this chunk), we need to set the
1232  * ordered iteration counters so that the next thread can proceed.
1233  */
1234 template< typename UT >
1235 static void
1236 __kmp_dispatch_finish( int gtid, ident_t *loc )
1237 {
1238  typedef typename traits_t< UT >::signed_t ST;
1239  kmp_info_t *th = __kmp_threads[ gtid ];
1240 
1241  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1242  if ( ! th -> th.th_team -> t.t_serialized ) {
1243 
1244  dispatch_private_info_template< UT > * pr =
1245  reinterpret_cast< dispatch_private_info_template< UT >* >
1246  ( th->th.th_dispatch->th_dispatch_pr_current );
1247  dispatch_shared_info_template< UT > volatile * sh =
1248  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1249  ( th->th.th_dispatch->th_dispatch_sh_current );
1250  KMP_DEBUG_ASSERT( pr );
1251  KMP_DEBUG_ASSERT( sh );
1252  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1253  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1254 
1255  if ( pr->ordered_bumped ) {
1256  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1257  gtid ) );
1258  pr->ordered_bumped = 0;
1259  } else {
1260  UT lower = pr->u.p.ordered_lower;
1261 
1262  #ifdef KMP_DEBUG
1263  {
1264  const char * buff;
1265  // create format specifiers before the debug output
1266  buff = __kmp_str_format(
1267  "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1268  traits_t< UT >::spec, traits_t< UT >::spec );
1269  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1270  __kmp_str_free( &buff );
1271  }
1272  #endif
1273 
1274  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1275  USE_ITT_BUILD_ARG(NULL)
1276  );
1277  KMP_MB(); /* is this necessary? */
1278  #ifdef KMP_DEBUG
1279  {
1280  const char * buff;
1281  // create format specifiers before the debug output
1282  buff = __kmp_str_format(
1283  "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1284  traits_t< UT >::spec, traits_t< UT >::spec );
1285  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1286  __kmp_str_free( &buff );
1287  }
1288  #endif
1289 
1290  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1291  } // if
1292  } // if
1293  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1294 }
1295 
1296 #ifdef KMP_GOMP_COMPAT
1297 
1298 template< typename UT >
1299 static void
1300 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1301 {
1302  typedef typename traits_t< UT >::signed_t ST;
1303  kmp_info_t *th = __kmp_threads[ gtid ];
1304 
1305  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1306  if ( ! th -> th.th_team -> t.t_serialized ) {
1307 // int cid;
1308  dispatch_private_info_template< UT > * pr =
1309  reinterpret_cast< dispatch_private_info_template< UT >* >
1310  ( th->th.th_dispatch->th_dispatch_pr_current );
1311  dispatch_shared_info_template< UT > volatile * sh =
1312  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1313  ( th->th.th_dispatch->th_dispatch_sh_current );
1314  KMP_DEBUG_ASSERT( pr );
1315  KMP_DEBUG_ASSERT( sh );
1316  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1317  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1318 
1319 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1320  UT lower = pr->u.p.ordered_lower;
1321  UT upper = pr->u.p.ordered_upper;
1322  UT inc = upper - lower + 1;
1323 
1324  if ( pr->ordered_bumped == inc ) {
1325  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1326  gtid ) );
1327  pr->ordered_bumped = 0;
1328  } else {
1329  inc -= pr->ordered_bumped;
1330 
1331  #ifdef KMP_DEBUG
1332  {
1333  const char * buff;
1334  // create format specifiers before the debug output
1335  buff = __kmp_str_format(
1336  "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1337  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1338  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1339  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1340  __kmp_str_free( &buff );
1341  }
1342  #endif
1343 
1344  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1345  USE_ITT_BUILD_ARG(NULL)
1346  );
1347 
1348  KMP_MB(); /* is this necessary? */
1349  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1350  gtid ) );
1351  pr->ordered_bumped = 0;
1353  #ifdef KMP_DEBUG
1354  {
1355  const char * buff;
1356  // create format specifiers before the debug output
1357  buff = __kmp_str_format(
1358  "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1359  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1360  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1361  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1362  __kmp_str_free( &buff );
1363  }
1364  #endif
1365 
1366  test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1367  }
1368 // }
1369  }
1370  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1371 }
1372 
1373 #endif /* KMP_GOMP_COMPAT */
1374 
1375 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1376  * (no more work), then tell OMPT the loop is over. In some cases
1377  * kmp_dispatch_fini() is not called. */
1378 #if OMPT_SUPPORT && OMPT_TRACE
1379 #define OMPT_LOOP_END \
1380  if (status == 0) { \
1381  if ((ompt_status == ompt_status_track_callback) && \
1382  ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1383  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1384  ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1385  ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1386  team_info->parallel_id, task_info->task_id); \
1387  } \
1388  }
1389 #else
1390 #define OMPT_LOOP_END // no-op
1391 #endif
1392 
1393 template< typename T >
1394 static int
1395 __kmp_dispatch_next(
1396  ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1397 ) {
1398 
1399  typedef typename traits_t< T >::unsigned_t UT;
1400  typedef typename traits_t< T >::signed_t ST;
1401  typedef typename traits_t< T >::floating_t DBL;
1402  static const int ___kmp_size_type = sizeof( UT );
1403 
1404  int status;
1405  dispatch_private_info_template< T > * pr;
1406  kmp_info_t * th = __kmp_threads[ gtid ];
1407  kmp_team_t * team = th -> th.th_team;
1408 
1409  KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1410  #ifdef KMP_DEBUG
1411  {
1412  const char * buff;
1413  // create format specifiers before the debug output
1414  buff = __kmp_str_format(
1415  "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1416  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1417  KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1418  __kmp_str_free( &buff );
1419  }
1420  #endif
1421 
1422  if ( team -> t.t_serialized ) {
 1423  /* NOTE: serialize this dispatch because we are not at the active level */
1424  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1425  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1426  KMP_DEBUG_ASSERT( pr );
1427 
1428  if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1429  *p_lb = 0;
1430  *p_ub = 0;
1431 // if ( p_last != NULL )
1432 // *p_last = 0;
1433  if ( p_st != NULL )
1434  *p_st = 0;
1435  if ( __kmp_env_consistency_check ) {
1436  if ( pr->pushed_ws != ct_none ) {
1437  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1438  }
1439  }
1440  } else if ( pr->nomerge ) {
1441  kmp_int32 last;
1442  T start;
1443  UT limit, trip, init;
1444  ST incr;
1445  T chunk = pr->u.p.parm1;
1446 
1447  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1448 
1449  init = chunk * pr->u.p.count++;
1450  trip = pr->u.p.tc - 1;
1451 
1452  if ( (status = (init <= trip)) == 0 ) {
1453  *p_lb = 0;
1454  *p_ub = 0;
1455 // if ( p_last != NULL )
1456 // *p_last = 0;
1457  if ( p_st != NULL )
1458  *p_st = 0;
1459  if ( __kmp_env_consistency_check ) {
1460  if ( pr->pushed_ws != ct_none ) {
1461  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1462  }
1463  }
1464  } else {
1465  start = pr->u.p.lb;
1466  limit = chunk + init - 1;
1467  incr = pr->u.p.st;
1468 
1469  if ( (last = (limit >= trip)) != 0 ) {
1470  limit = trip;
1471  #if KMP_OS_WINDOWS
1472  pr->u.p.last_upper = pr->u.p.ub;
1473  #endif /* KMP_OS_WINDOWS */
1474  }
1475  if ( p_last != NULL )
1476  *p_last = last;
1477  if ( p_st != NULL )
1478  *p_st = incr;
1479  if ( incr == 1 ) {
1480  *p_lb = start + init;
1481  *p_ub = start + limit;
1482  } else {
1483  *p_lb = start + init * incr;
1484  *p_ub = start + limit * incr;
1485  }
1486 
1487  if ( pr->ordered ) {
1488  pr->u.p.ordered_lower = init;
1489  pr->u.p.ordered_upper = limit;
1490  #ifdef KMP_DEBUG
1491  {
1492  const char * buff;
1493  // create format specifiers before the debug output
1494  buff = __kmp_str_format(
1495  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1496  traits_t< UT >::spec, traits_t< UT >::spec );
1497  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1498  __kmp_str_free( &buff );
1499  }
1500  #endif
1501  } // if
1502  } // if
1503  } else {
1504  pr->u.p.tc = 0;
1505  *p_lb = pr->u.p.lb;
1506  *p_ub = pr->u.p.ub;
1507  #if KMP_OS_WINDOWS
1508  pr->u.p.last_upper = *p_ub;
1509  #endif /* KMP_OS_WINDOWS */
1510  if ( p_last != NULL )
1511  *p_last = TRUE;
1512  if ( p_st != NULL )
1513  *p_st = pr->u.p.st;
1514  } // if
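// Illustrative trace (editorial, serialized nomerge path above): with pr->u.p.lb = 0, tc = 10,
// st = 1 and chunk = 4, successive calls hand back [0,3], [4,7] and then [8,9] (clamped to the
// trip count and flagged through *p_last); the next call returns status 0.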
1515  #ifdef KMP_DEBUG
1516  {
1517  const char * buff;
1518  // create format specifiers before the debug output
1519  buff = __kmp_str_format(
1520  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1521  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1522  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1523  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1524  __kmp_str_free( &buff );
1525  }
1526  #endif
1527 #if INCLUDE_SSC_MARKS
1528  SSC_MARK_DISPATCH_NEXT();
1529 #endif
1530  OMPT_LOOP_END;
1531  return status;
1532  } else {
1533  kmp_int32 last = 0;
1534  dispatch_shared_info_template< UT > *sh;
1535  T start;
1536  ST incr;
1537  UT limit, trip, init;
1538 
1539  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1540  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1541 
1542  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1543  ( th->th.th_dispatch->th_dispatch_pr_current );
1544  KMP_DEBUG_ASSERT( pr );
1545  sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1546  ( th->th.th_dispatch->th_dispatch_sh_current );
1547  KMP_DEBUG_ASSERT( sh );
1548 
1549  if ( pr->u.p.tc == 0 ) {
1550  // zero trip count
1551  status = 0;
1552  } else {
1553  switch (pr->schedule) {
1554  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1555  case kmp_sch_static_steal:
1556  {
1557  T chunk = pr->u.p.parm1;
1558 
1559  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1560 
1561  trip = pr->u.p.tc - 1;
1562 
1563  if ( ___kmp_size_type > 4 ) {
1564  // Other threads do not look into the data of this thread,
 1565  // so a volatile cast is not necessary.
1566  init = ( pr->u.p.count )++;
1567  status = ( init < (UT)pr->u.p.ub );
1568  } else {
1569  typedef union {
1570  struct {
1571  UT count;
1572  T ub;
1573  } p;
1574  kmp_int64 b;
1575  } union_i4;
 1576  // All operations on 'count' or 'ub' must be combined atomically together;
 1577  // stealing is implemented only for 4-byte indexes.
1578  {
1579  union_i4 vold, vnew;
1580  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1581  vnew = vold;
1582  vnew.p.count++;
1583  while( ! KMP_COMPARE_AND_STORE_ACQ64(
1584  ( volatile kmp_int64* )&pr->u.p.count,
1585  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1586  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1587  KMP_CPU_PAUSE();
1588  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1589  vnew = vold;
1590  vnew.p.count++;
1591  }
1592  vnew = vold;
1593  init = vnew.p.count;
1594  status = ( init < (UT)vnew.p.ub ) ;
1595  }
1596 
1597  if( !status ) {
1598  kmp_info_t **other_threads = team->t.t_threads;
1599  int while_limit = 10;
1600  int while_index = 0;
1601 
1602  // TODO: algorithm of searching for a victim
1603  // should be cleaned up and measured
1604  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1605  union_i4 vold, vnew;
1606  kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1607  T victimIdx = pr->u.p.parm4;
1608  T oldVictimIdx = victimIdx;
1609  dispatch_private_info_template< T > * victim;
1610 
1611  do {
1612  if( !victimIdx ) {
1613  victimIdx = team->t.t_nproc - 1;
1614  } else {
1615  --victimIdx;
1616  }
1617  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1618  ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1619  } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1620  // TODO: think about a proper place of this test
1621  if ( ( !victim ) ||
1622  ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1623  (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1624  // TODO: delay would be nice
1625  continue;
1626  // the victim is not ready yet to participate in stealing
1627  // because the victim is still in kmp_init_dispatch
1628  }
1629  if ( oldVictimIdx == victimIdx ) {
1630  break;
1631  }
1632  pr->u.p.parm4 = victimIdx;
1633 
1634  while( 1 ) {
1635  vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1636  vnew = vold;
1637 
1638  KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1639  if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1640  break;
1641  }
1642  vnew.p.ub -= (remaining >> 2);
1643  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1644  #pragma warning( push )
1645  // disable warning on pointless comparison of unsigned with 0
1646  #pragma warning( disable: 186 )
1647  KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1648  #pragma warning( pop )
1649  // TODO: Should this be acquire or release?
1650  if ( KMP_COMPARE_AND_STORE_ACQ64(
1651  ( volatile kmp_int64 * )&victim->u.p.count,
1652  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1653  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1654  status = 1;
1655  while_index = 0;
1656  // now update own count and ub
1657  #if KMP_ARCH_X86
1658  // stealing executed on non-KMP_ARCH_X86 only
1659  // Atomic 64-bit write on ia32 is
1660  // unavailable, so we do this in steps.
1661  // This code is not tested.
1662  init = vold.p.count;
1663  pr->u.p.ub = 0;
1664  pr->u.p.count = init + 1;
1665  pr->u.p.ub = vnew.p.count;
1666  #else
1667  init = vnew.p.ub;
1668  vold.p.count = init + 1;
1669  // TODO: is it safe and enough?
1670  *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1671  #endif // KMP_ARCH_X86
1672  break;
1673  } // if
1674  KMP_CPU_PAUSE();
1675  } // while (1)
1676  } // while
1677  } // if
1678  } // if
1679  if ( !status ) {
1680  *p_lb = 0;
1681  *p_ub = 0;
1682  if ( p_st != NULL ) *p_st = 0;
1683  } else {
1684  start = pr->u.p.parm2;
1685  init *= chunk;
1686  limit = chunk + init - 1;
1687  incr = pr->u.p.st;
1688 
1689  KMP_DEBUG_ASSERT(init <= trip);
1690  if ( (last = (limit >= trip)) != 0 )
1691  limit = trip;
1692  if ( p_st != NULL ) *p_st = incr;
1693 
1694  if ( incr == 1 ) {
1695  *p_lb = start + init;
1696  *p_ub = start + limit;
1697  } else {
1698  *p_lb = start + init * incr;
1699  *p_ub = start + limit * incr;
1700  }
1701 
1702  if ( pr->ordered ) {
1703  pr->u.p.ordered_lower = init;
1704  pr->u.p.ordered_upper = limit;
1705  #ifdef KMP_DEBUG
1706  {
1707  const char * buff;
1708  // create format specifiers before the debug output
1709  buff = __kmp_str_format(
1710  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1711  traits_t< UT >::spec, traits_t< UT >::spec );
1712  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1713  __kmp_str_free( &buff );
1714  }
1715  #endif
1716  } // if
1717  } // if
1718  break;
1719  } // case
1720  #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
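 /*
  * Illustrative sketch (not part of this runtime): the same "pack two 32-bit
  * fields into one 64-bit word and update them with a single CAS" technique
  * used by the static_steal case above, expressed with C++11 std::atomic.
  * All names below (range64, claim_iteration) are hypothetical.
  */
 #if 0   // illustrative only; never compiled
 #include <atomic>
 #include <cstdint>
 #include <cstring>

 struct range64 {            // mirrors union_i4: {count, ub}
     std::uint32_t count;    // next iteration to hand out
     std::uint32_t ub;       // exclusive upper bound
 };

 // Claim one iteration from the packed word; returns true on success.
 static bool claim_iteration( std::atomic< std::uint64_t > &packed,
                              std::uint32_t &it )
 {
     std::uint64_t old_word = packed.load( std::memory_order_relaxed );
     for ( ; ; ) {
         range64 r;
         std::memcpy( &r, &old_word, sizeof( r ) );
         if ( r.count >= r.ub )
             return false;               // nothing left to claim
         it = r.count++;                 // take the current iteration
         std::uint64_t new_word;
         std::memcpy( &new_word, &r, sizeof( new_word ) );
         if ( packed.compare_exchange_weak( old_word, new_word,
                                            std::memory_order_acquire,
                                            std::memory_order_relaxed ) )
             return true;                // CAS succeeded, iteration 'it' is ours
         // old_word was refreshed by the failed CAS; retry
     }
 }
 #endif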
1721  case kmp_sch_static_balanced:
1722  {
1723  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1724  if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1725  pr->u.p.count = 1;
1726  *p_lb = pr->u.p.lb;
1727  *p_ub = pr->u.p.ub;
1728  last = pr->u.p.parm1;
1729  if ( p_st != NULL )
1730  *p_st = pr->u.p.st;
1731  } else { /* no iterations to do */
1732  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1733  }
1734  if ( pr->ordered ) {
1735  #ifdef KMP_DEBUG
1736  {
1737  const char * buff;
1738  // create format specifiers before the debug output
1739  buff = __kmp_str_format(
1740  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1741  traits_t< UT >::spec, traits_t< UT >::spec );
1742  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1743  __kmp_str_free( &buff );
1744  }
1745  #endif
1746  } // if
1747  } // case
1748  break;
1749  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1750  case kmp_sch_static_chunked:
1751  {
1752  T parm1;
1753 
1754  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1755  gtid ) );
1756  parm1 = pr->u.p.parm1;
1757 
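 // Descriptive note: with a static (greedy or chunked) schedule the chunks are
 // assigned round-robin by thread id: this thread takes chunks tid, tid + nproc,
 // tid + 2*nproc, ... of parm1 iterations each, so 'init' below is simply
 // parm1 * (count + tid) and 'count' advances by nproc after every dispatch.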
1758  trip = pr->u.p.tc - 1;
1759  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1760 
1761  if ( (status = (init <= trip)) != 0 ) {
1762  start = pr->u.p.lb;
1763  incr = pr->u.p.st;
1764  limit = parm1 + init - 1;
1765 
1766  if ( (last = (limit >= trip)) != 0 )
1767  limit = trip;
1768 
1769  if ( p_st != NULL ) *p_st = incr;
1770 
1771  pr->u.p.count += team->t.t_nproc;
1772 
1773  if ( incr == 1 ) {
1774  *p_lb = start + init;
1775  *p_ub = start + limit;
1776  }
1777  else {
1778  *p_lb = start + init * incr;
1779  *p_ub = start + limit * incr;
1780  }
1781 
1782  if ( pr->ordered ) {
1783  pr->u.p.ordered_lower = init;
1784  pr->u.p.ordered_upper = limit;
1785  #ifdef KMP_DEBUG
1786  {
1787  const char * buff;
1788  // create format specifiers before the debug output
1789  buff = __kmp_str_format(
1790  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1791  traits_t< UT >::spec, traits_t< UT >::spec );
1792  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1793  __kmp_str_free( &buff );
1794  }
1795  #endif
1796  } // if
1797  } // if
1798  } // case
1799  break;
1800 
1801  case kmp_sch_dynamic_chunked:
1802  {
1803  T chunk = pr->u.p.parm1;
1804 
1805  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1806  gtid ) );
1807 
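 // Descriptive note: plain dynamic scheduling.  Each call atomically bumps the
 // shared iteration counter, and the returned (old) value is the index of the
 // chunk this thread owns; chunk i covers iterations
 // [i*chunk, min(i*chunk + chunk - 1, trip)] of the normalized space.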
1808  init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1809  trip = pr->u.p.tc - 1;
1810 
1811  if ( (status = (init <= trip)) == 0 ) {
1812  *p_lb = 0;
1813  *p_ub = 0;
1814  if ( p_st != NULL ) *p_st = 0;
1815  } else {
1816  start = pr->u.p.lb;
1817  limit = chunk + init - 1;
1818  incr = pr->u.p.st;
1819 
1820  if ( (last = (limit >= trip)) != 0 )
1821  limit = trip;
1822 
1823  if ( p_st != NULL ) *p_st = incr;
1824 
1825  if ( incr == 1 ) {
1826  *p_lb = start + init;
1827  *p_ub = start + limit;
1828  } else {
1829  *p_lb = start + init * incr;
1830  *p_ub = start + limit * incr;
1831  }
1832 
1833  if ( pr->ordered ) {
1834  pr->u.p.ordered_lower = init;
1835  pr->u.p.ordered_upper = limit;
1836  #ifdef KMP_DEBUG
1837  {
1838  const char * buff;
1839  // create format specifiers before the debug output
1840  buff = __kmp_str_format(
1841  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1842  traits_t< UT >::spec, traits_t< UT >::spec );
1843  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1844  __kmp_str_free( &buff );
1845  }
1846  #endif
1847  } // if
1848  } // if
1849  } // case
1850  break;
1851 
1852  case kmp_sch_guided_iterative_chunked:
1853  {
1854  T chunkspec = pr->u.p.parm1;
1855  KD_TRACE(100,
1856  ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1857  trip = pr->u.p.tc;
1858  // Start atomic part of calculations
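 // Descriptive note: iterative guided scheduling.  Each attempt reads the
 // shared iteration counter, proposes to take roughly remaining/(K*nproc)
 // iterations (parm3 caches the 1/(K*nproc) factor as a double, K=2 by
 // default), and commits the claim with a CAS on sh->u.s.iteration.  Once
 // fewer than parm2 iterations remain, the code falls back to plain dynamic
 // chunks of 'chunkspec' iterations.
 // Illustrative arithmetic (hypothetical numbers): with trip=10000, nproc=4,
 // K=2 the first successful claim takes ~10000/8 = 1250 iterations, the next
 // ~8750/8 = 1093, and so on.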
1859  while(1) {
1860  ST remaining; // signed, because can be < 0
1861  init = sh->u.s.iteration; // shared value
1862  remaining = trip - init;
1863  if ( remaining <= 0 ) { // AC: need to compare with 0 first
1864  // nothing to do, don't try atomic op
1865  status = 0;
1866  break;
1867  }
1868  if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1869  // use dynamic-style schedule
1870  // atomically increment iterations, get old value
1871  init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1872  remaining = trip - init;
1873  if (remaining <= 0) {
1874  status = 0; // all iterations got by other threads
1875  } else {
1876  // got some iterations to work on
1877  status = 1;
1878  if ( (T)remaining > chunkspec ) {
1879  limit = init + chunkspec - 1;
1880  } else {
1881  last = 1; // the last chunk
1882  limit = init + remaining - 1;
1883  } // if
1884  } // if
1885  break;
1886  } // if
1887  limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1888  if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1889  // CAS was successful, chunk obtained
1890  status = 1;
1891  --limit;
1892  break;
1893  } // if
1894  } // while
1895  if ( status != 0 ) {
1896  start = pr->u.p.lb;
1897  incr = pr->u.p.st;
1898  if ( p_st != NULL )
1899  *p_st = incr;
1900  *p_lb = start + init * incr;
1901  *p_ub = start + limit * incr;
1902  if ( pr->ordered ) {
1903  pr->u.p.ordered_lower = init;
1904  pr->u.p.ordered_upper = limit;
1905  #ifdef KMP_DEBUG
1906  {
1907  const char * buff;
1908  // create format specifiers before the debug output
1909  buff = __kmp_str_format(
1910  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1911  traits_t< UT >::spec, traits_t< UT >::spec );
1912  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1913  __kmp_str_free( &buff );
1914  }
1915  #endif
1916  } // if
1917  } else {
1918  *p_lb = 0;
1919  *p_ub = 0;
1920  if ( p_st != NULL )
1921  *p_st = 0;
1922  } // if
1923  } // case
1924  break;
1925 
1926  case kmp_sch_guided_analytical_chunked:
1927  {
1928  T chunkspec = pr->u.p.parm1;
1929  UT chunkIdx;
1930  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1931  /* for storing the original FPCW value for Windows* OS on
1932  IA-32 architecture (8-byte version) */
1933  unsigned int oldFpcw;
1934  unsigned int fpcwSet = 0;
1935  #endif
1936  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1937  gtid ) );
1938 
1939  trip = pr->u.p.tc;
1940 
1941  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1942  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1943 
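 // Descriptive note: analytical guided scheduling.  Threads atomically grab
 // consecutive chunk indices from sh->u.s.iteration; the boundaries of chunk
 // 'chunkIdx' are then computed analytically (geometric decay) via
 // __kmp_dispatch_guided_remaining(), using the factor stored in parm3 by
 // __kmp_dispatch_init.  Once chunkIdx reaches parm2, the remaining iterations
 // are handed out as fixed dynamic-style chunks of 'chunkspec' iterations.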
1944  while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1945  chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1946  if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1947  --trip;
1948  /* use dynamic-style scheduling */
1949  init = chunkIdx * chunkspec + pr->u.p.count;
1950  /* need to verify init > 0 in case of overflow in the above calculation */
1951  if ( (status = (init > 0 && init <= trip)) != 0 ) {
1952  limit = init + chunkspec -1;
1953 
1954  if ( (last = (limit >= trip)) != 0 )
1955  limit = trip;
1956  }
1957  break;
1958  } else {
1959  /* use exponential-style scheduling */
1960  /* The following check works around the lack of long double precision on Windows* OS,
1961  which can cause init != 0 for chunkIdx == 0.
1962  */
1963  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1964  /* If we haven't already done so, save original
1965  FPCW and set precision to 64-bit, as Windows* OS
1966  on IA-32 architecture defaults to 53-bit */
1967  if ( !fpcwSet ) {
1968  oldFpcw = _control87(0,0);
1969  _control87(_PC_64,_MCW_PC);
1970  fpcwSet = 0x30000;
1971  }
1972  #endif
1973  if ( chunkIdx ) {
1974  init = __kmp_dispatch_guided_remaining< T >(
1975  trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1976  KMP_DEBUG_ASSERT(init);
1977  init = trip - init;
1978  } else
1979  init = 0;
1980  limit = trip - __kmp_dispatch_guided_remaining< T >(
1981  trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1982  KMP_ASSERT(init <= limit);
1983  if ( init < limit ) {
1984  KMP_DEBUG_ASSERT(limit <= trip);
1985  --limit;
1986  status = 1;
1987  break;
1988  } // if
1989  } // if
1990  } // while (1)
1991  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1992  /* restore FPCW if necessary
1993  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1994  */
1995  if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1996  _control87(oldFpcw,_MCW_PC);
1997  #endif
1998  if ( status != 0 ) {
1999  start = pr->u.p.lb;
2000  incr = pr->u.p.st;
2001  if ( p_st != NULL )
2002  *p_st = incr;
2003  *p_lb = start + init * incr;
2004  *p_ub = start + limit * incr;
2005  if ( pr->ordered ) {
2006  pr->u.p.ordered_lower = init;
2007  pr->u.p.ordered_upper = limit;
2008  #ifdef KMP_DEBUG
2009  {
2010  const char * buff;
2011  // create format specifiers before the debug output
2012  buff = __kmp_str_format(
2013  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2014  traits_t< UT >::spec, traits_t< UT >::spec );
2015  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2016  __kmp_str_free( &buff );
2017  }
2018  #endif
2019  }
2020  } else {
2021  *p_lb = 0;
2022  *p_ub = 0;
2023  if ( p_st != NULL )
2024  *p_st = 0;
2025  }
2026  } // case
2027  break;
2028 
2029  case kmp_sch_trapezoidal:
2030  {
2031  UT index;
2032  T parm2 = pr->u.p.parm2;
2033  T parm3 = pr->u.p.parm3;
2034  T parm4 = pr->u.p.parm4;
2035  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2036  gtid ) );
2037 
2038  index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2039 
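 // Descriptive note: trapezoid self-scheduling.  Chunk sizes form a decreasing
 // arithmetic sequence (first chunk parm2, shrinking by parm4 per chunk, with
 // at most parm3 chunks, as prepared by __kmp_dispatch_init), so the sum of
 // the sizes of the first 'index' chunks -- i.e. the start of chunk 'index' --
 // has the closed form used below:
 //     init  = index   * (2*parm2 - (index-1)*parm4) / 2
 //     limit = (index+1)*(2*parm2 -  index   *parm4) / 2 - 1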
2040  init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2041  trip = pr->u.p.tc - 1;
2042 
2043  if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2044  *p_lb = 0;
2045  *p_ub = 0;
2046  if ( p_st != NULL ) *p_st = 0;
2047  } else {
2048  start = pr->u.p.lb;
2049  limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2050  incr = pr->u.p.st;
2051 
2052  if ( (last = (limit >= trip)) != 0 )
2053  limit = trip;
2054 
2055  if ( p_st != NULL ) *p_st = incr;
2056 
2057  if ( incr == 1 ) {
2058  *p_lb = start + init;
2059  *p_ub = start + limit;
2060  } else {
2061  *p_lb = start + init * incr;
2062  *p_ub = start + limit * incr;
2063  }
2064 
2065  if ( pr->ordered ) {
2066  pr->u.p.ordered_lower = init;
2067  pr->u.p.ordered_upper = limit;
2068  #ifdef KMP_DEBUG
2069  {
2070  const char * buff;
2071  // create format specifiers before the debug output
2072  buff = __kmp_str_format(
2073  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2074  traits_t< UT >::spec, traits_t< UT >::spec );
2075  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2076  __kmp_str_free( &buff );
2077  }
2078  #endif
2079  } // if
2080  } // if
2081  } // case
2082  break;
2083  default:
2084  {
2085  status = 0; // to avoid complaints on uninitialized variable use
2086  __kmp_msg(
2087  kmp_ms_fatal, // Severity
2088  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2089  KMP_HNT( GetNewerLibrary ), // Hint
2090  __kmp_msg_null // Variadic argument list terminator
2091  );
2092  }
2093  break;
2094  } // switch
2095  } // if tc == 0;
2096 
2097  if ( status == 0 ) {
2098  UT num_done;
2099 
2100  num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2101  #ifdef KMP_DEBUG
2102  {
2103  const char * buff;
2104  // create format specifiers before the debug output
2105  buff = __kmp_str_format(
2106  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2107  traits_t< UT >::spec );
2108  KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2109  __kmp_str_free( &buff );
2110  }
2111  #endif
2112 
2113  if ( (ST)num_done == team->t.t_nproc-1 ) {
2114  /* NOTE: release this buffer to be reused */
2115 
2116  KMP_MB(); /* Flush all pending memory write invalidates. */
2117 
2118  sh->u.s.num_done = 0;
2119  sh->u.s.iteration = 0;
2120 
2121  /* TODO replace with general release procedure? */
2122  if ( pr->ordered ) {
2123  sh->u.s.ordered_iteration = 0;
2124  }
2125 
2126  KMP_MB(); /* Flush all pending memory write invalidates. */
2127 
2128  sh -> buffer_index += KMP_MAX_DISP_BUF;
2129  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2130  gtid, sh->buffer_index) );
2131 
2132  KMP_MB(); /* Flush all pending memory write invalidates. */
2133 
2134  } // if
2135  if ( __kmp_env_consistency_check ) {
2136  if ( pr->pushed_ws != ct_none ) {
2137  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2138  }
2139  }
2140 
2141  th -> th.th_dispatch -> th_deo_fcn = NULL;
2142  th -> th.th_dispatch -> th_dxo_fcn = NULL;
2143  th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2144  th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2145  } // if (status == 0)
2146 #if KMP_OS_WINDOWS
2147  else if ( last ) {
2148  pr->u.p.last_upper = pr->u.p.ub;
2149  }
2150 #endif /* KMP_OS_WINDOWS */
2151  if ( p_last != NULL && status != 0 )
2152  *p_last = last;
2153  } // if
2154 
2155  #ifdef KMP_DEBUG
2156  {
2157  const char * buff;
2158  // create format specifiers before the debug output
2159  buff = __kmp_str_format(
2160  "__kmp_dispatch_next: T#%%d normal case: " \
2161  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2162  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2163  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2164  __kmp_str_free( &buff );
2165  }
2166  #endif
2167 #if INCLUDE_SSC_MARKS
2168  SSC_MARK_DISPATCH_NEXT();
2169 #endif
2170  OMPT_LOOP_END;
2171  return status;
2172 }
2173 
2174 template< typename T >
2175 static void
2176 __kmp_dist_get_bounds(
2177  ident_t *loc,
2178  kmp_int32 gtid,
2179  kmp_int32 *plastiter,
2180  T *plower,
2181  T *pupper,
2182  typename traits_t< T >::signed_t incr
2183 ) {
2184  KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2185  typedef typename traits_t< T >::unsigned_t UT;
2186  typedef typename traits_t< T >::signed_t ST;
2187  register kmp_uint32 team_id;
2188  register kmp_uint32 nteams;
2189  register UT trip_count;
2190  register kmp_team_t *team;
2191  kmp_info_t * th;
2192 
2193  KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2194  KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2195  #ifdef KMP_DEBUG
2196  {
2197  const char * buff;
2198  // create format specifiers before the debug output
2199  buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2200  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2201  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2202  traits_t< T >::spec );
2203  KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2204  __kmp_str_free( &buff );
2205  }
2206  #endif
2207 
2208  if( __kmp_env_consistency_check ) {
2209  if( incr == 0 ) {
2210  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2211  }
2212  if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2213  // The loop is illegal.
2214  // Some zero-trip loops maintained by compiler, e.g.:
2215  // for(i=10;i<0;++i) // lower >= upper - run-time check
2216  // for(i=0;i>10;--i) // lower <= upper - run-time check
2217  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2218  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2219  // Compiler does not check the following illegal loops:
2220  // for(i=0;i<10;i+=incr) // where incr<0
2221  // for(i=10;i>0;i-=incr) // where incr<0
2222  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2223  }
2224  }
2225  th = __kmp_threads[gtid];
2226  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2227  team = th->th.th_team;
2228  #if OMP_40_ENABLED
2229  nteams = th->th.th_teams_size.nteams;
2230  #endif
2231  team_id = team->t.t_master_tid;
2232  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2233 
2234  // compute global trip count
2235  if( incr == 1 ) {
2236  trip_count = *pupper - *plower + 1;
2237  } else if(incr == -1) {
2238  trip_count = *plower - *pupper + 1;
2239  } else {
2240  trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2241  }
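 // Descriptive note / worked example (illustrative numbers): the global
 // iteration space is split statically across the teams.  E.g. for
 // trip_count = 10, nteams = 4, incr = 1 the balanced split below gives
 // chunk = 2, extras = 2, so teams 0..3 receive 3, 3, 2, 2 iterations
 // respectively; the greedy split would instead give ceil(10/4) = 3 iterations
 // to teams 0..2 and the remaining single iteration to team 3.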
2242  if( trip_count <= nteams ) {
2243  KMP_DEBUG_ASSERT(
2244  __kmp_static == kmp_sch_static_greedy || \
2245  __kmp_static == kmp_sch_static_balanced
2246  ); // Unknown static scheduling type.
2247  // only some teams get single iteration, others get nothing
2248  if( team_id < trip_count ) {
2249  *pupper = *plower = *plower + team_id * incr;
2250  } else {
2251  *plower = *pupper + incr; // zero-trip loop
2252  }
2253  if( plastiter != NULL )
2254  *plastiter = ( team_id == trip_count - 1 );
2255  } else {
2256  if( __kmp_static == kmp_sch_static_balanced ) {
2257  register UT chunk = trip_count / nteams;
2258  register UT extras = trip_count % nteams;
2259  *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2260  *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2261  if( plastiter != NULL )
2262  *plastiter = ( team_id == nteams - 1 );
2263  } else {
2264  register T chunk_inc_count =
2265  ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2266  register T upper = *pupper;
2267  KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2268  // Unknown static scheduling type.
2269  *plower += team_id * chunk_inc_count;
2270  *pupper = *plower + chunk_inc_count - incr;
2271  // Check/correct bounds if needed
2272  if( incr > 0 ) {
2273  if( *pupper < *plower )
2274  *pupper = i_maxmin< T >::mx;
2275  if( plastiter != NULL )
2276  *plastiter = *plower <= upper && *pupper > upper - incr;
2277  if( *pupper > upper )
2278  *pupper = upper; // tracker C73258
2279  } else {
2280  if( *pupper > *plower )
2281  *pupper = i_maxmin< T >::mn;
2282  if( plastiter != NULL )
2283  *plastiter = *plower >= upper && *pupper < upper - incr;
2284  if( *pupper < upper )
2285  *pupper = upper; // tracker C73258
2286  }
2287  }
2288  }
2289 }
2290 
2291 //-----------------------------------------------------------------------------------------
2292 // Dispatch routines
2293 // Transfer call to template< type T >
2294 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2295 // T lb, T ub, ST st, ST chunk )
2296 extern "C" {
2297 
2313 void
2314 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2315  kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2316 {
2317  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2318  KMP_DEBUG_ASSERT( __kmp_init_serial );
2319  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2320 }
2324 void
2325 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2326  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2327 {
2328  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2329  KMP_DEBUG_ASSERT( __kmp_init_serial );
2330  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2331 }
2332 
2336 void
2337 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2338  kmp_int64 lb, kmp_int64 ub,
2339  kmp_int64 st, kmp_int64 chunk )
2340 {
2341  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2342  KMP_DEBUG_ASSERT( __kmp_init_serial );
2343  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2344 }
2345 
2349 void
2350 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2351  kmp_uint64 lb, kmp_uint64 ub,
2352  kmp_int64 st, kmp_int64 chunk )
2353 {
2354  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2355  KMP_DEBUG_ASSERT( __kmp_init_serial );
2356  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2357 }
2358 
2368 void
2369 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2370  kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2371 {
2372  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2373  KMP_DEBUG_ASSERT( __kmp_init_serial );
2374  __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2375  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2376 }
2377 
2378 void
2379 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2380  kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2381 {
2382  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2383  KMP_DEBUG_ASSERT( __kmp_init_serial );
2384  __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2385  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2386 }
2387 
2388 void
2389 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2390  kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2391 {
2392  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2393  KMP_DEBUG_ASSERT( __kmp_init_serial );
2394  __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2395  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2396 }
2397 
2398 void
2399 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2400  kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2401 {
2402  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2403  KMP_DEBUG_ASSERT( __kmp_init_serial );
2404  __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2405  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2406 }
2407 
2420 int
2421 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2422  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2423 {
2424  return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2425 }
2426 
2430 int
2431 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2432  kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2433 {
2434  return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2435 }
2436 
2440 int
2441 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2442  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2443 {
2444  return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2445 }
2446 
2450 int
2451 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2452  kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2453 {
2454  return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2455 }
2456 
2463 void
2464 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2465 {
2466  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2467 }
2468 
2472 void
2473 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2474 {
2475  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2476 }
2477 
2481 void
2482 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2483 {
2484  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2485 }
2486 
2490 void
2491 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2492 {
2493  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2494 }
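/*
 * Usage sketch (illustrative only, not part of this file): the calling pattern
 * a compiler typically generates for a dynamically scheduled worksharing loop
 * such as "#pragma omp for schedule(dynamic, 4)" over i = 0 .. n-1, expressed
 * with the 32-bit signed entry points above.  The 'loc' argument and the loop
 * body callback are hypothetical placeholders; the sketch assumes it runs
 * inside a parallel region with the runtime's headers available, and that the
 * loop is not 'ordered', so __kmpc_dispatch_fini_4() is never needed.
 */
#if 0   // illustrative only; never compiled
static void run_dynamic_loop_sketch( ident_t *loc, kmp_int32 n,
                                     void (*body)( kmp_int32 ) )
{
    kmp_int32 gtid = __kmpc_global_thread_num( loc );
    kmp_int32 lb, ub, st, last;

    // every thread initializes its own dispatch state; bounds are inclusive
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
                            0, n - 1, 1, /* chunk */ 4 );

    // each successful next() hands back one chunk [lb, ub] with stride st
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st )
            body( i );
    }
}
#endif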
2497 //-----------------------------------------------------------------------------------------
2498 // Non-template routines from kmp_dispatch.cpp used in other sources
2499 
2500 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2501  return value == checker;
2502 }
2503 
2504 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2505  return value != checker;
2506 }
2507 
2508 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2509  return value < checker;
2510 }
2511 
2512 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2513  return value >= checker;
2514 }
2515 
2516 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2517  return value <= checker;
2518 }
2519 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2520  return value == checker;
2521 }
2522 
2523 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2524  return value != checker;
2525 }
2526 
2527 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2528  return value < checker;
2529 }
2530 
2531 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2532  return value >= checker;
2533 }
2534 
2535 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2536  return value <= checker;
2537 }
2538 
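// Descriptive note: the two helpers below spin until pred(*spinner, checker)
// becomes true, yielding the processor when the machine is oversubscribed or
// after a bounded number of spins, and return the last value read from the
// spin location.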
2539 kmp_uint32
2540 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2541  kmp_uint32 checker,
2542  kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2543  , void * obj // Higher-level synchronization object, or NULL.
2544  )
2545 {
2546  // note: we may not belong to a team at this point
2547  register volatile kmp_uint32 * spin = spinner;
2548  register kmp_uint32 check = checker;
2549  register kmp_uint32 spins;
2550  register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2551  register kmp_uint32 r;
2552 
2553  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2554  KMP_INIT_YIELD( spins );
2555  // main wait spin loop
2556  while(!f(r = TCR_4(*spin), check)) {
2557  KMP_FSYNC_SPIN_PREPARE( obj );
2558  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2559  It causes problems with infinite recursion because of exit lock */
2560  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2561  __kmp_abort_thread(); */
2562 
2563  /* if we have waited a bit, or are oversubscribed, yield */
2564  /* pause is in the following code */
2565  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2566  KMP_YIELD_SPIN( spins );
2567  }
2568  KMP_FSYNC_SPIN_ACQUIRED( obj );
2569  return r;
2570 }
2571 
2572 kmp_uint64
2573 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2574  kmp_uint64 checker,
2575  kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2576  , void * obj // Higher-level synchronization object, or NULL.
2577  )
2578 {
2579  // note: we may not belong to a team at this point
2580  register volatile kmp_uint64 * spin = spinner;
2581  register kmp_uint64 check = checker;
2582  register kmp_uint32 spins;
2583  register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2584  register kmp_uint64 r;
2585 
2586  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2587  KMP_INIT_YIELD( spins );
2588  // main wait spin loop
2589  while(!f(r = *spin, check))
2590  {
2591  KMP_FSYNC_SPIN_PREPARE( obj );
2592  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2593  It causes problems with infinite recursion because of exit lock */
2594  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2595  __kmp_abort_thread(); */
2596 
2597  // if we are oversubscribed,
2598  // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2599  // pause is in the following code
2600  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2601  KMP_YIELD_SPIN( spins );
2602  }
2603  KMP_FSYNC_SPIN_ACQUIRED( obj );
2604  return r;
2605 }
2606 
2607 } // extern "C"
2608 
2609 #ifdef KMP_GOMP_COMPAT
2610 
2611 void
2612 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2613  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2614  kmp_int32 chunk, int push_ws )
2615 {
2616  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2617  push_ws );
2618 }
2619 
2620 void
2621 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2622  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2623  kmp_int32 chunk, int push_ws )
2624 {
2625  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2626  push_ws );
2627 }
2628 
2629 void
2630 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2631  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2632  kmp_int64 chunk, int push_ws )
2633 {
2634  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2635  push_ws );
2636 }
2637 
2638 void
2639 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2640  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2641  kmp_int64 chunk, int push_ws )
2642 {
2643  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2644  push_ws );
2645 }
2646 
2647 void
2648 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2649 {
2650  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2651 }
2652 
2653 void
2654 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2655 {
2656  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2657 }
2658 
2659 void
2660 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2661 {
2662  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2663 }
2664 
2665 void
2666 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2667 {
2668  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2669 }
2670 
2671 #endif /* KMP_GOMP_COMPAT */
2672 
2673 /* ------------------------------------------------------------------------ */
2674 /* ------------------------------------------------------------------------ */
2675 