Intel® OpenMP* Runtime Library
kmp_taskq.c
1 /*
2  * kmp_taskq.c -- TASKQ support for OpenMP.
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 #include "kmp.h"
36 #include "kmp_i18n.h"
37 #include "kmp_io.h"
38 #include "kmp_error.h"
39 
40 #define MAX_MESSAGE 512
41 
42 /* ------------------------------------------------------------------------ */
43 /* ------------------------------------------------------------------------ */
44 
45 /*
46  * Taskq routines and global variables
47  */
48 
49 #define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);
50 
51 #define THREAD_ALLOC_FOR_TASKQ
52 
53 static int
54 in_parallel_context( kmp_team_t *team )
55 {
56  return ! team -> t.t_serialized;
57 }
58 
59 static void
60 __kmp_taskq_eo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
61 {
62  int gtid = *gtid_ref;
63  int tid = __kmp_tid_from_gtid( gtid );
64  kmp_uint32 spins;
65  kmp_uint32 my_token;
66  kmpc_task_queue_t *taskq;
67  kmp_taskq_t *tq = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
68 
69  if ( __kmp_env_consistency_check )
70 #if KMP_USE_DYNAMIC_LOCK
71  __kmp_push_sync( gtid, ct_ordered_in_taskq, loc_ref, NULL, 0 );
72 #else
73  __kmp_push_sync( gtid, ct_ordered_in_taskq, loc_ref, NULL );
74 #endif
75 
76  if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
77  KMP_MB(); /* Flush all pending memory write invalidates. */
78 
79  /* GEH - need check here under stats to make sure */
80  /* inside task (curr_thunk[tid] != NULL) */
81 
82  my_token = tq->tq_curr_thunk[ tid ]-> th_tasknum;
83 
84  taskq = tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue;
85 
86  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
87  KMP_MB();
88  }
89 }
90 
91 static void
92 __kmp_taskq_xo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
93 {
94  int gtid = *gtid_ref;
95  int tid = __kmp_tid_from_gtid( gtid );
96  kmp_uint32 my_token;
97  kmp_taskq_t *tq = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
98 
99  if ( __kmp_env_consistency_check )
100  __kmp_pop_sync( gtid, ct_ordered_in_taskq, loc_ref );
101 
102  if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
103  KMP_MB(); /* Flush all pending memory write invalidates. */
104 
105  /* GEH - need check here under stats to make sure */
106  /* inside task (curr_thunk[tid] != NULL) */
107 
108  my_token = tq->tq_curr_thunk[ tid ]->th_tasknum;
109 
110  KMP_MB(); /* Flush all pending memory write invalidates. */
111 
112  tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue -> tq_tasknum_serving = my_token + 1;
113 
114  KMP_MB(); /* Flush all pending memory write invalidates. */
115  }
116 }
117 
118 static void
119 __kmp_taskq_check_ordered( kmp_int32 gtid, kmpc_thunk_t *thunk )
120 {
121  kmp_uint32 spins;
122  kmp_uint32 my_token;
123  kmpc_task_queue_t *taskq;
124 
125  /* assume we are always called from an active parallel context */
126 
127  KMP_MB(); /* Flush all pending memory write invalidates. */
128 
129  my_token = thunk -> th_tasknum;
130 
131  taskq = thunk -> th.th_shareds -> sv_queue;
132 
133  if(taskq->tq_tasknum_serving <= my_token) {
134  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
135  KMP_MB();
136  taskq->tq_tasknum_serving = my_token + 1;
137  KMP_MB();
138  }
139 }
140 
141 static void
142 __kmp_dump_TQF(kmp_int32 flags)
143 {
144  if (flags & TQF_IS_ORDERED)
145  __kmp_printf("ORDERED ");
146  if (flags & TQF_IS_LASTPRIVATE)
147  __kmp_printf("LAST_PRIV ");
148  if (flags & TQF_IS_NOWAIT)
149  __kmp_printf("NOWAIT ");
150  if (flags & TQF_HEURISTICS)
151  __kmp_printf("HEURIST ");
152  if (flags & TQF_INTERFACE_RESERVED1)
153  __kmp_printf("RESERV1 ");
154  if (flags & TQF_INTERFACE_RESERVED2)
155  __kmp_printf("RESERV2 ");
156  if (flags & TQF_INTERFACE_RESERVED3)
157  __kmp_printf("RESERV3 ");
158  if (flags & TQF_INTERFACE_RESERVED4)
159  __kmp_printf("RESERV4 ");
160  if (flags & TQF_IS_LAST_TASK)
161  __kmp_printf("LAST_TASK ");
162  if (flags & TQF_TASKQ_TASK)
163  __kmp_printf("TASKQ_TASK ");
164  if (flags & TQF_RELEASE_WORKERS)
165  __kmp_printf("RELEASE ");
166  if (flags & TQF_ALL_TASKS_QUEUED)
167  __kmp_printf("ALL_QUEUED ");
168  if (flags & TQF_PARALLEL_CONTEXT)
169  __kmp_printf("PARALLEL ");
170  if (flags & TQF_DEALLOCATED)
171  __kmp_printf("DEALLOC ");
172  if (!(flags & (TQF_INTERNAL_FLAGS|TQF_INTERFACE_FLAGS)))
173  __kmp_printf("(NONE)");
174 }
175 
176 static void
177 __kmp_dump_thunk( kmp_taskq_t *tq, kmpc_thunk_t *thunk, kmp_int32 global_tid )
178 {
179  int i;
180  int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
181 
182  __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid);
183 
184  if (thunk != NULL) {
185  for (i = 0; i < nproc; i++) {
186  if( tq->tq_curr_thunk[i] == thunk ) {
187  __kmp_printf("[%i] ", i);
188  }
189  }
190  __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
191  __kmp_printf("th_task=%p, ", thunk->th_task);
192  __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
193  __kmp_printf("th_status=%d, ", thunk->th_status);
194  __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
195  __kmp_printf("th_flags="); __kmp_dump_TQF(thunk->th_flags);
196  }
197 
198  __kmp_printf("\n");
199 }
200 
201 static void
202 __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num)
203 {
204  kmpc_thunk_t *th;
205 
206  __kmp_printf(" Thunk stack for T#%d: ", thread_num);
207 
208  for (th = thunk; th != NULL; th = th->th_encl_thunk )
209  __kmp_printf("%p ", th);
210 
211  __kmp_printf("\n");
212 }
213 
214 static void
215 __kmp_dump_task_queue( kmp_taskq_t *tq, kmpc_task_queue_t *queue, kmp_int32 global_tid )
216 {
217  int qs, count, i;
218  kmpc_thunk_t *thunk;
219  kmpc_task_queue_t *taskq;
220 
221  __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
222 
223  if (queue != NULL) {
224  int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
225 
226  if ( __kmp_env_consistency_check ) {
227  __kmp_printf(" tq_loc : ");
228  }
229  if (in_parallel) {
230 
231  //if (queue->tq.tq_parent != 0)
232  //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
233 
234  //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
235 
236  KMP_MB(); /* make sure data structures are in consistent state before querying them */
237  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
238 
239  __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent);
240  __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child);
241  __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child);
242  __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child);
243  __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count);
244 
245  //__kmp_release_lock(& queue->tq_link_lck, global_tid);
246 
247  //if (queue->tq.tq_parent != 0)
248  //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
249 
250  //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
251  //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
252 
253  KMP_MB(); /* make sure data structures are in consistent state before querying them */
254  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
255  }
256 
257  __kmp_printf(" tq_shareds : ");
258  for (i=0; i<((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
259  __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
260  __kmp_printf("\n");
261 
262  if (in_parallel) {
263  __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
264  __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
265  }
266 
267  __kmp_printf(" tq_queue : %p\n", queue->tq_queue);
268  __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space);
269  __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot);
270 
271  __kmp_printf(" tq_free_thunks : ");
272  for (thunk = queue->tq_free_thunks; thunk != NULL; thunk = thunk->th.th_next_free )
273  __kmp_printf("%p ", thunk);
274  __kmp_printf("\n");
275 
276  __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots);
277  __kmp_printf(" tq_head : %d\n", queue->tq_head);
278  __kmp_printf(" tq_tail : %d\n", queue->tq_tail);
279  __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull);
280  __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat);
281  __kmp_printf(" tq_flags : "); __kmp_dump_TQF(queue->tq_flags);
282  __kmp_printf("\n");
283 
284  if (in_parallel) {
285  __kmp_printf(" tq_th_thunks : ");
286  for (i = 0; i < queue->tq_nproc; i++) {
287  __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
288  }
289  __kmp_printf("\n");
290  }
291 
292  __kmp_printf("\n");
293  __kmp_printf(" Queue slots:\n");
294 
295 
296  qs = queue->tq_tail;
297  for ( count = 0; count < queue->tq_nfull; ++count ) {
298  __kmp_printf("(%d)", qs);
299  __kmp_dump_thunk( tq, queue->tq_queue[qs].qs_thunk, global_tid );
300  qs = (qs+1) % queue->tq_nslots;
301  }
302 
303  __kmp_printf("\n");
304 
305  if (in_parallel) {
306  if (queue->tq_taskq_slot != NULL) {
307  __kmp_printf(" TaskQ slot:\n");
308  __kmp_dump_thunk( tq, (kmpc_thunk_t *) queue->tq_taskq_slot, global_tid );
309  __kmp_printf("\n");
310  }
311  //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
312  //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
313  }
314  }
315 
316  __kmp_printf(" Taskq freelist: ");
317 
318  //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
319 
320  KMP_MB(); /* make sure data structures are in consistent state before querying them */
321  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
322 
323  for( taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free )
324  __kmp_printf("%p ", taskq);
325 
326  //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
327 
328  __kmp_printf("\n\n");
329 }
330 
331 static void
332 __kmp_aux_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *curr_queue, kmp_int32 level, kmp_int32 global_tid )
333 {
334  int i, count, qs;
335  int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
336  kmpc_task_queue_t *queue = curr_queue;
337 
338  if (curr_queue == NULL)
339  return;
340 
341  __kmp_printf(" ");
342 
343  for (i=0; i<level; i++)
344  __kmp_printf(" ");
345 
346  __kmp_printf("%p", curr_queue);
347 
348  for (i = 0; i < nproc; i++) {
349  if( tq->tq_curr_thunk[i] && tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue ) {
350  __kmp_printf(" [%i]", i);
351  }
352  }
353 
354  __kmp_printf(":");
355 
356  //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
357 
358  KMP_MB(); /* make sure data structures are in consistent state before querying them */
359  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
360 
361  qs = curr_queue->tq_tail;
362 
363  for ( count = 0; count < curr_queue->tq_nfull; ++count ) {
364  __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
365  qs = (qs+1) % curr_queue->tq_nslots;
366  }
367 
368  //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
369 
370  __kmp_printf("\n");
371 
372  if (curr_queue->tq_first_child) {
373  //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
374 
375  KMP_MB(); /* make sure data structures are in consistent state before querying them */
376  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
377 
378  if (curr_queue->tq_first_child) {
379  for(queue = (kmpc_task_queue_t *)curr_queue->tq_first_child;
380  queue != NULL;
381  queue = queue->tq_next_child) {
382  __kmp_aux_dump_task_queue_tree( tq, queue, level+1, global_tid );
383  }
384  }
385 
386  //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
387  }
388 }
389 
390 static void
391 __kmp_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *tqroot, kmp_int32 global_tid)
392 {
393  __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
394 
395  __kmp_aux_dump_task_queue_tree( tq, tqroot, 0, global_tid );
396 
397  __kmp_printf("\n");
398 }
399 
400 /* --------------------------------------------------------------------------- */
401 
402 /*
403  New taskq storage routines that try to minimize overhead of mallocs but
404  still provide cache line alignment.
405 */
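/*
 * Illustrative sketch (not part of the runtime): the same align-and-stash
 * pattern that __kmp_taskq_allocate / __kmpc_taskq_free below implement,
 * written standalone with plain malloc/free instead of the per-thread
 * allocator.  The names aligned_alloc_demo / aligned_free_demo and the
 * 64-byte DEMO_CACHE_LINE are assumptions for this sketch only.  As in the
 * runtime routine, the block handed back to the caller begins one pointer
 * word past a cache-line boundary; that word holds the pointer originally
 * returned by malloc so it can be recovered on free.
 */
#include <stdlib.h>
#include <stdint.h>

#define DEMO_CACHE_LINE 64

static void *
aligned_alloc_demo(size_t size)
{
    /* worst case: one stashed pointer plus a full line of alignment padding */
    void *orig = malloc(sizeof(void *) + DEMO_CACHE_LINE + size);
    void *addr = orig;

    if (orig == NULL)
        return NULL;

    /* round up to the next cache-line boundary if not already aligned */
    if (((uintptr_t) addr & (DEMO_CACHE_LINE - 1)) != 0)
        addr = (void *) (((uintptr_t) addr + DEMO_CACHE_LINE) &
                         ~((uintptr_t) (DEMO_CACHE_LINE - 1)));

    *(void **) addr = orig;        /* stash the original pointer             */
    return ((void **) addr) + 1;   /* caller's storage starts one word later */
}

static void
aligned_free_demo(void *p)
{
    free(*(((void **) p) - 1));    /* recover the stashed pointer and free it */
}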
406 
407 
408 static void *
409 __kmp_taskq_allocate(size_t size, kmp_int32 global_tid)
410 {
411  void *addr, *orig_addr;
412  size_t bytes;
413 
414  KB_TRACE( 5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int) size, global_tid ) );
415 
416  bytes = sizeof(void *) + CACHE_LINE + size;
417 
418 #ifdef THREAD_ALLOC_FOR_TASKQ
419  orig_addr = (void *) __kmp_thread_malloc( __kmp_thread_from_gtid(global_tid), bytes );
420 #else
421  KE_TRACE( 10, ("%%%%%% MALLOC( %d )\n", bytes ) );
422  orig_addr = (void *) KMP_INTERNAL_MALLOC( bytes );
423 #endif /* THREAD_ALLOC_FOR_TASKQ */
424 
425  if (orig_addr == 0)
426  KMP_FATAL( OutOfHeapMemory );
427 
428  addr = orig_addr;
429 
430  if (((kmp_uintptr_t) addr & ( CACHE_LINE - 1 )) != 0) {
431  KB_TRACE( 50, ("__kmp_taskq_allocate: adjust for cache alignment\n" ) );
432  addr = (void *) (((kmp_uintptr_t) addr + CACHE_LINE) & ~( CACHE_LINE - 1 ));
433  }
434 
435  (* (void **) addr) = orig_addr;
436 
437  KB_TRACE( 10, ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, gtid: %d\n",
438  orig_addr, ((void **) addr) + 1, ((char *)(((void **) addr) + 1)) + size-1,
439  (int) size, global_tid ));
440 
441  return ( ((void **) addr) + 1 );
442 }
443 
444 static void
445 __kmpc_taskq_free(void *p, kmp_int32 global_tid)
446 {
447  KB_TRACE( 5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid ) );
448 
449  KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n", (*( ((void **) p)-1)), global_tid ));
450 
451 #ifdef THREAD_ALLOC_FOR_TASKQ
452  __kmp_thread_free( __kmp_thread_from_gtid(global_tid), *( ((void **) p)-1) );
453 #else
454  KMP_INTERNAL_FREE( *( ((void **) p)-1) );
455 #endif /* THREAD_ALLOC_FOR_TASKQ */
456 }
457 
458 /* --------------------------------------------------------------------------- */
459 
460 /*
461  * Keep freed kmpc_task_queue_t on an internal freelist and recycle since
462  * they're of constant size.
463  */
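/*
 * Illustrative sketch (not part of the runtime): the constant-size freelist
 * idiom that __kmp_alloc_taskq / __kmp_free_taskq below use for recycling
 * kmpc_task_queue_t structures, shown with a hypothetical node type and no
 * locking.  The runtime keeps the list head in tq->tq_freelist, links nodes
 * through tq.tq_next_free, and protects both with tq_freelist_lck.
 */
#include <stdlib.h>

typedef struct demo_node {
    struct demo_node *next_free;   /* plays the role of tq.tq_next_free */
    /* ... fixed-size payload ... */
} demo_node_t;

static demo_node_t *demo_freelist = NULL;  /* plays the role of tq_freelist */

static demo_node_t *
demo_node_alloc(void)
{
    demo_node_t *n = demo_freelist;

    if (n != NULL)
        demo_freelist = n->next_free;                     /* recycle a freed node */
    else
        n = (demo_node_t *) malloc(sizeof(demo_node_t));  /* or allocate a new one */

    return n;
}

static void
demo_node_free(demo_node_t *n)
{
    n->next_free = demo_freelist;              /* push back on the freelist */
    demo_freelist = n;
}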
464 
465 static kmpc_task_queue_t *
466 __kmp_alloc_taskq ( kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots, kmp_int32 nthunks,
467  kmp_int32 nshareds, kmp_int32 nproc, size_t sizeof_thunk,
468  size_t sizeof_shareds, kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid )
469 {
470  kmp_int32 i;
471  size_t bytes;
472  kmpc_task_queue_t *new_queue;
473  kmpc_aligned_shared_vars_t *shared_var_array;
474  char *shared_var_storage;
475  char *pt; /* for doing byte-adjusted address computations */
476 
477  __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
478 
479  KMP_MB(); /* make sure data structures are in consistent state before querying them */
480  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
481 
482  if( tq->tq_freelist ) {
483  new_queue = tq -> tq_freelist;
484  tq -> tq_freelist = tq -> tq_freelist -> tq.tq_next_free;
485 
486  KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
487 
488  new_queue->tq_flags = 0;
489 
490  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
491  }
492  else {
493  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
494 
495  new_queue = (kmpc_task_queue_t *) __kmp_taskq_allocate (sizeof (kmpc_task_queue_t), global_tid);
496  new_queue->tq_flags = 0;
497  }
498 
499  /* space in the task queue for queue slots (allocate as one big chunk */
500  /* of storage including new_taskq_task space) */
501 
502  sizeof_thunk += (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */
503  pt = (char *) __kmp_taskq_allocate (nthunks * sizeof_thunk, global_tid);
504  new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
505  *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
506 
507  /* chain the allocated thunks into a freelist for this queue */
508 
509  new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
510 
511  for (i = 0; i < (nthunks - 2); i++) {
512  ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th.th_next_free = (kmpc_thunk_t *)(pt + (i+1)*sizeof_thunk);
513 #ifdef KMP_DEBUG
514  ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
515 #endif
516  }
517 
518  ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th.th_next_free = NULL;
519 #ifdef KMP_DEBUG
520  ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
521 #endif
522 
523  /* initialize the locks */
524 
525  if (in_parallel) {
526  __kmp_init_lock( & new_queue->tq_link_lck );
527  __kmp_init_lock( & new_queue->tq_free_thunks_lck );
528  __kmp_init_lock( & new_queue->tq_queue_lck );
529  }
530 
531  /* now allocate the slots */
532 
533  bytes = nslots * sizeof (kmpc_aligned_queue_slot_t);
534  new_queue->tq_queue = (kmpc_aligned_queue_slot_t *) __kmp_taskq_allocate( bytes, global_tid );
535 
536  /* space for array of pointers to shared variable structures */
537  sizeof_shareds += sizeof(kmpc_task_queue_t *);
538  sizeof_shareds += (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */
539 
540  bytes = nshareds * sizeof (kmpc_aligned_shared_vars_t);
541  shared_var_array = (kmpc_aligned_shared_vars_t *) __kmp_taskq_allocate ( bytes, global_tid);
542 
543  bytes = nshareds * sizeof_shareds;
544  shared_var_storage = (char *) __kmp_taskq_allocate ( bytes, global_tid);
545 
546  for (i=0; i<nshareds; i++) {
547  shared_var_array[i].ai_data = (kmpc_shared_vars_t *) (shared_var_storage + i*sizeof_shareds);
548  shared_var_array[i].ai_data->sv_queue = new_queue;
549  }
550  new_queue->tq_shareds = shared_var_array;
551 
552 
553  /* array for number of outstanding thunks per thread */
554 
555  if (in_parallel) {
556  bytes = nproc * sizeof(kmpc_aligned_int32_t);
557  new_queue->tq_th_thunks = (kmpc_aligned_int32_t *) __kmp_taskq_allocate ( bytes, global_tid);
558  new_queue->tq_nproc = nproc;
559 
560  for (i=0; i<nproc; i++)
561  new_queue->tq_th_thunks[i].ai_data = 0;
562  }
563 
564  return new_queue;
565 }
566 
567 static void
568 __kmp_free_taskq (kmp_taskq_t *tq, kmpc_task_queue_t *p, int in_parallel, kmp_int32 global_tid)
569 {
570  __kmpc_taskq_free(p->tq_thunk_space, global_tid);
571  __kmpc_taskq_free(p->tq_queue, global_tid);
572 
573  /* free shared var structure storage */
574  __kmpc_taskq_free((void *) p->tq_shareds[0].ai_data, global_tid);
575 
576  /* free array of pointers to shared vars storage */
577  __kmpc_taskq_free(p->tq_shareds, global_tid);
578 
579 #ifdef KMP_DEBUG
580  p->tq_first_child = NULL;
581  p->tq_next_child = NULL;
582  p->tq_prev_child = NULL;
583  p->tq_ref_count = -10;
584  p->tq_shareds = NULL;
585  p->tq_tasknum_queuing = 0;
586  p->tq_tasknum_serving = 0;
587  p->tq_queue = NULL;
588  p->tq_thunk_space = NULL;
589  p->tq_taskq_slot = NULL;
590  p->tq_free_thunks = NULL;
591  p->tq_nslots = 0;
592  p->tq_head = 0;
593  p->tq_tail = 0;
594  p->tq_nfull = 0;
595  p->tq_hiwat = 0;
596 
597  if (in_parallel) {
598  int i;
599 
600  for (i=0; i<p->tq_nproc; i++)
601  p->tq_th_thunks[i].ai_data = 0;
602  }
603  if ( __kmp_env_consistency_check )
604  p->tq_loc = NULL;
605  KMP_DEBUG_ASSERT( p->tq_flags & TQF_DEALLOCATED );
606  p->tq_flags = TQF_DEALLOCATED;
607 #endif /* KMP_DEBUG */
608 
609  if (in_parallel) {
610  __kmpc_taskq_free(p->tq_th_thunks, global_tid);
611  __kmp_destroy_lock(& p->tq_link_lck);
612  __kmp_destroy_lock(& p->tq_queue_lck);
613  __kmp_destroy_lock(& p->tq_free_thunks_lck);
614  }
615 #ifdef KMP_DEBUG
616  p->tq_th_thunks = NULL;
617 #endif /* KMP_DEBUG */
618 
619  KMP_MB(); /* make sure data structures are in consistent state before querying them */
620  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
621 
622  __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
623  p->tq.tq_next_free = tq->tq_freelist;
624 
625  tq->tq_freelist = p;
626  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
627 }
628 
629 /*
630  * Once a group of thunks has been allocated for use in a particular queue,
631  * these are managed via a per-queue freelist.
632  * We force a check that there's always a thunk free if we need one.
633  */
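/*
 * Why the KMP_DEBUG_ASSERT below should never fire (inferred from this file):
 * for a parallel queue, __kmpc_taskq sizes the thunk pool as
 * nslots + nproc * __KMP_TASKQ_THUNKS_PER_TH + 1, where the extra thunk is
 * the taskq-task thunk that never enters this freelist (the consistency
 * assertion in __kmp_remove_queue_from_tree checks for exactly
 * nslots + nproc * __KMP_TASKQ_THUNKS_PER_TH free thunks).  At most nslots
 * thunks can sit in the queue at once, and __kmp_dequeue_task asserts that a
 * thread never holds more than __KMP_TASKQ_THUNKS_PER_TH outstanding thunks,
 * so a free thunk should always be available here.
 */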
634 
635 static kmpc_thunk_t *
636 __kmp_alloc_thunk (kmpc_task_queue_t *queue, int in_parallel, kmp_int32 global_tid)
637 {
638  kmpc_thunk_t *fl;
639 
640  if (in_parallel) {
641  __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
642 
643  KMP_MB(); /* make sure data structures are in consistent state before querying them */
644  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
645  }
646 
647  fl = queue->tq_free_thunks;
648 
649  KMP_DEBUG_ASSERT (fl != NULL);
650 
651  queue->tq_free_thunks = fl->th.th_next_free;
652  fl->th_flags = 0;
653 
654  if (in_parallel)
655  __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
656 
657  return fl;
658 }
659 
660 static void
661 __kmp_free_thunk (kmpc_task_queue_t *queue, kmpc_thunk_t *p, int in_parallel, kmp_int32 global_tid)
662 {
663 #ifdef KMP_DEBUG
664  p->th_task = 0;
665  p->th_encl_thunk = 0;
666  p->th_status = 0;
667  p->th_tasknum = 0;
668  /* Also could zero pointers to private vars */
669 #endif
670 
671  if (in_parallel) {
672  __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
673 
674  KMP_MB(); /* make sure data structures are in consistent state before querying them */
675  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
676  }
677 
678  p->th.th_next_free = queue->tq_free_thunks;
679  queue->tq_free_thunks = p;
680 
681 #ifdef KMP_DEBUG
682  p->th_flags = TQF_DEALLOCATED;
683 #endif
684 
685  if (in_parallel)
686  __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
687 }
688 
689 /* --------------------------------------------------------------------------- */
690 
691 /* returns nonzero if the queue just became full after the enqueue */
692 
693 static kmp_int32
694 __kmp_enqueue_task ( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, kmpc_thunk_t *thunk, int in_parallel )
695 {
696  kmp_int32 ret;
697 
698  /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the master is executing then) */
699  if (in_parallel) {
700  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
701 
702  KMP_MB(); /* make sure data structures are in consistent state before querying them */
703  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
704  }
705 
706  KMP_DEBUG_ASSERT (queue->tq_nfull < queue->tq_nslots); /* check queue not full */
707 
708  queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
709 
710  if (queue->tq_head >= queue->tq_nslots)
711  queue->tq_head = 0;
712 
713  (queue->tq_nfull)++;
714 
715  KMP_MB(); /* to assure that nfull is seen to increase before TQF_ALL_TASKS_QUEUED is set */
716 
717  ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
718 
719  if (in_parallel) {
720  /* don't need to wait until workers are released before unlocking */
721  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
722 
723  if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
724  /* If just creating the root queue, the worker threads are waiting at */
725  /* a join barrier until now, when there's something in the queue for */
726  /* them to do; release them now to do work. */
727  /* This should only be done when this is the first task enqueued, */
728  /* so reset the flag here also. */
729 
730  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers are still in spin mode */
731 
732  KMP_MB(); /* avoid releasing barrier twice if taskq_task switches threads */
733 
734  __kmpc_end_barrier_master( NULL, global_tid);
735  }
736  }
737 
738  return ret;
739 }
740 
741 static kmpc_thunk_t *
742 __kmp_dequeue_task (kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel)
743 {
744  kmpc_thunk_t *pt;
745  int tid = __kmp_tid_from_gtid( global_tid );
746 
747  KMP_DEBUG_ASSERT (queue->tq_nfull > 0); /* check queue not empty */
748 
749  if (queue->tq.tq_parent != NULL && in_parallel) {
750  int ct;
751  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
752  ct = ++(queue->tq_ref_count);
753  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
754  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
755  __LINE__, global_tid, queue, ct));
756  }
757 
758  pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
759 
760  if (queue->tq_tail >= queue->tq_nslots)
761  queue->tq_tail = 0;
762 
763  if (in_parallel) {
764  queue->tq_th_thunks[tid].ai_data++;
765 
766  KMP_MB(); /* necessary so ai_data increment is propagated to other threads immediately (digital) */
767 
768  KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding thunks from queue %p\n",
769  global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
770  }
771 
772  (queue->tq_nfull)--;
773 
774 #ifdef KMP_DEBUG
775  KMP_MB();
776 
777  /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is decremented */
778 
779  KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
780 
781  if (in_parallel) {
782  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <= __KMP_TASKQ_THUNKS_PER_TH);
783  }
784 #endif
785 
786  return pt;
787 }
788 
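/*
 * Illustrative sketch (not part of the runtime): the fixed-size circular
 * buffer discipline behind tq_queue / tq_head / tq_tail / tq_nfull in the
 * two routines above, with a hypothetical ring type and none of the locking,
 * per-thread accounting, or worker-release logic.  As in the runtime, the
 * caller is assumed to have checked for full/empty beforehand (the runtime
 * does so with KMP_DEBUG_ASSERT).
 */
typedef struct demo_ring {
    void **slots;    /* nslots entries, like tq_queue               */
    int    nslots;   /* like tq_nslots                              */
    int    head;     /* next slot to fill on enqueue, like tq_head  */
    int    tail;     /* next slot to drain on dequeue, like tq_tail */
    int    nfull;    /* number of occupied slots, like tq_nfull     */
} demo_ring_t;

/* returns nonzero if the ring just became full, mirroring __kmp_enqueue_task */
static int
demo_enqueue(demo_ring_t *r, void *item)
{
    r->slots[r->head++] = item;
    if (r->head >= r->nslots)
        r->head = 0;               /* wrap around */
    r->nfull++;
    return (r->nfull == r->nslots);
}

static void *
demo_dequeue(demo_ring_t *r)
{
    void *item = r->slots[r->tail++];
    if (r->tail >= r->nslots)
        r->tail = 0;               /* wrap around */
    r->nfull--;
    return item;
}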
789 /*
790  * Find the next (non-null) task to dequeue and return it.
791  * This is never called unless in_parallel=TRUE
792  *
793  * Here are the rules for deciding which queue to take the task from:
794  * 1. Walk up the task queue tree from the current queue's parent and look
795  * on the way up (for loop, below).
796  * 2. Do a depth-first search back down the tree from the root and
797  * look (find_task_in_descendant_queue()).
798  *
799  * Here are the rules for deciding which task to take from a queue
800  * (__kmp_find_task_in_queue ()):
801  * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
802  * must be staged to make sure we execute the last one with
803  * TQF_IS_LAST_TASK at the end of task queue execution.
804  * 2. If the queue length is below some high water mark and the taskq task
805  * is enqueued, prefer running the taskq task.
806  * 3. Otherwise, take a (normal) task from the queue.
807  *
808  * If we do all this and return pt == NULL at the bottom of this routine,
809  * this means there are no more tasks to execute (except possibly for
810  * TQF_IS_LASTPRIVATE).
811  */
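/*
 * Worked example of rule 1: in a TQF_IS_LASTPRIVATE queue holding exactly one
 * task (tq_nfull == 1), the routine below returns NULL until
 * kmpc_end_taskq_task() has set TQF_IS_LAST_TASK on the queue; only then is
 * that final task dequeued, with TQF_IS_LAST_TASK or'ed into its thunk flags
 * so the instrumentation performs the lastprivate copy-out for it.
 */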
812 
813 static kmpc_thunk_t *
814 __kmp_find_task_in_queue (kmp_int32 global_tid, kmpc_task_queue_t *queue)
815 {
816  kmpc_thunk_t *pt = NULL;
817  int tid = __kmp_tid_from_gtid( global_tid );
818 
819  /* To prevent deadlock from tq_queue_lck if queue already deallocated */
820  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
821 
822  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
823 
824  /* Check again to avoid race in __kmpc_end_taskq() */
825  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
826 
827  KMP_MB(); /* make sure data structures are in consistent state before querying them */
828  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
829 
830  if ((queue->tq_taskq_slot != NULL) && (queue->tq_nfull <= queue->tq_hiwat)) {
831  /* if there's enough room in the queue and the dispatcher */
832  /* (taskq task) is available, schedule more tasks */
833  pt = (kmpc_thunk_t *) queue->tq_taskq_slot;
834  queue->tq_taskq_slot = NULL;
835  }
836  else if (queue->tq_nfull == 0 ||
837  queue->tq_th_thunks[tid].ai_data >= __KMP_TASKQ_THUNKS_PER_TH) {
838  /* do nothing if no thunks available or this thread can't */
839  /* run any because it already is executing too many */
840 
841  pt = NULL;
842  }
843  else if (queue->tq_nfull > 1) {
844  /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */
845 
846  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
847  }
848  else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
849  /* one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE */
850 
851  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
852  }
853  else if (queue->tq_flags & TQF_IS_LAST_TASK) {
854  /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */
855  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
856  /* instrumentation does copy-out. */
857 
858  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
859  pt->th_flags |= TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */
860  }
861  }
862 
863  /* GEH - What happens here if it is lastprivate, but not the last task? */
864  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
865  }
866 
867  return pt;
868 }
869 
870 /*
871  * Walk a tree of queues starting at queue's first child
872  * and return a non-NULL thunk if one can be scheduled.
873  * Must only be called when in_parallel=TRUE
874  */
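/*
 * Locking/reference-count protocol used by the walk below: a child's
 * tq_ref_count is incremented while the parent's tq_link_lck is held, the
 * lock is dropped while the child (and its descendants) are searched, and
 * the lock is re-acquired to read tq_next_child and drop the reference.
 * Holding the reference keeps __kmp_remove_queue_from_tree, which waits for
 * tq_ref_count to reach 1, from freeing the child while it is being examined
 * without the link lock held.
 */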
875 
876 static kmpc_thunk_t *
877 __kmp_find_task_in_descendant_queue (kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
878 {
879  kmpc_thunk_t *pt = NULL;
880  kmpc_task_queue_t *queue = curr_queue;
881 
882  if (curr_queue->tq_first_child != NULL) {
883  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
884 
885  KMP_MB(); /* make sure data structures are in consistent state before querying them */
886  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
887 
888  queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
889  if (queue == NULL) {
890  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
891  return NULL;
892  }
893 
894  while (queue != NULL) {
895  int ct;
896  kmpc_task_queue_t *next;
897 
898  ct= ++(queue->tq_ref_count);
899  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
900  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
901  __LINE__, global_tid, queue, ct));
902 
903  pt = __kmp_find_task_in_queue (global_tid, queue);
904 
905  if (pt != NULL) {
906  int ct;
907 
908  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
909 
910  KMP_MB(); /* make sure data structures are in consistent state before querying them */
911  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
912 
913  ct = --(queue->tq_ref_count);
914  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
915  __LINE__, global_tid, queue, ct));
916  KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
917 
918  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
919 
920  return pt;
921  }
922 
923  /* although reference count stays active during descendant walk, shouldn't matter */
924  /* since if children still exist, reference counts aren't being monitored anyway */
925 
926  pt = __kmp_find_task_in_descendant_queue (global_tid, queue);
927 
928  if (pt != NULL) {
929  int ct;
930 
931  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
932 
933  KMP_MB(); /* make sure data structures are in consistent state before querying them */
934  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
935 
936  ct = --(queue->tq_ref_count);
937  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
938  __LINE__, global_tid, queue, ct));
939  KMP_DEBUG_ASSERT( ct >= 0 );
940 
941  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
942 
943  return pt;
944  }
945 
946  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
947 
948  KMP_MB(); /* make sure data structures are in consistent state before querying them */
949  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
950 
951  next = queue->tq_next_child;
952 
953  ct = --(queue->tq_ref_count);
954  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
955  __LINE__, global_tid, queue, ct));
956  KMP_DEBUG_ASSERT( ct >= 0 );
957 
958  queue = next;
959  }
960 
961  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
962  }
963 
964  return pt;
965 }
966 
967 /*
968  * Walk up the taskq tree looking for a task to execute.
969  * If we get to the root, search the tree for a descendent queue task.
970  * Must only be called when in_parallel=TRUE
971  */
972 
973 static kmpc_thunk_t *
974 __kmp_find_task_in_ancestor_queue (kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
975 {
976  kmpc_task_queue_t *queue;
977  kmpc_thunk_t *pt;
978 
979  pt = NULL;
980 
981  if (curr_queue->tq.tq_parent != NULL) {
982  queue = curr_queue->tq.tq_parent;
983 
984  while (queue != NULL) {
985  if (queue->tq.tq_parent != NULL) {
986  int ct;
987  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
988 
989  KMP_MB(); /* make sure data structures are in consistent state before querying them */
990  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
991 
992  ct = ++(queue->tq_ref_count);
993  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
994  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
995  __LINE__, global_tid, queue, ct));
996  }
997 
998  pt = __kmp_find_task_in_queue (global_tid, queue);
999  if (pt != NULL) {
1000  if (queue->tq.tq_parent != NULL) {
1001  int ct;
1002  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1003 
1004  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1005  /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
1006 
1007  ct = --(queue->tq_ref_count);
1008  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1009  __LINE__, global_tid, queue, ct));
1010  KMP_DEBUG_ASSERT( ct >= 0 );
1011 
1012  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1013  }
1014 
1015  return pt;
1016  }
1017 
1018  if (queue->tq.tq_parent != NULL) {
1019  int ct;
1020  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1021 
1022  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1023  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1024 
1025  ct = --(queue->tq_ref_count);
1026  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1027  __LINE__, global_tid, queue, ct));
1028  KMP_DEBUG_ASSERT( ct >= 0 );
1029  }
1030  queue = queue->tq.tq_parent;
1031 
1032  if (queue != NULL)
1033  __kmp_release_lock(& queue->tq_link_lck, global_tid);
1034  }
1035 
1036  }
1037 
1038  pt = __kmp_find_task_in_descendant_queue( global_tid, tq->tq_root );
1039 
1040  return pt;
1041 }
1042 
1043 static int
1044 __kmp_taskq_tasks_finished (kmpc_task_queue_t *queue)
1045 {
1046  int i;
1047 
1048  /* KMP_MB(); *//* is this really necessary? */
1049 
1050  for (i=0; i<queue->tq_nproc; i++) {
1051  if (queue->tq_th_thunks[i].ai_data != 0)
1052  return FALSE;
1053  }
1054 
1055  return TRUE;
1056 }
1057 
1058 static int
1059 __kmp_taskq_has_any_children (kmpc_task_queue_t *queue)
1060 {
1061  return (queue->tq_first_child != NULL);
1062 }
1063 
1064 static void
1065 __kmp_remove_queue_from_tree( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel )
1066 {
1067 #ifdef KMP_DEBUG
1068  kmp_int32 i;
1069  kmpc_thunk_t *thunk;
1070 #endif
1071 
1072  KF_TRACE(50, ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1073  KF_DUMP(50, __kmp_dump_task_queue( tq, queue, global_tid ));
1074 
1075  /* sub-queue in a recursion, not the root task queue */
1076  KMP_DEBUG_ASSERT (queue->tq.tq_parent != NULL);
1077 
1078  if (in_parallel) {
1079  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1080 
1081  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1082  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1083  }
1084 
1085  KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
1086 
1087  /* unlink queue from its siblings if any at this level */
1088  if (queue->tq_prev_child != NULL)
1089  queue->tq_prev_child->tq_next_child = queue->tq_next_child;
1090  if (queue->tq_next_child != NULL)
1091  queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
1092  if (queue->tq.tq_parent->tq_first_child == queue)
1093  queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
1094 
1095  queue->tq_prev_child = NULL;
1096  queue->tq_next_child = NULL;
1097 
1098  if (in_parallel) {
1099  kmp_uint32 spins;
1100 
1101  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
1102  __LINE__, global_tid, queue, queue->tq_ref_count));
1103 
1104  /* wait until all other threads have stopped accessing this queue */
1105  while (queue->tq_ref_count > 1) {
1106  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1107 
1108  KMP_WAIT_YIELD((volatile kmp_uint32*)&queue->tq_ref_count, 1, KMP_LE, NULL);
1109 
1110  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1111 
1112  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1113  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1114  }
1115 
1116  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1117  }
1118 
1119  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p freeing queue\n",
1120  __LINE__, global_tid, queue));
1121 
1122 #ifdef KMP_DEBUG
1123  KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
1124  KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
1125 
1126  for (i=0; i<queue->tq_nproc; i++) {
1127  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1128  }
1129 
1130  i = 0;
1131  for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
1132  ++i;
1133 
1134  KMP_ASSERT (i == queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
1135 #endif
1136 
1137  /* release storage for queue entry */
1138  __kmp_free_taskq ( tq, queue, TRUE, global_tid );
1139 
1140  KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1141  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1142 }
1143 
1144 /*
1145  * Starting from indicated queue, proceed downward through tree and
1146  * remove all taskqs which are finished, but only go down to taskqs
1147  * which have the "nowait" clause present. Assume this is only called
1148  * when in_parallel=TRUE.
1149  */
1150 
1151 static void
1152 __kmp_find_and_remove_finished_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue )
1153 {
1154  kmpc_task_queue_t *queue = curr_queue;
1155 
1156  if (curr_queue->tq_first_child != NULL) {
1157  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1158 
1159  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1160  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1161 
1162  queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
1163  if (queue == NULL) {
1164  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1165  return;
1166  }
1167 
1168  while (queue != NULL) {
1169  kmpc_task_queue_t *next;
1170  int ct = ++(queue->tq_ref_count);
1171  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
1172  __LINE__, global_tid, queue, ct));
1173 
1174 
1175  /* although reference count stays active during descendant walk, */
1176  /* shouldn't matter since if children still exist, reference */
1177  /* counts aren't being monitored anyway */
1178 
1179  if (queue->tq_flags & TQF_IS_NOWAIT) {
1180  __kmp_find_and_remove_finished_child_taskq ( tq, global_tid, queue );
1181 
1182  if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) && (queue->tq_nfull == 0) &&
1183  __kmp_taskq_tasks_finished(queue) && ! __kmp_taskq_has_any_children(queue)) {
1184 
1185  /*
1186  Only remove this if we have not already marked it for deallocation.
1187  This should prevent multiple threads from trying to free this.
1188  */
1189 
1190  if ( __kmp_test_lock(& queue->tq_queue_lck, global_tid) ) {
1191  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
1192  queue->tq_flags |= TQF_DEALLOCATED;
1193  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1194 
1195  __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
1196 
1197  /* Can't do any more here since we can't be sure where the sibling queue is, so just exit this level */
1198  return;
1199  }
1200  else {
1201  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1202  }
1203  }
1204  /* otherwise, just fall through and decrement reference count */
1205  }
1206  }
1207 
1208  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1209 
1210  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1211  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1212 
1213  next = queue->tq_next_child;
1214 
1215  ct = --(queue->tq_ref_count);
1216  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1217  __LINE__, global_tid, queue, ct));
1218  KMP_DEBUG_ASSERT( ct >= 0 );
1219 
1220  queue = next;
1221  }
1222 
1223  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1224  }
1225 }
1226 
1227 /*
1228  * Starting from indicated queue, proceed downward through tree and
1229  * remove all taskqs, assuming all are finished and
1230  * assuming NO other threads are executing at this point.
1231  */
1232 
1233 static void
1234 __kmp_remove_all_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue )
1235 {
1236  kmpc_task_queue_t *next_child;
1237 
1238  queue = (kmpc_task_queue_t *) queue->tq_first_child;
1239 
1240  while (queue != NULL) {
1241  __kmp_remove_all_child_taskq ( tq, global_tid, queue );
1242 
1243  next_child = queue->tq_next_child;
1244  queue->tq_flags |= TQF_DEALLOCATED;
1245  __kmp_remove_queue_from_tree ( tq, global_tid, queue, FALSE );
1246  queue = next_child;
1247  }
1248 }
1249 
1250 static void
1251 __kmp_execute_task_from_queue( kmp_taskq_t *tq, ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, int in_parallel )
1252 {
1253  kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
1254  kmp_int32 tid = __kmp_tid_from_gtid( global_tid );
1255 
1256  KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
1257  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1258  KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
1259  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1260 
1261  /*
1262  * For the taskq task, the curr_thunk push and pop pairs are set up as follows:
1263  *
1264  * happens exactly once:
1265  * 1) __kmpc_taskq : push (if returning thunk only)
1266  * 4) __kmpc_end_taskq_task : pop
1267  *
1268  * optionally happens *each* time taskq task is dequeued/enqueued:
1269  * 2) __kmpc_taskq_task : pop
1270  * 3) __kmp_execute_task_from_queue : push
1271  *
1272  * execution ordering: 1,(2,3)*,4
1273  */
1274 
1275  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1276  kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
1277  thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[index].ai_data;
1278 
1279  if ( __kmp_env_consistency_check ) {
1280  __kmp_push_workshare( global_tid,
1281  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
1282  queue->tq_loc );
1283  }
1284  }
1285  else {
1286  if ( __kmp_env_consistency_check )
1287  __kmp_push_workshare( global_tid, ct_taskq, queue->tq_loc );
1288  }
1289 
1290  if (in_parallel) {
1291  thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1292  tq->tq_curr_thunk[tid] = thunk;
1293 
1294  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1295  }
1296 
1297  KF_TRACE( 50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
1298  thunk->th_task (global_tid, thunk);
1299  KF_TRACE( 50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
1300 
1301  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1302  if ( __kmp_env_consistency_check )
1303  __kmp_pop_workshare( global_tid, (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
1304  queue->tq_loc );
1305 
1306  if (in_parallel) {
1307  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1308  thunk->th_encl_thunk = NULL;
1309  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1310  }
1311 
1312  if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
1313  __kmp_taskq_check_ordered(global_tid, thunk);
1314  }
1315 
1316  __kmp_free_thunk (queue, thunk, in_parallel, global_tid);
1317 
1318  KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n", global_tid, thunk));
1319  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1320 
1321  if (in_parallel) {
1322  KMP_MB(); /* needed so thunk put on free list before outstanding thunk count is decremented */
1323 
1324  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
1325 
1326  KF_TRACE( 200, ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
1327  global_tid, queue->tq_th_thunks[tid].ai_data-1, queue));
1328 
1329  queue->tq_th_thunks[tid].ai_data--;
1330 
1331  /* KMP_MB(); */ /* is MB really necessary ? */
1332  }
1333 
1334  if (queue->tq.tq_parent != NULL && in_parallel) {
1335  int ct;
1336  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1337  ct = --(queue->tq_ref_count);
1338  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1339  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1340  __LINE__, global_tid, queue, ct));
1341  KMP_DEBUG_ASSERT( ct >= 0 );
1342  }
1343  }
1344 }
1345 
1346 /* --------------------------------------------------------------------------- */
1347 
1348 /* starts a taskq; creates and returns a thunk for the taskq_task */
1349 /* also, returns pointer to shared vars for this thread in "shareds" arg */
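/*
 * Usage sketch (illustrative pseudocode, not compiled here): the pairing a
 * compiler-generated outline is expected to follow, using only the entry
 * points defined in this file.  "my_taskq_task" stands for the outlined
 * taskq-task routine and is hypothetical; as the call in
 * __kmp_execute_task_from_queue shows, such a routine is invoked as
 * (*th_task)(gtid, thunk).
 *
 *     kmpc_shared_vars_t *shareds;
 *     kmpc_thunk_t *taskq_thunk =
 *         __kmpc_taskq( loc, gtid, my_taskq_task,
 *                       sizeof_thunk, sizeof_shareds, flags, &shareds );
 *
 *     // tasks are built and enqueued from inside the taskq task via entry
 *     // points not shown in this listing; for the outermost parallel taskq,
 *     // worker threads get back NULL here (see __kmpc_end_taskq).
 *
 *     __kmpc_end_taskq( loc, gtid, taskq_thunk );  // last thread out
 *                                                  // destroys the queue
 */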
1350 
1351 kmpc_thunk_t *
1352 __kmpc_taskq( ident_t *loc, kmp_int32 global_tid, kmpc_task_t taskq_task,
1353  size_t sizeof_thunk, size_t sizeof_shareds,
1354  kmp_int32 flags, kmpc_shared_vars_t **shareds )
1355 {
1356  int in_parallel;
1357  kmp_int32 nslots, nthunks, nshareds, nproc;
1358  kmpc_task_queue_t *new_queue, *curr_queue;
1359  kmpc_thunk_t *new_taskq_thunk;
1360  kmp_info_t *th;
1361  kmp_team_t *team;
1362  kmp_taskq_t *tq;
1363  kmp_int32 tid;
1364 
1365  KE_TRACE( 10, ("__kmpc_taskq called (%d)\n", global_tid));
1366 
1367  th = __kmp_threads[ global_tid ];
1368  team = th -> th.th_team;
1369  tq = & team -> t.t_taskq;
1370  nproc = team -> t.t_nproc;
1371  tid = __kmp_tid_from_gtid( global_tid );
1372 
1373  /* find out whether this is a parallel taskq or serialized one. */
1374  in_parallel = in_parallel_context( team );
1375 
1376  if( ! tq->tq_root ) {
1377  if (in_parallel) {
1378  /* Vector ORDERED SECTION to taskq version */
1379  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1380 
1381  /* Vector ORDERED SECTION to taskq version */
1382  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1383  }
1384 
1385  if (in_parallel) {
1386  /* This shouldn't be a barrier region boundary; it would confuse the user. */
1387  /* The boundary needs to be at the end of the taskq instead. */
1388  if ( __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
1389  /* Creating the active root queue, and we are not the master thread. */
1390  /* The master thread below created the queue and tasks have been */
1391  /* enqueued, and the master thread released this barrier. This */
1392  /* worker thread can now proceed and execute tasks. See also the */
1393  /* TQF_RELEASE_WORKERS which is used to handle this case. */
1394 
1395  *shareds = (kmpc_shared_vars_t *) tq->tq_root->tq_shareds[tid].ai_data;
1396 
1397  KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
1398 
1399  return NULL;
1400  }
1401  }
1402 
1403  /* only the master thread executes this code */
1404 
1405  if( tq->tq_curr_thunk_capacity < nproc ) {
1406  int i;
1407 
1408  if(tq->tq_curr_thunk)
1409  __kmp_free(tq->tq_curr_thunk);
1410  else {
1411  /* only need to do this once at outer level, i.e. when tq_curr_thunk is still NULL */
1412  __kmp_init_lock( & tq->tq_freelist_lck );
1413  }
1414 
1415  tq->tq_curr_thunk = (kmpc_thunk_t **) __kmp_allocate( nproc * sizeof(kmpc_thunk_t *) );
1416  tq -> tq_curr_thunk_capacity = nproc;
1417  }
1418 
1419  if (in_parallel)
1420  tq->tq_global_flags = TQF_RELEASE_WORKERS;
1421  }
1422 
1423  /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
1424  /* on some heuristics (e.g., depth of queue nesting?). */
1425 
1426  nslots = (in_parallel) ? (2 * nproc) : 1;
1427 
1428  /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
1429  /* jobs being executed by other threads, and one extra for taskq slot */
1430 
1431  nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1) : nslots + 2;
1432 
1433  /* Only the root taskq gets a per-thread array of shareds. */
1434  /* The rest of the taskqs only get one copy of the shared vars. */
1435 
1436  nshareds = ( !tq->tq_root && in_parallel) ? nproc : 1;
1437 
1438  /* create overall queue data structure and its components that require allocation */
1439 
1440  new_queue = __kmp_alloc_taskq ( tq, in_parallel, nslots, nthunks, nshareds, nproc,
1441  sizeof_thunk, sizeof_shareds, &new_taskq_thunk, global_tid );
1442 
1443  /* rest of new_queue initializations */
1444 
1445  new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
1446 
1447  if (in_parallel) {
1448  new_queue->tq_tasknum_queuing = 0;
1449  new_queue->tq_tasknum_serving = 0;
1450  new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
1451  }
1452 
1453  new_queue->tq_taskq_slot = NULL;
1454  new_queue->tq_nslots = nslots;
1455  new_queue->tq_hiwat = HIGH_WATER_MARK (nslots);
1456  new_queue->tq_nfull = 0;
1457  new_queue->tq_head = 0;
1458  new_queue->tq_tail = 0;
1459  new_queue->tq_loc = loc;
1460 
1461  if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
1462  /* prepare to serve the first-queued task's ORDERED directive */
1463  new_queue->tq_tasknum_serving = 1;
1464 
1465  /* Vector ORDERED SECTION to taskq version */
1466  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1467 
1468  /* Vector ORDERED SECTION to taskq version */
1469  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1470  }
1471 
1472  /* create a new thunk for the taskq_task in the new_queue */
1473  *shareds = (kmpc_shared_vars_t *) new_queue->tq_shareds[0].ai_data;
1474 
1475  new_taskq_thunk->th.th_shareds = *shareds;
1476  new_taskq_thunk->th_task = taskq_task;
1477  new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
1478  new_taskq_thunk->th_status = 0;
1479 
1480  KMP_DEBUG_ASSERT (new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
1481 
1482  /* KMP_MB(); */ /* make sure these inits complete before threads start using this queue (necessary?) */
1483 
1484  /* insert the new task queue into the tree, but only after all fields initialized */
1485 
1486  if (in_parallel) {
1487  if( ! tq->tq_root ) {
1488  new_queue->tq.tq_parent = NULL;
1489  new_queue->tq_first_child = NULL;
1490  new_queue->tq_next_child = NULL;
1491  new_queue->tq_prev_child = NULL;
1492  new_queue->tq_ref_count = 1;
1493  tq->tq_root = new_queue;
1494  }
1495  else {
1496  curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
1497  new_queue->tq.tq_parent = curr_queue;
1498  new_queue->tq_first_child = NULL;
1499  new_queue->tq_prev_child = NULL;
1500  new_queue->tq_ref_count = 1; /* for the thread that built the queue */
1501 
1502  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n",
1503  __LINE__, global_tid, new_queue, new_queue->tq_ref_count));
1504 
1505  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1506 
1507  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1508  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1509 
1510  new_queue->tq_next_child = (struct kmpc_task_queue_t *) curr_queue->tq_first_child;
1511 
1512  if (curr_queue->tq_first_child != NULL)
1513  curr_queue->tq_first_child->tq_prev_child = new_queue;
1514 
1515  curr_queue->tq_first_child = new_queue;
1516 
1517  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1518  }
1519 
1520  /* set up thunk stack only after code that determines curr_queue above */
1521  new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1522  tq->tq_curr_thunk[tid] = new_taskq_thunk;
1523 
1524  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1525  }
1526  else {
1527  new_taskq_thunk->th_encl_thunk = 0;
1528  new_queue->tq.tq_parent = NULL;
1529  new_queue->tq_first_child = NULL;
1530  new_queue->tq_next_child = NULL;
1531  new_queue->tq_prev_child = NULL;
1532  new_queue->tq_ref_count = 1;
1533  }
1534 
1535 #ifdef KMP_DEBUG
1536  KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
1537  KF_DUMP(150, __kmp_dump_thunk( tq, new_taskq_thunk, global_tid ));
1538 
1539  if (in_parallel) {
1540  KF_TRACE(25, ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1541  } else {
1542  KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1543  }
1544 
1545  KF_DUMP(25, __kmp_dump_task_queue( tq, new_queue, global_tid ));
1546 
1547  if (in_parallel) {
1548  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1549  }
1550 #endif /* KMP_DEBUG */
1551 
1552  if ( __kmp_env_consistency_check )
1553  __kmp_push_workshare( global_tid, ct_taskq, new_queue->tq_loc );
1554 
1555  KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
1556 
1557  return new_taskq_thunk;
1558 }
1559 
1560 
1561 /* ends a taskq; last thread out destroys the queue */
1562 
1563 void
1564 __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk)
1565 {
1566 #ifdef KMP_DEBUG
1567  kmp_int32 i;
1568 #endif
1569  kmp_taskq_t *tq;
1570  int in_parallel;
1571  kmp_info_t *th;
1572  kmp_int32 is_outermost;
1573  kmpc_task_queue_t *queue;
1574  kmpc_thunk_t *thunk;
1575  int nproc;
1576 
1577  KE_TRACE( 10, ("__kmpc_end_taskq called (%d)\n", global_tid));
1578 
1579  tq = & __kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1580  nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
1581 
1582  /* For the outermost taskq only, all but one thread will have taskq_thunk == NULL */
1583  queue = (taskq_thunk == NULL) ? tq->tq_root : taskq_thunk->th.th_shareds->sv_queue;
1584 
1585  KE_TRACE( 50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
1586  is_outermost = (queue == tq->tq_root);
1587  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1588 
1589  if (in_parallel) {
1590  kmp_uint32 spins;
1591 
1592  /* this is just a safeguard to release the waiting threads if */
1593  /* the outermost taskq never queues a task */
1594 
1595  if (is_outermost && (KMP_MASTER_GTID( global_tid ))) {
1596  if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
1597  /* no lock needed, workers are still in spin mode */
1598  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
1599 
1600  __kmp_end_split_barrier( bs_plain_barrier, global_tid );
1601  }
1602  }
1603 
1604  /* keep dequeueing work until all tasks are queued and dequeued */
1605 
1606  do {
1607  /* wait until something is available to dequeue */
1608  KMP_INIT_YIELD(spins);
1609 
1610  while ( (queue->tq_nfull == 0)
1611  && (queue->tq_taskq_slot == NULL)
1612  && (! __kmp_taskq_has_any_children(queue) )
1613  && (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED) )
1614  ) {
1615  KMP_YIELD_WHEN( TRUE, spins );
1616  }
1617 
1618  /* check to see if we can execute tasks in the queue */
1619  while ( ( (queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL) )
1620  && (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL
1621  ) {
1622  KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk, queue, global_tid));
1623  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1624  }
1625 
1626  /* see if work can be found in a descendant queue */
1627  if ( (__kmp_taskq_has_any_children(queue))
1628  && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
1629  ) {
1630 
1631  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1632  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid ));
1633 
1634  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1635  }
1636 
1637  } while ( (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED))
1638  || (queue->tq_nfull != 0)
1639  );
1640 
1641  KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue, global_tid));
1642 
1643  /* while not all tasks are finished, keep executing any work
1644  found in descendant queues */
1645 
1646  while ( (!__kmp_taskq_tasks_finished(queue))
1647  && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
1648  ) {
1649 
1650  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1651  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1652 
1653  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1654  }
1655 
1656  KF_TRACE(50, ("No work found in descendant queues or all work finished in queue: %p (%d)\n", queue, global_tid));
1657 
1658  if (!is_outermost) {
1659  /* need to return if NOWAIT present and not outermost taskq */
1660 
1661  if (queue->tq_flags & TQF_IS_NOWAIT) {
1662  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1663  queue->tq_ref_count--;
1664  KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
1665  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1666 
1667  KE_TRACE( 10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
1668 
1669  return;
1670  }
1671 
1672  __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
1673 
1674  /* WAIT until all tasks are finished and no child queues exist before proceeding */
1675  KMP_INIT_YIELD(spins);
1676 
1677  while (!__kmp_taskq_tasks_finished(queue) || __kmp_taskq_has_any_children(queue)) {
1678  thunk = __kmp_find_task_in_ancestor_queue( tq, global_tid, queue );
1679 
1680  if (thunk != NULL) {
1681  KF_TRACE(50, ("Stole thunk: %p in ancestor queue: %p while waiting in queue: %p (%d)\n",
1682  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1683  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1684  }
1685 
1686  KMP_YIELD_WHEN( thunk == NULL, spins );
1687 
1688  __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
1689  }
1690 
1691  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1692  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
1693  queue->tq_flags |= TQF_DEALLOCATED;
1694  }
1695  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1696 
1697  /* only the allocating thread can deallocate the queue */
1698  if (taskq_thunk != NULL) {
1699  __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
1700  }
1701 
1702  KE_TRACE( 10, ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n", global_tid));
1703 
1704  return;
1705  }
1706 
1707  /* Outermost Queue: steal work from descendants until all tasks are finished */
1708 
1709  KMP_INIT_YIELD(spins);
1710 
1711  while (!__kmp_taskq_tasks_finished(queue)) {
1712  thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
1713 
1714  if (thunk != NULL) {
1715  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1716  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1717 
1718  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1719  }
1720 
1721  KMP_YIELD_WHEN( thunk == NULL, spins );
1722  }
1723 
1724  /* Need this barrier to prevent destruction of the queue before all threads have executed the code above */
1725  /* This may need to be done earlier when NOWAIT is implemented for the outermost level */
1726 
1727  if ( !__kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
1728  /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */
1729  /* for right now, everybody waits, and the master thread destroys the */
1730  /* remaining queues. */
1731 
1732  __kmp_remove_all_child_taskq( tq, global_tid, queue );
1733 
1734  /* Now destroy the root queue */
1735  KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n", global_tid, queue ));
1736  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1737 
1738 #ifdef KMP_DEBUG
1739  /* the root queue entry */
1740  KMP_DEBUG_ASSERT ((queue->tq.tq_parent == NULL) && (queue->tq_next_child == NULL));
1741 
1742  /* children must all be gone by now because of barrier above */
1743  KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
1744 
1745  for (i=0; i<nproc; i++) {
1746  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1747  }
1748 
1749  for (i=0, thunk=queue->tq_free_thunks; thunk != NULL; i++, thunk=thunk->th.th_next_free);
1750 
1751  KMP_DEBUG_ASSERT (i == queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
1752 
1753  for (i = 0; i < nproc; i++) {
1754  KMP_DEBUG_ASSERT( ! tq->tq_curr_thunk[i] );
1755  }
1756 #endif
1757  /* unlink the root queue entry */
1758  tq -> tq_root = NULL;
1759 
1760  /* release storage for root queue entry */
1761  KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue, global_tid));
1762 
1763  queue->tq_flags |= TQF_DEALLOCATED;
1764  __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
1765 
1766  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1767 
1768  /* release the workers now that the data structures are up to date */
1769  __kmp_end_split_barrier( bs_plain_barrier, global_tid );
1770  }
1771 
1772  th = __kmp_threads[ global_tid ];
1773 
1774  /* Reset ORDERED SECTION entry (th_deo_fcn) to the parallel version */
1775  th->th.th_dispatch->th_deo_fcn = 0;
1776 
1777  /* Reset ORDERED SECTION exit (th_dxo_fcn) to the parallel version */
1778  th->th.th_dispatch->th_dxo_fcn = 0;
1779  }
1780  else {
1781  /* in serial execution context, dequeue the last task */
1782  /* and execute it, if there were any tasks encountered */
1783 
1784  if (queue->tq_nfull > 0) {
1785  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1786 
1787  thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1788 
1789  if (queue->tq_flags & TQF_IS_LAST_TASK) {
1790  /* TQF_IS_LASTPRIVATE, one thing in queue, __kmpc_end_taskq_task() */
1791  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
1792  /* instrumentation does copy-out. */
1793 
1794  /* no need for test_then_or call since already locked */
1795  thunk->th_flags |= TQF_IS_LAST_TASK;
1796  }
1797 
1798  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, thunk, queue));
1799 
1800  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1801  }
1802 
1803  /* destroy the unattached serial queue now that there is no more work to do */
1804  KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n", queue, global_tid));
1805  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1806 
1807 #ifdef KMP_DEBUG
1808  i = 0;
1809  for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
1810  ++i;
1811  KMP_DEBUG_ASSERT (i == queue->tq_nslots + 1);
1812 #endif
1813  /* release storage for unattached serial queue */
1814  KF_TRACE(50, ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
1815 
1816  queue->tq_flags |= TQF_DEALLOCATED;
1817  __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
1818  }
1819 
1820  KE_TRACE( 10, ("__kmpc_end_taskq return (%d)\n", global_tid));
1821 }
1822 
1823 /* Enqueues a task for a thunk previously created by __kmpc_task_buffer. */
1824 /* Returns nonzero if the queue was just filled up. */
1825 
1826 kmp_int32
1827 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
1828 {
1829  kmp_int32 ret;
1830  kmpc_task_queue_t *queue;
1831  int in_parallel;
1832  kmp_taskq_t *tq;
1833 
1834  KE_TRACE( 10, ("__kmpc_task called (%d)\n", global_tid));
1835 
1836  KMP_DEBUG_ASSERT (!(thunk->th_flags & TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */
1837 
1838  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1839  queue = thunk->th.th_shareds->sv_queue;
1840  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1841 
1842  if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
1843  thunk->th_tasknum = ++queue->tq_tasknum_queuing;
1844 
1845  /* For serial execution dequeue the preceding task and execute it, if one exists */
1846  /* This cannot be the last task. That one is handled in __kmpc_end_taskq */
1847 
1848  if (!in_parallel && queue->tq_nfull > 0) {
1849  kmpc_thunk_t *prev_thunk;
1850 
1851  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1852 
1853  prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1854 
1855  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, prev_thunk, queue));
1856 
1857  __kmp_execute_task_from_queue( tq, loc, global_tid, prev_thunk, in_parallel );
1858  }
1859 
1860  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private */
1861  /* variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the */
1862  /* task queue is not full and allocates a thunk (which is then passed to */
1863  /* __kmpc_task()). So, the enqueue below should never fail due to a full queue. */
1864 
1865  KF_TRACE(100, ("After enqueueing this Task on (%d):\n", global_tid));
1866  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1867 
1868  ret = __kmp_enqueue_task ( tq, global_tid, queue, thunk, in_parallel );
1869 
1870  KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
1871  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1872 
1873  KE_TRACE( 10, ("__kmpc_task return (%d)\n", global_tid));
1874 
1875  return ret;
1876 }
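
/*
 * Illustrative sketch (not part of the runtime, kept under "#if 0"): the
 * per-task sequence the instrumented code is expected to emit, as described
 * in the comment inside __kmpc_task() above -- allocate a thunk with
 * __kmpc_task_buffer(), fill in the task's private variables, then enqueue
 * it with __kmpc_task().  The helper name, the task routine
 * __example_task_routine, and its (gtid, thunk) shape are assumptions for
 * illustration only; the real outlined routine and private-variable layout
 * are produced by the compiler.
 */
#if 0
static void __example_task_routine( kmp_int32 gtid, kmpc_thunk_t *thunk );  /* assumed kmpc_task_t shape */

static void
__example_enqueue_one_task( ident_t *loc, kmp_int32 gtid, kmpc_thunk_t *taskq_thunk )
{
    /* taskq_thunk is the TQF_TASKQ_TASK thunk for the enclosing taskq */
    kmpc_thunk_t *thunk = __kmpc_task_buffer( loc, gtid, taskq_thunk, __example_task_routine );

    /* ... initialize the task's private variables inside *thunk here ... */

    /* enqueue; per the comment above, this cannot fail due to a full queue */
    (void) __kmpc_task( loc, gtid, thunk );
}
#endif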
1877 
1878 /* enqueues a taskq_task for the thunk previously created by __kmpc_taskq */
1879 /* this should never be called unless in a parallel context */
1880 
1881 void
1882 __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, kmp_int32 status)
1883 {
1884  kmpc_task_queue_t *queue;
1885  kmp_taskq_t *tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1886  int tid = __kmp_tid_from_gtid( global_tid );
1887 
1888  KE_TRACE( 10, ("__kmpc_taskq_task called (%d)\n", global_tid));
1889  KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
1890  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1891 
1892  queue = thunk->th.th_shareds->sv_queue;
1893 
1894  if ( __kmp_env_consistency_check )
1895  __kmp_pop_workshare( global_tid, ct_taskq, loc );
1896 
1897  /* thunk->th_task is the taskq_task */
1898  KMP_DEBUG_ASSERT (thunk->th_flags & TQF_TASKQ_TASK);
1899 
1900  /* not supposed to call __kmpc_taskq_task if it's already enqueued */
1901  KMP_DEBUG_ASSERT (queue->tq_taskq_slot == NULL);
1902 
1903  /* dequeue taskq thunk from curr_thunk stack */
1904  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1905  thunk->th_encl_thunk = NULL;
1906 
1907  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1908 
1909  thunk->th_status = status;
1910 
1911  KMP_MB(); /* flush thunk->th_status before taskq_task enqueued to avoid race condition */
1912 
1913  /* enqueue taskq_task in thunk into special slot in queue */
1914  /* GEH - probably don't need to lock taskq slot since only one */
1915  /* thread enqueues & already a lock set at dequeue point */
1916 
1917  queue->tq_taskq_slot = thunk;
1918 
1919  KE_TRACE( 10, ("__kmpc_taskq_task return (%d)\n", global_tid));
1920 }
1921 
1922 /* ends a taskq_task; done generating tasks */
1923 
1924 void
1925 __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
1926 {
1927  kmp_taskq_t *tq;
1928  kmpc_task_queue_t *queue;
1929  int in_parallel;
1930  int tid;
1931 
1932  KE_TRACE( 10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
1933 
1934  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1935  queue = thunk->th.th_shareds->sv_queue;
1936  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1937  tid = __kmp_tid_from_gtid( global_tid );
1938 
1939  if ( __kmp_env_consistency_check )
1940  __kmp_pop_workshare( global_tid, ct_taskq, loc );
1941 
1942  if (in_parallel) {
1943 #if KMP_ARCH_X86 || \
1944  KMP_ARCH_X86_64
1945 
1946  KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_ALL_TASKS_QUEUED );
1947 #else
1948  {
1949  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1950 
1951  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1952  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1953 
1954  queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
1955 
1956  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1957  }
1958 #endif
1959  }
1960 
1961  if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
1962  /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in the */
1963  /* queue if TQF_IS_LASTPRIVATE so we can positively identify that last task */
1964  /* and run it with its TQF_IS_LAST_TASK bit turned on in th_flags. When */
1965  /* __kmpc_end_taskq_task() is called we are done generating all the tasks, so */
1966  /* we know the last one in the queue is the lastprivate task. Mark the queue */
1967  /* as having gotten to this state via tq_flags & TQF_IS_LAST_TASK; when that */
1968  /* task actually executes mark it via th_flags & TQF_IS_LAST_TASK (this th_flags */
1969  /* bit signals the instrumented code to do copy-outs after execution). */
1970 
1971  if (! in_parallel) {
1972  /* No synchronization needed for serial context */
1973  queue->tq_flags |= TQF_IS_LAST_TASK;
1974  }
1975  else {
1976 #if KMP_ARCH_X86 || \
1977  KMP_ARCH_X86_64
1978 
1979  KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_IS_LAST_TASK );
1980 #else
1981  {
1982  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1983 
1984  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1985  /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
1986 
1987  queue->tq_flags |= TQF_IS_LAST_TASK;
1988 
1989  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1990  }
1991 #endif
1992  /* to prevent race condition where last task is dequeued but */
1993  /* flag isn't visible yet (not sure about this) */
1994  KMP_MB();
1995  }
1996  }
1997 
1998  /* dequeue taskq thunk from curr_thunk stack */
1999  if (in_parallel) {
2000  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
2001  thunk->th_encl_thunk = NULL;
2002 
2003  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
2004  }
2005 
2006  KE_TRACE( 10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
2007 }
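
/*
 * Illustrative sketch (not part of the runtime, kept under "#if 0"): how the
 * TQF_IS_LAST_TASK bit set above is consumed.  Per the comment in the
 * lastprivate branch of __kmpc_end_taskq_task(), the instrumented task
 * routine checks th_flags after running the task body and performs the
 * lastprivate copy-out only for the last task.  The routine name, its
 * (gtid, thunk) shape, and the variable accesses are assumptions for
 * illustration only.
 */
#if 0
static void
__example_task_routine( kmp_int32 gtid, kmpc_thunk_t *thunk )
{
    /* ... run the task body using the private variables stored in *thunk ... */

    if (thunk->th_flags & TQF_IS_LAST_TASK) {
        /* lastprivate copy-out: copy this task's private values back to the
           original variables reachable through thunk->th.th_shareds */
    }
}
#endif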
2008 
2009 /* returns thunk for a regular task based on taskq_thunk */
2010 /* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */
2011 
2012 kmpc_thunk_t *
2013 __kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk, kmpc_task_t task)
2014 {
2015  kmp_taskq_t *tq;
2016  kmpc_task_queue_t *queue;
2017  kmpc_thunk_t *new_thunk;
2018  int in_parallel;
2019 
2020  KE_TRACE( 10, ("__kmpc_task_buffer called (%d)\n", global_tid));
2021 
2022  KMP_DEBUG_ASSERT (taskq_thunk->th_flags & TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */
2023 
2024  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
2025  queue = taskq_thunk->th.th_shareds->sv_queue;
2026  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
2027 
2028  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private */
2029  /* variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the */
2030  /* task queue is not full and allocates a thunk (which is then passed to */
2031  /* __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be */
2032  /* the next to be enqueued in __kmpc_task(). */
2033 
2034  new_thunk = __kmp_alloc_thunk (queue, in_parallel, global_tid);
2035  new_thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[0].ai_data;
2036  new_thunk->th_encl_thunk = NULL;
2037  new_thunk->th_task = task;
2038 
2039  /* GEH - shouldn't need to lock the read of tq_flags here */
2040  new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;
2041 
2042  new_thunk->th_status = 0;
2043 
2044  KMP_DEBUG_ASSERT (!(new_thunk->th_flags & TQF_TASKQ_TASK));
2045 
2046  KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
2047  KF_DUMP(100, __kmp_dump_thunk( tq, new_thunk, global_tid ));
2048 
2049  KE_TRACE( 10, ("__kmpc_task_buffer return (%d)\n", global_tid));
2050 
2051  return new_thunk;
2052 }
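
/*
 * Illustrative sketch (not part of the runtime, kept under "#if 0"): how the
 * entry points in this file are expected to fit together on the thread that
 * generates tasks.  The thunk returned by __kmpc_taskq() is assumed to have
 * been obtained elsewhere and is passed in; the helper names, the task
 * routine, and the iteration count are assumptions for illustration only.
 * In real instrumented code these calls are emitted by the compiler at the
 * appropriate points of the construct; they are shown together here only to
 * illustrate their order.  The other team threads of the outermost taskq
 * reach __kmpc_end_taskq() with a NULL taskq_thunk, as noted in
 * __kmpc_end_taskq() above.
 */
#if 0
static void __example_task_routine( kmp_int32 gtid, kmpc_thunk_t *thunk );  /* assumed kmpc_task_t shape */

static void
__example_generate_and_finish( ident_t *loc, kmp_int32 gtid, kmpc_thunk_t *taskq_thunk, int ntasks )
{
    int i;

    for (i = 0; i < ntasks; i++) {
        /* allocate a thunk, fill in its private variables, enqueue it */
        kmpc_thunk_t *thunk = __kmpc_task_buffer( loc, gtid, taskq_thunk, __example_task_routine );

        /* ... initialize the task's private variables inside *thunk ... */

        (void) __kmpc_task( loc, gtid, thunk );
    }

    /* done generating: sets TQF_ALL_TASKS_QUEUED so waiters in __kmpc_end_taskq() can drain the queue */
    __kmpc_end_taskq_task( loc, gtid, taskq_thunk );

    /* end of the construct on this thread */
    __kmpc_end_taskq( loc, gtid, taskq_thunk );
}
#endif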
2053 
2054 /* --------------------------------------------------------------------------- */