Intel® OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 #include "kmp.h"
36 #include "kmp_i18n.h"
37 #include "kmp_io.h"
38 #include "kmp_str.h"
39 #include "kmp_wrapper_getpid.h"
40 #include "kmp_affinity.h"
41 
42 // Store the real or imagined machine hierarchy here
43 static hierarchy_info machine_hierarchy;
44 
45 void __kmp_cleanup_hierarchy() {
46  machine_hierarchy.fini();
47 }
48 
49 
50 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
51  kmp_uint32 depth;
52  // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
53  if (TCR_1(machine_hierarchy.uninitialized))
54  machine_hierarchy.init(NULL, nproc);
55 
56  depth = machine_hierarchy.depth;
57  KMP_DEBUG_ASSERT(depth > 0);
58  // Adjust the hierarchy in case num threads exceeds original
59  if (nproc > machine_hierarchy.skipPerLevel[depth-1])
60  machine_hierarchy.resize(nproc);
61 
62  thr_bar->depth = depth;
63  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
64  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
65 }
66 
67 #if KMP_AFFINITY_SUPPORTED
68 
69 //
70 // Print the affinity mask to the character array in a pretty format.
71 //
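// For illustration (hypothetical mask contents): a mask with procs 0, 1, and 5
// set prints as "{0,1,5}", an empty mask prints as "{<empty>}", and a set too
// large for the buffer is truncated with a trailing ",...", e.g. "{0,1,2,...}".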
72 char *
73 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
74 {
75  KMP_ASSERT(buf_len >= 40);
76  char *scan = buf;
77  char *end = buf + buf_len - 1;
78 
79  //
80  // Find first element / check for empty set.
81  //
82  size_t i;
83  for (i = 0; i < KMP_CPU_SETSIZE; i++) {
84  if (KMP_CPU_ISSET(i, mask)) {
85  break;
86  }
87  }
88  if (i == KMP_CPU_SETSIZE) {
89  KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
90  while (*scan != '\0') scan++;
91  KMP_ASSERT(scan <= end);
92  return buf;
93  }
94 
95  KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
96  while (*scan != '\0') scan++;
97  i++;
98  for (; i < KMP_CPU_SETSIZE; i++) {
99  if (! KMP_CPU_ISSET(i, mask)) {
100  continue;
101  }
102 
103  //
104  // Check for buffer overflow. A string of the form ",<n>" will have
105  // at most 10 characters, plus we want to leave room to print ",...}"
106  // if the set is too large to print, for a total of 15 characters.
107  // We already left room for '\0' in setting end.
108  //
109  if (end - scan < 15) {
110  break;
111  }
112  KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
113  while (*scan != '\0') scan++;
114  }
115  if (i < KMP_CPU_SETSIZE) {
116  KMP_SNPRINTF(scan, end-scan+1, ",...");
117  while (*scan != '\0') scan++;
118  }
119  KMP_SNPRINTF(scan, end-scan+1, "}");
120  while (*scan != '\0') scan++;
121  KMP_ASSERT(scan <= end);
122  return buf;
123 }
124 
125 
126 void
127 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
128 {
129  KMP_CPU_ZERO(mask);
130 
131 # if KMP_GROUP_AFFINITY
132 
133  if (__kmp_num_proc_groups > 1) {
134  int group;
135  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
136  for (group = 0; group < __kmp_num_proc_groups; group++) {
137  int i;
138  int num = __kmp_GetActiveProcessorCount(group);
139  for (i = 0; i < num; i++) {
140  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
141  }
142  }
143  }
144  else
145 
146 # endif /* KMP_GROUP_AFFINITY */
147 
148  {
149  int proc;
150  for (proc = 0; proc < __kmp_xproc; proc++) {
151  KMP_CPU_SET(proc, mask);
152  }
153  }
154 }
155 
156 //
157 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
158 // called to renumber the labels from [0..n] and place them into the child_num
159 // vector of the address object. This is done in case the labels used for
160 // the children at one node of the hierarchy differ from those used for
161 // another node at the same level. Example: suppose the machine has 2 nodes
162 // with 2 packages each. The first node contains packages 601 and 602, and
163  // the second node contains packages 603 and 604. If we try to sort the table
164 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
165 // because we are paying attention to the labels themselves, not the ordinal
166 // child numbers. By using the child numbers in the sort, the result is
167 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
168 //
169 static void
170 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
171  int numAddrs)
172 {
173  KMP_DEBUG_ASSERT(numAddrs > 0);
174  int depth = address2os->first.depth;
175  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
176  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
177  * sizeof(unsigned));
178  int labCt;
179  for (labCt = 0; labCt < depth; labCt++) {
180  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
181  lastLabel[labCt] = address2os[0].first.labels[labCt];
182  }
183  int i;
184  for (i = 1; i < numAddrs; i++) {
185  for (labCt = 0; labCt < depth; labCt++) {
186  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
187  int labCt2;
188  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
189  counts[labCt2] = 0;
190  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
191  }
192  counts[labCt]++;
193  lastLabel[labCt] = address2os[i].first.labels[labCt];
194  break;
195  }
196  }
197  for (labCt = 0; labCt < depth; labCt++) {
198  address2os[i].first.childNums[labCt] = counts[labCt];
199  }
200  for (; labCt < (int)Address::maxDepth; labCt++) {
201  address2os[i].first.childNums[labCt] = 0;
202  }
203  }
204 }
205 
206 
207 //
208 // All of the __kmp_affinity_create_*_map() routines should set
209 // __kmp_affinity_masks to a vector of affinity mask objects of length
210 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
211 // return the number of levels in the machine topology tree (zero if
212 // __kmp_affinity_type == affinity_none).
213 //
214 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
215 // to the affinity mask for the initialization thread. They need to save and
216 // restore the mask, and it could be needed later, so saving it is just an
217  // optimization to avoid calling __kmp_get_system_affinity() again.
218 //
219 static kmp_affin_mask_t *fullMask = NULL;
220 
221 kmp_affin_mask_t *
222 __kmp_affinity_get_fullMask() { return fullMask; }
223 
224 
225 static int nCoresPerPkg, nPackages;
226 static int __kmp_nThreadsPerCore;
227 #ifndef KMP_DFLT_NTH_CORES
228 static int __kmp_ncores;
229 #endif
230 
231 //
232 // __kmp_affinity_uniform_topology() doesn't work when called from
233 // places which support arbitrarily many levels in the machine topology
234 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
235  // or __kmp_affinity_create_x2apicid_map().
236 //
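// As a hypothetical example: with nPackages == 2, nCoresPerPkg == 8, and
// __kmp_nThreadsPerCore == 2, the topology is uniform only if
// __kmp_avail_proc == 2 * 8 * 2 == 32; if some procs are excluded or the
// packages are asymmetric, the product no longer matches and this returns false.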
237 inline static bool
238 __kmp_affinity_uniform_topology()
239 {
240  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
241 }
242 
243 
244 //
245 // Print out the detailed machine topology map, i.e. the physical locations
246 // of each OS proc.
247 //
248 static void
249 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
250  int pkgLevel, int coreLevel, int threadLevel)
251 {
252  int proc;
253 
254  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
255  for (proc = 0; proc < len; proc++) {
256  int level;
257  kmp_str_buf_t buf;
258  __kmp_str_buf_init(&buf);
259  for (level = 0; level < depth; level++) {
260  if (level == threadLevel) {
261  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
262  }
263  else if (level == coreLevel) {
264  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
265  }
266  else if (level == pkgLevel) {
267  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
268  }
269  else if (level > pkgLevel) {
270  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
271  level - pkgLevel - 1);
272  }
273  else {
274  __kmp_str_buf_print(&buf, "L%d ", level);
275  }
276  __kmp_str_buf_print(&buf, "%d ",
277  address2os[proc].first.labels[level]);
278  }
279  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
280  buf.str);
281  __kmp_str_buf_free(&buf);
282  }
283 }
284 
285 
286 //
287 // If we don't know how to retrieve the machine's processor topology, or
288 // encounter an error in doing so, this routine is called to form a "flat"
289 // mapping of os thread id's <-> processor id's.
290 //
291 static int
292 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
293  kmp_i18n_id_t *const msg_id)
294 {
295  *address2os = NULL;
296  *msg_id = kmp_i18n_null;
297 
298  //
299  // Even if __kmp_affinity_type == affinity_none, this routine might still
300  // be called to set __kmp_ncores, as well as
301  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
302  //
303  if (! KMP_AFFINITY_CAPABLE()) {
304  KMP_ASSERT(__kmp_affinity_type == affinity_none);
305  __kmp_ncores = nPackages = __kmp_xproc;
306  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
307  if (__kmp_affinity_verbose) {
308  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
309  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
310  KMP_INFORM(Uniform, "KMP_AFFINITY");
311  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
312  __kmp_nThreadsPerCore, __kmp_ncores);
313  }
314  return 0;
315  }
316 
317  //
318  // When affinity is off, this routine will still be called to set
319  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
320  // nCoresPerPkg, & nPackages. Make sure all these vars are set
321  // correctly, and return now if affinity is not enabled.
322  //
323  __kmp_ncores = nPackages = __kmp_avail_proc;
324  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
325  if (__kmp_affinity_verbose) {
326  char buf[KMP_AFFIN_MASK_PRINT_LEN];
327  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
328 
329  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
330  if (__kmp_affinity_respect_mask) {
331  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
332  } else {
333  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
334  }
335  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
336  KMP_INFORM(Uniform, "KMP_AFFINITY");
337  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
338  __kmp_nThreadsPerCore, __kmp_ncores);
339  }
340  if (__kmp_affinity_type == affinity_none) {
341  return 0;
342  }
343 
344  //
345  // Construct the data structure to be returned.
346  //
347  *address2os = (AddrUnsPair*)
348  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
349  int avail_ct = 0;
350  unsigned int i;
351  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
352  //
353  // Skip this proc if it is not included in the machine model.
354  //
355  if (! KMP_CPU_ISSET(i, fullMask)) {
356  continue;
357  }
358 
359  Address addr(1);
360  addr.labels[0] = i;
361  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
362  }
363  if (__kmp_affinity_verbose) {
364  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
365  }
366 
367  if (__kmp_affinity_gran_levels < 0) {
368  //
369  // Only the package level is modeled in the machine topology map,
370  // so the #levels of granularity is either 0 or 1.
371  //
372  if (__kmp_affinity_gran > affinity_gran_package) {
373  __kmp_affinity_gran_levels = 1;
374  }
375  else {
376  __kmp_affinity_gran_levels = 0;
377  }
378  }
379  return 1;
380 }
381 
382 
383 # if KMP_GROUP_AFFINITY
384 
385 //
386 // If multiple Windows* OS processor groups exist, we can create a 2-level
387 // topology map with the groups at level 0 and the individual procs at
388 // level 1.
389 //
390 // This facilitates letting the threads float among all procs in a group,
391 // if granularity=group (the default when there are multiple groups).
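//
// For illustration, assuming a 64-bit Windows* OS where DWORD_PTR is 64 bits
// (so 64 procs per group): OS proc 70 maps to labels {70 / 64, 70 % 64},
// i.e. {1, 6} (group 1, proc 6 within that group), matching the address
// construction in the loop below.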
392 //
393 static int
394 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
395  kmp_i18n_id_t *const msg_id)
396 {
397  *address2os = NULL;
398  *msg_id = kmp_i18n_null;
399 
400  //
401  // If we don't have multiple processor groups, return now.
402  // The flat mapping will be used.
403  //
404  if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
405  // FIXME set *msg_id
406  return -1;
407  }
408 
409  //
410  // Construct the data structure to be returned.
411  //
412  *address2os = (AddrUnsPair*)
413  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
414  int avail_ct = 0;
415  int i;
416  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
417  //
418  // Skip this proc if it is not included in the machine model.
419  //
420  if (! KMP_CPU_ISSET(i, fullMask)) {
421  continue;
422  }
423 
424  Address addr(2);
425  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
426  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
427  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
428 
429  if (__kmp_affinity_verbose) {
430  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
431  addr.labels[1]);
432  }
433  }
434 
435  if (__kmp_affinity_gran_levels < 0) {
436  if (__kmp_affinity_gran == affinity_gran_group) {
437  __kmp_affinity_gran_levels = 1;
438  }
439  else if ((__kmp_affinity_gran == affinity_gran_fine)
440  || (__kmp_affinity_gran == affinity_gran_thread)) {
441  __kmp_affinity_gran_levels = 0;
442  }
443  else {
444  const char *gran_str = NULL;
445  if (__kmp_affinity_gran == affinity_gran_core) {
446  gran_str = "core";
447  }
448  else if (__kmp_affinity_gran == affinity_gran_package) {
449  gran_str = "package";
450  }
451  else if (__kmp_affinity_gran == affinity_gran_node) {
452  gran_str = "node";
453  }
454  else {
455  KMP_ASSERT(0);
456  }
457 
458  // Warning: can't use affinity granularity "gran" with the group topology method; using "thread" granularity instead.
459  __kmp_affinity_gran_levels = 0;
460  }
461  }
462  return 2;
463 }
464 
465 # endif /* KMP_GROUP_AFFINITY */
466 
467 
468 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
469 
470 static int
471 __kmp_cpuid_mask_width(int count) {
472  int r = 0;
473 
474  while((1<<r) < count)
475  ++r;
476  return r;
477 }
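// The helper above returns the smallest r such that (1 << r) >= count, i.e. the
// number of bits needed to encode "count" distinct ids. For example
// (hypothetical counts): count == 6 gives 3, count == 8 gives 3, count == 16
// gives 4, and count == 1 gives 0.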
478 
479 
480 class apicThreadInfo {
481 public:
482  unsigned osId; // param to __kmp_affinity_bind_thread
483  unsigned apicId; // from cpuid after binding
484  unsigned maxCoresPerPkg; // ""
485  unsigned maxThreadsPerPkg; // ""
486  unsigned pkgId; // inferred from above values
487  unsigned coreId; // ""
488  unsigned threadId; // ""
489 };
490 
491 
492 static int
493 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
494 {
495  const apicThreadInfo *aa = (const apicThreadInfo *)a;
496  const apicThreadInfo *bb = (const apicThreadInfo *)b;
497  if (aa->osId < bb->osId) return -1;
498  if (aa->osId > bb->osId) return 1;
499  return 0;
500 }
501 
502 
503 static int
504 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
505 {
506  const apicThreadInfo *aa = (const apicThreadInfo *)a;
507  const apicThreadInfo *bb = (const apicThreadInfo *)b;
508  if (aa->pkgId < bb->pkgId) return -1;
509  if (aa->pkgId > bb->pkgId) return 1;
510  if (aa->coreId < bb->coreId) return -1;
511  if (aa->coreId > bb->coreId) return 1;
512  if (aa->threadId < bb->threadId) return -1;
513  if (aa->threadId > bb->threadId) return 1;
514  return 0;
515 }
516 
517 
518 //
519 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
520 // an algorithm which cycles through the available os threads, setting
521 // the current thread's affinity mask to that thread, and then retrieves
522 // the Apic Id for each thread context using the cpuid instruction.
523 //
524 static int
525 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
526  kmp_i18n_id_t *const msg_id)
527 {
528  kmp_cpuid buf;
529  int rc;
530  *address2os = NULL;
531  *msg_id = kmp_i18n_null;
532 
533  //
534  // Check if cpuid leaf 4 is supported.
535  //
536  __kmp_x86_cpuid(0, 0, &buf);
537  if (buf.eax < 4) {
538  *msg_id = kmp_i18n_str_NoLeaf4Support;
539  return -1;
540  }
541 
542  //
543  // The algorithm used starts by setting the affinity to each available
544  // thread and retrieving info from the cpuid instruction, so if we are
545  // not capable of calling __kmp_get_system_affinity() and
546  // __kmp_set_system_affinity(), then we need to do something else - use
547  // the defaults that we calculated from issuing cpuid without binding
548  // to each proc.
549  //
550  if (! KMP_AFFINITY_CAPABLE()) {
551  //
552  // Hack to try and infer the machine topology using only the data
553  // available from cpuid on the current thread, and __kmp_xproc.
554  //
555  KMP_ASSERT(__kmp_affinity_type == affinity_none);
556 
557  //
558  // Get an upper bound on the number of threads per package using
559  // cpuid(1).
560  //
561  // On some OS/chip combinations where HT is supported by the chip
562  // but is disabled, this value will be 2 on a single core chip.
563  // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
564  //
565  __kmp_x86_cpuid(1, 0, &buf);
566  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
567  if (maxThreadsPerPkg == 0) {
568  maxThreadsPerPkg = 1;
569  }
570 
571  //
572  // The num cores per pkg comes from cpuid(4).
573  // 1 must be added to the encoded value.
574  //
575  // The author of cpu_count.cpp treated this as only an upper bound
576  // on the number of cores, but I haven't seen any cases where it
577  // was greater than the actual number of cores, so we will treat
578  // it as exact in this block of code.
579  //
580  // First, we need to check if cpuid(4) is supported on this chip.
581  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
582  // has the value n or greater.
583  //
584  __kmp_x86_cpuid(0, 0, &buf);
585  if (buf.eax >= 4) {
586  __kmp_x86_cpuid(4, 0, &buf);
587  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
588  }
589  else {
590  nCoresPerPkg = 1;
591  }
592 
593  //
594  // There is no way to reliably tell if HT is enabled without issuing
595  // the cpuid instruction from every thread, and correlating the cpuid
596  // info, so if the machine is not affinity capable, we assume that HT
597  // is off. We have seen quite a few machines where maxThreadsPerPkg
598  // is 2, yet the machine does not support HT.
599  //
600  // - Older OSes are usually found on machines with older chips, which
601  // do not support HT.
602  //
603  // - The performance penalty for mistakenly identifying a machine as
604  // HT when it isn't (which results in blocktime being incorrectly set
605  // to 0) is greater than the penalty for mistakenly identifying
606  // a machine as being 1 thread/core when it is really HT enabled
607  // (which results in blocktime being incorrectly set to a positive
608  // value).
609  //
610  __kmp_ncores = __kmp_xproc;
611  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
612  __kmp_nThreadsPerCore = 1;
613  if (__kmp_affinity_verbose) {
614  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
615  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
616  if (__kmp_affinity_uniform_topology()) {
617  KMP_INFORM(Uniform, "KMP_AFFINITY");
618  } else {
619  KMP_INFORM(NonUniform, "KMP_AFFINITY");
620  }
621  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
622  __kmp_nThreadsPerCore, __kmp_ncores);
623  }
624  return 0;
625  }
626 
627  //
628  //
629  // From here on, we can assume that it is safe to call
630  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
631  // even if __kmp_affinity_type = affinity_none.
632  //
633 
634  //
635  // Save the affinity mask for the current thread.
636  //
637  kmp_affin_mask_t *oldMask;
638  KMP_CPU_ALLOC(oldMask);
639  KMP_ASSERT(oldMask != NULL);
640  __kmp_get_system_affinity(oldMask, TRUE);
641 
642  //
643  // Run through each of the available contexts, binding the current thread
644  // to it, and obtaining the pertinent information using the cpuid instr.
645  //
646  // The relevant information is:
647  //
648  // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
649  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
650  //
651  // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
652  // value of this field determines the width of the core# + thread#
653  // fields in the Apic Id. It is also an upper bound on the number
654  // of threads per package, but it has been verified that situations
655  // happen where it is not exact. In particular, on certain OS/chip
656  // combinations where Intel(R) Hyper-Threading Technology is supported
657  // by the chip but has
658  // been disabled, the value of this field will be 2 (for a single core
659  // chip). On other OS/chip combinations supporting
660  // Intel(R) Hyper-Threading Technology, the value of
661  // this field will be 1 when Intel(R) Hyper-Threading Technology is
662  // disabled and 2 when it is enabled.
663  //
664  // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
665  // value of this field (+1) determines the width of the core# field in
666  // the Apic Id. The comments in "cpucount.cpp" say that this value is
667  // an upper bound, but the IA-32 architecture manual says that it is
668  // exactly the number of cores per package, and I haven't seen any
669  // case where it wasn't.
670  //
671  // From this information, deduce the package Id, core Id, and thread Id,
672  // and set the corresponding fields in the apicThreadInfo struct.
673  //
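// As a worked example with hypothetical cpuid values: if maxThreadsPerPkg == 16
// and maxCoresPerPkg == 8, then widthCT == 4, widthC == 3, and widthT == 1.
// An Apic Id of 0x1D (binary 11101) then decomposes as pkgId == 0x1D >> 4 == 1,
// coreId == (0x1D >> 1) & 0x7 == 6, and threadId == 0x1D & 0x1 == 1.
//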
674  unsigned i;
675  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
676  __kmp_avail_proc * sizeof(apicThreadInfo));
677  unsigned nApics = 0;
678  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
679  //
680  // Skip this proc if it is not included in the machine model.
681  //
682  if (! KMP_CPU_ISSET(i, fullMask)) {
683  continue;
684  }
685  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
686 
687  __kmp_affinity_bind_thread(i);
688  threadInfo[nApics].osId = i;
689 
690  //
691  // The apic id and max threads per pkg come from cpuid(1).
692  //
693  __kmp_x86_cpuid(1, 0, &buf);
694  if (! ((buf.edx >> 9) & 1)) { // test CPUID.1:EDX bit 9 (on-chip APIC present)
695  __kmp_set_system_affinity(oldMask, TRUE);
696  __kmp_free(threadInfo);
697  KMP_CPU_FREE(oldMask);
698  *msg_id = kmp_i18n_str_ApicNotPresent;
699  return -1;
700  }
701  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
702  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
703  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
704  threadInfo[nApics].maxThreadsPerPkg = 1;
705  }
706 
707  //
708  // Max cores per pkg comes from cpuid(4).
709  // 1 must be added to the encoded value.
710  //
711  // First, we need to check if cpuid(4) is supported on this chip.
712  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
713  // has the value n or greater.
714  //
715  __kmp_x86_cpuid(0, 0, &buf);
716  if (buf.eax >= 4) {
717  __kmp_x86_cpuid(4, 0, &buf);
718  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
719  }
720  else {
721  threadInfo[nApics].maxCoresPerPkg = 1;
722  }
723 
724  //
725  // Infer the pkgId / coreId / threadId using only the info
726  // obtained locally.
727  //
728  int widthCT = __kmp_cpuid_mask_width(
729  threadInfo[nApics].maxThreadsPerPkg);
730  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
731 
732  int widthC = __kmp_cpuid_mask_width(
733  threadInfo[nApics].maxCoresPerPkg);
734  int widthT = widthCT - widthC;
735  if (widthT < 0) {
736  //
737  // I've never seen this one happen, but I suppose it could, if
738  // the cpuid instruction on a chip was really screwed up.
739  // Make sure to restore the affinity mask before the tail call.
740  //
741  __kmp_set_system_affinity(oldMask, TRUE);
742  __kmp_free(threadInfo);
743  KMP_CPU_FREE(oldMask);
744  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
745  return -1;
746  }
747 
748  int maskC = (1 << widthC) - 1;
749  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
750  &maskC;
751 
752  int maskT = (1 << widthT) - 1;
753  threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
754 
755  nApics++;
756  }
757 
758  //
759  // We've collected all the info we need.
760  // Restore the old affinity mask for this thread.
761  //
762  __kmp_set_system_affinity(oldMask, TRUE);
763 
764  //
765  // If there's only one thread context to bind to, form an Address object
766  // with depth 1 and return immediately (or, if affinity is off, set
767  // address2os to NULL and return).
768  //
769  // If it is configured to omit the package level when there is only a
770  // single package, the logic at the end of this routine won't work if
771  // there is only a single thread - it would try to form an Address
772  // object with depth 0.
773  //
774  KMP_ASSERT(nApics > 0);
775  if (nApics == 1) {
776  __kmp_ncores = nPackages = 1;
777  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
778  if (__kmp_affinity_verbose) {
779  char buf[KMP_AFFIN_MASK_PRINT_LEN];
780  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
781 
782  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
783  if (__kmp_affinity_respect_mask) {
784  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
785  } else {
786  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
787  }
788  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
789  KMP_INFORM(Uniform, "KMP_AFFINITY");
790  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
791  __kmp_nThreadsPerCore, __kmp_ncores);
792  }
793 
794  if (__kmp_affinity_type == affinity_none) {
795  __kmp_free(threadInfo);
796  KMP_CPU_FREE(oldMask);
797  return 0;
798  }
799 
800  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
801  Address addr(1);
802  addr.labels[0] = threadInfo[0].pkgId;
803  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
804 
805  if (__kmp_affinity_gran_levels < 0) {
806  __kmp_affinity_gran_levels = 0;
807  }
808 
809  if (__kmp_affinity_verbose) {
810  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
811  }
812 
813  __kmp_free(threadInfo);
814  KMP_CPU_FREE(oldMask);
815  return 1;
816  }
817 
818  //
819  // Sort the threadInfo table by physical Id.
820  //
821  qsort(threadInfo, nApics, sizeof(*threadInfo),
822  __kmp_affinity_cmp_apicThreadInfo_phys_id);
823 
824  //
825  // The table is now sorted by pkgId / coreId / threadId, but we really
826  // don't know the radix of any of the fields. pkgId's may be sparsely
827  // assigned among the chips on a system. Although coreId's are usually
828  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
829  // [0..threadsPerCore-1], we don't want to make any such assumptions.
830  //
831  // For that matter, we don't know what coresPerPkg and threadsPerCore
832  // (or the total # packages) are at this point - we want to determine
833  // that now. We only have an upper bound on the first two figures.
834  //
835  // We also perform a consistency check at this point: the values returned
836  // by the cpuid instruction for any thread bound to a given package had
837  // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
838  //
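// As a small worked example (hypothetical ids): for a sorted table
// (pkg,core,thread) = (0,0,0) (0,0,1) (0,1,0) (0,1,1) (1,0,0) (1,0,1) (1,1,0) (1,1,1),
// the loop below finishes with nPackages == 2, nCoresPerPkg == 2,
// __kmp_nThreadsPerCore == 2, and nCores == 4.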
839  nPackages = 1;
840  nCoresPerPkg = 1;
841  __kmp_nThreadsPerCore = 1;
842  unsigned nCores = 1;
843 
844  unsigned pkgCt = 1; // to determine radii
845  unsigned lastPkgId = threadInfo[0].pkgId;
846  unsigned coreCt = 1;
847  unsigned lastCoreId = threadInfo[0].coreId;
848  unsigned threadCt = 1;
849  unsigned lastThreadId = threadInfo[0].threadId;
850 
851  // intra-pkg consistency checks
852  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
853  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
854 
855  for (i = 1; i < nApics; i++) {
856  if (threadInfo[i].pkgId != lastPkgId) {
857  nCores++;
858  pkgCt++;
859  lastPkgId = threadInfo[i].pkgId;
860  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
861  coreCt = 1;
862  lastCoreId = threadInfo[i].coreId;
863  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
864  threadCt = 1;
865  lastThreadId = threadInfo[i].threadId;
866 
867  //
868  // This is a different package, so go on to the next iteration
869  // without doing any consistency checks. Reset the consistency
870  // check vars, though.
871  //
872  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
873  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
874  continue;
875  }
876 
877  if (threadInfo[i].coreId != lastCoreId) {
878  nCores++;
879  coreCt++;
880  lastCoreId = threadInfo[i].coreId;
881  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
882  threadCt = 1;
883  lastThreadId = threadInfo[i].threadId;
884  }
885  else if (threadInfo[i].threadId != lastThreadId) {
886  threadCt++;
887  lastThreadId = threadInfo[i].threadId;
888  }
889  else {
890  __kmp_free(threadInfo);
891  KMP_CPU_FREE(oldMask);
892  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
893  return -1;
894  }
895 
896  //
897  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
898  // fields agree between all the threads bound to a given package.
899  //
900  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
901  || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
902  __kmp_free(threadInfo);
903  KMP_CPU_FREE(oldMask);
904  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
905  return -1;
906  }
907  }
908  nPackages = pkgCt;
909  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
910  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
911 
912  //
913  // When affinity is off, this routine will still be called to set
914  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
915  // nCoresPerPkg, & nPackages. Make sure all these vars are set
916  // correctly, and return now if affinity is not enabled.
917  //
918  __kmp_ncores = nCores;
919  if (__kmp_affinity_verbose) {
920  char buf[KMP_AFFIN_MASK_PRINT_LEN];
921  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
922 
923  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
924  if (__kmp_affinity_respect_mask) {
925  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
926  } else {
927  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
928  }
929  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
930  if (__kmp_affinity_uniform_topology()) {
931  KMP_INFORM(Uniform, "KMP_AFFINITY");
932  } else {
933  KMP_INFORM(NonUniform, "KMP_AFFINITY");
934  }
935  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
936  __kmp_nThreadsPerCore, __kmp_ncores);
937 
938  }
939 
940  if (__kmp_affinity_type == affinity_none) {
941  __kmp_free(threadInfo);
942  KMP_CPU_FREE(oldMask);
943  return 0;
944  }
945 
946  //
947  // Now that we've determined the number of packages, the number of cores
948  // per package, and the number of threads per core, we can construct the
949  // data structure that is to be returned.
950  //
951  int pkgLevel = 0;
952  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
953  int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
954  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
955 
956  KMP_ASSERT(depth > 0);
957  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
958 
959  for (i = 0; i < nApics; ++i) {
960  Address addr(depth);
961  unsigned os = threadInfo[i].osId;
962  int d = 0;
963 
964  if (pkgLevel >= 0) {
965  addr.labels[d++] = threadInfo[i].pkgId;
966  }
967  if (coreLevel >= 0) {
968  addr.labels[d++] = threadInfo[i].coreId;
969  }
970  if (threadLevel >= 0) {
971  addr.labels[d++] = threadInfo[i].threadId;
972  }
973  (*address2os)[i] = AddrUnsPair(addr, os);
974  }
975 
976  if (__kmp_affinity_gran_levels < 0) {
977  //
978  // Set the granularity level based on what levels are modeled
979  // in the machine topology map.
980  //
981  __kmp_affinity_gran_levels = 0;
982  if ((threadLevel >= 0)
983  && (__kmp_affinity_gran > affinity_gran_thread)) {
984  __kmp_affinity_gran_levels++;
985  }
986  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
987  __kmp_affinity_gran_levels++;
988  }
989  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
990  __kmp_affinity_gran_levels++;
991  }
992  }
993 
994  if (__kmp_affinity_verbose) {
995  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
996  coreLevel, threadLevel);
997  }
998 
999  __kmp_free(threadInfo);
1000  KMP_CPU_FREE(oldMask);
1001  return depth;
1002 }
1003 
1004 
1005 //
1006 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1007 // architectures support a newer interface for specifying the x2APIC Ids,
1008 // based on cpuid leaf 11.
1009 //
1010 static int
1011 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1012  kmp_i18n_id_t *const msg_id)
1013 {
1014  kmp_cpuid buf;
1015  *address2os = NULL;
1016  *msg_id = kmp_i18n_null;
1017 
1018  //
1019  // Check to see if cpuid leaf 11 is supported.
1020  //
1021  __kmp_x86_cpuid(0, 0, &buf);
1022  if (buf.eax < 11) {
1023  *msg_id = kmp_i18n_str_NoLeaf11Support;
1024  return -1;
1025  }
1026  __kmp_x86_cpuid(11, 0, &buf);
1027  if (buf.ebx == 0) {
1028  *msg_id = kmp_i18n_str_NoLeaf11Support;
1029  return -1;
1030  }
1031 
1032  //
1033  // Find the number of levels in the machine topology. While we're at it,
1034  // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1035  // try to get more accurate values later by explicitly counting them,
1036  // but get reasonable defaults now, in case we return early.
1037  //
1038  int level;
1039  int threadLevel = -1;
1040  int coreLevel = -1;
1041  int pkgLevel = -1;
1042  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1043 
1044  for (level = 0;; level++) {
1045  if (level > 31) {
1046  //
1047  // FIXME: Hack for DPD200163180
1048  //
1049  // If level is big then something went wrong -> exiting
1050  //
1051  // There could actually be 32 valid levels in the machine topology,
1052  // but so far, the only machine we have seen which does not exit
1053  // this loop before iteration 32 has fubar x2APIC settings.
1054  //
1055  // For now, just reject this case based upon loop trip count.
1056  //
1057  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1058  return -1;
1059  }
1060  __kmp_x86_cpuid(11, level, &buf);
1061  if (buf.ebx == 0) {
1062  if (pkgLevel < 0) {
1063  //
1064  // Will infer nPackages from __kmp_xproc
1065  //
1066  pkgLevel = level;
1067  level++;
1068  }
1069  break;
1070  }
1071  int kind = (buf.ecx >> 8) & 0xff;
1072  if (kind == 1) {
1073  //
1074  // SMT level
1075  //
1076  threadLevel = level;
1077  coreLevel = -1;
1078  pkgLevel = -1;
1079  __kmp_nThreadsPerCore = buf.ebx & 0xff;
1080  if (__kmp_nThreadsPerCore == 0) {
1081  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1082  return -1;
1083  }
1084  }
1085  else if (kind == 2) {
1086  //
1087  // core level
1088  //
1089  coreLevel = level;
1090  pkgLevel = -1;
1091  nCoresPerPkg = buf.ebx & 0xff;
1092  if (nCoresPerPkg == 0) {
1093  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1094  return -1;
1095  }
1096  }
1097  else {
1098  if (level <= 0) {
1099  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1100  return -1;
1101  }
1102  if (pkgLevel >= 0) {
1103  continue;
1104  }
1105  pkgLevel = level;
1106  nPackages = buf.ebx & 0xff;
1107  if (nPackages == 0) {
1108  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1109  return -1;
1110  }
1111  }
1112  }
1113  int depth = level;
1114 
1115  //
1116  // In the above loop, "level" was counted from the finest level (usually
1117  // thread) to the coarsest. The caller expects that we will place the
1118  // labels in (*address2os)[].first.labels[] in the inverse order, so
1119  // we need to invert the vars saying which level means what.
1120  //
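// For example (hypothetical leaf 11 results): with depth == 3 and the loop
// above finding threadLevel == 0, coreLevel == 1, and pkgLevel == 2, the
// inversion below yields threadLevel == 2, coreLevel == 1, and pkgLevel == 0,
// so the package label ends up first in labels[] and the thread label last.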
1121  if (threadLevel >= 0) {
1122  threadLevel = depth - threadLevel - 1;
1123  }
1124  if (coreLevel >= 0) {
1125  coreLevel = depth - coreLevel - 1;
1126  }
1127  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1128  pkgLevel = depth - pkgLevel - 1;
1129 
1130  //
1131  // The algorithm used starts by setting the affinity to each available
1132  // thread and retrieving info from the cpuid instruction, so if we are
1133  // not capable of calling __kmp_get_system_affinity() and
1134  // __kmp_set_system_affinity(), then we need to do something else - use
1135  // the defaults that we calculated from issuing cpuid without binding
1136  // to each proc.
1137  //
1138  if (! KMP_AFFINITY_CAPABLE())
1139  {
1140  //
1141  // Hack to try and infer the machine topology using only the data
1142  // available from cpuid on the current thread, and __kmp_xproc.
1143  //
1144  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1145 
1146  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1147  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1148  if (__kmp_affinity_verbose) {
1149  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1150  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1151  if (__kmp_affinity_uniform_topology()) {
1152  KMP_INFORM(Uniform, "KMP_AFFINITY");
1153  } else {
1154  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1155  }
1156  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1157  __kmp_nThreadsPerCore, __kmp_ncores);
1158  }
1159  return 0;
1160  }
1161 
1162  //
1163  //
1164  // From here on, we can assume that it is safe to call
1165  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1166  // even if __kmp_affinity_type = affinity_none.
1167  //
1168 
1169  //
1170  // Save the affinity mask for the current thread.
1171  //
1172  kmp_affin_mask_t *oldMask;
1173  KMP_CPU_ALLOC(oldMask);
1174  __kmp_get_system_affinity(oldMask, TRUE);
1175 
1176  //
1177  // Allocate the data structure to be returned.
1178  //
1179  AddrUnsPair *retval = (AddrUnsPair *)
1180  __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1181 
1182  //
1183  // Run through each of the available contexts, binding the current thread
1184  // to it, and obtaining the pertinent information using the cpuid instr.
1185  //
1186  unsigned int proc;
1187  int nApics = 0;
1188  for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1189  //
1190  // Skip this proc if it is not included in the machine model.
1191  //
1192  if (! KMP_CPU_ISSET(proc, fullMask)) {
1193  continue;
1194  }
1195  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1196 
1197  __kmp_affinity_bind_thread(proc);
1198 
1199  //
1200  // Extract the labels for each level in the machine topology map
1201  // from the Apic ID.
1202  //
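// For illustration (hypothetical shift widths): if leaf 11 reports a shift of
// 1 at the SMT level and 5 at the core level, then for x2APIC id 0x2B the
// thread label is 0x2B & 0x1 == 1, the core label is (0x2B & 0x1F) >> 1 == 5,
// and the package label is 0x2B >> 5 == 1.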
1203  Address addr(depth);
1204  int prev_shift = 0;
1205 
1206  for (level = 0; level < depth; level++) {
1207  __kmp_x86_cpuid(11, level, &buf);
1208  unsigned apicId = buf.edx;
1209  if (buf.ebx == 0) {
1210  if (level != depth - 1) {
1211  KMP_CPU_FREE(oldMask);
1212  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1213  return -1;
1214  }
1215  addr.labels[depth - level - 1] = apicId >> prev_shift;
1216  level++;
1217  break;
1218  }
1219  int shift = buf.eax & 0x1f;
1220  int mask = (1 << shift) - 1;
1221  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1222  prev_shift = shift;
1223  }
1224  if (level != depth) {
1225  KMP_CPU_FREE(oldMask);
1226  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1227  return -1;
1228  }
1229 
1230  retval[nApics] = AddrUnsPair(addr, proc);
1231  nApics++;
1232  }
1233 
1234  //
1235  // We've collected all the info we need.
1236  // Restore the old affinity mask for this thread.
1237  //
1238  __kmp_set_system_affinity(oldMask, TRUE);
1239 
1240  //
1241  // If there's only one thread context to bind to, return now.
1242  //
1243  KMP_ASSERT(nApics > 0);
1244  if (nApics == 1) {
1245  __kmp_ncores = nPackages = 1;
1246  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1247  if (__kmp_affinity_verbose) {
1248  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1249  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1250 
1251  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1252  if (__kmp_affinity_respect_mask) {
1253  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1254  } else {
1255  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1256  }
1257  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1258  KMP_INFORM(Uniform, "KMP_AFFINITY");
1259  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1260  __kmp_nThreadsPerCore, __kmp_ncores);
1261  }
1262 
1263  if (__kmp_affinity_type == affinity_none) {
1264  __kmp_free(retval);
1265  KMP_CPU_FREE(oldMask);
1266  return 0;
1267  }
1268 
1269  //
1270  // Form an Address object which only includes the package level.
1271  //
1272  Address addr(1);
1273  addr.labels[0] = retval[0].first.labels[pkgLevel];
1274  retval[0].first = addr;
1275 
1276  if (__kmp_affinity_gran_levels < 0) {
1277  __kmp_affinity_gran_levels = 0;
1278  }
1279 
1280  if (__kmp_affinity_verbose) {
1281  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1282  }
1283 
1284  *address2os = retval;
1285  KMP_CPU_FREE(oldMask);
1286  return 1;
1287  }
1288 
1289  //
1290  // Sort the table by physical Id.
1291  //
1292  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1293 
1294  //
1295  // Find the radix at each of the levels.
1296  //
1297  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1298  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1299  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1300  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1301  for (level = 0; level < depth; level++) {
1302  totals[level] = 1;
1303  maxCt[level] = 1;
1304  counts[level] = 1;
1305  last[level] = retval[0].first.labels[level];
1306  }
1307 
1308  //
1309  // From here on, the iteration variable "level" runs from the finest
1310  // level to the coarsest, i.e. we iterate forward through
1311  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1312  // backwards.
1313  //
1314  for (proc = 1; (int)proc < nApics; proc++) {
1315  int level;
1316  for (level = 0; level < depth; level++) {
1317  if (retval[proc].first.labels[level] != last[level]) {
1318  int j;
1319  for (j = level + 1; j < depth; j++) {
1320  totals[j]++;
1321  counts[j] = 1;
1322  // The line below causes incorrect topology information to be printed
1323  // when the maximum value for some level (maxCt[level]) is encountered
1324  // earlier in the array than a smaller value.
1325  // For example, suppose pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2,
1326  // whereas it should be 4.
1327  // TODO!!! Check if it can be commented safely
1328  //maxCt[j] = 1;
1329  last[j] = retval[proc].first.labels[j];
1330  }
1331  totals[level]++;
1332  counts[level]++;
1333  if (counts[level] > maxCt[level]) {
1334  maxCt[level] = counts[level];
1335  }
1336  last[level] = retval[proc].first.labels[level];
1337  break;
1338  }
1339  else if (level == depth - 1) {
1340  __kmp_free(last);
1341  __kmp_free(maxCt);
1342  __kmp_free(counts);
1343  __kmp_free(totals);
1344  __kmp_free(retval);
1345  KMP_CPU_FREE(oldMask);
1346  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1347  return -1;
1348  }
1349  }
1350  }
1351 
1352  //
1353  // When affinity is off, this routine will still be called to set
1354  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1355  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1356  // correctly, and return if affinity is not enabled.
1357  //
1358  if (threadLevel >= 0) {
1359  __kmp_nThreadsPerCore = maxCt[threadLevel];
1360  }
1361  else {
1362  __kmp_nThreadsPerCore = 1;
1363  }
1364  nPackages = totals[pkgLevel];
1365 
1366  if (coreLevel >= 0) {
1367  __kmp_ncores = totals[coreLevel];
1368  nCoresPerPkg = maxCt[coreLevel];
1369  }
1370  else {
1371  __kmp_ncores = nPackages;
1372  nCoresPerPkg = 1;
1373  }
1374 
1375  //
1376  // Check to see if the machine topology is uniform
1377  //
1378  unsigned prod = maxCt[0];
1379  for (level = 1; level < depth; level++) {
1380  prod *= maxCt[level];
1381  }
1382  bool uniform = (prod == totals[level - 1]);
1383 
1384  //
1385  // Print the machine topology summary.
1386  //
1387  if (__kmp_affinity_verbose) {
1388  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1389  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1390 
1391  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1392  if (__kmp_affinity_respect_mask) {
1393  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1394  } else {
1395  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1396  }
1397  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1398  if (uniform) {
1399  KMP_INFORM(Uniform, "KMP_AFFINITY");
1400  } else {
1401  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1402  }
1403 
1404  kmp_str_buf_t buf;
1405  __kmp_str_buf_init(&buf);
1406 
1407  __kmp_str_buf_print(&buf, "%d", totals[0]);
1408  for (level = 1; level <= pkgLevel; level++) {
1409  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1410  }
1411  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1412  __kmp_nThreadsPerCore, __kmp_ncores);
1413 
1414  __kmp_str_buf_free(&buf);
1415  }
1416 
1417  if (__kmp_affinity_type == affinity_none) {
1418  __kmp_free(last);
1419  __kmp_free(maxCt);
1420  __kmp_free(counts);
1421  __kmp_free(totals);
1422  __kmp_free(retval);
1423  KMP_CPU_FREE(oldMask);
1424  return 0;
1425  }
1426 
1427  //
1428  // Find any levels with radix 1, and remove them from the map
1429  // (except for the package level).
1430  //
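// For example (hypothetical radii): with depth == 3, pkgLevel == 0, and
// maxCt == {2, 4, 1} (package, core, thread), the thread level has radix 1
// and is dropped, giving new_depth == 2 with only the package and core
// levels retained.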
1431  int new_depth = 0;
1432  for (level = 0; level < depth; level++) {
1433  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1434  continue;
1435  }
1436  new_depth++;
1437  }
1438 
1439  //
1440  // If we are removing any levels, allocate a new vector to return,
1441  // and copy the relevant information to it.
1442  //
1443  if (new_depth != depth) {
1444  AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1445  sizeof(AddrUnsPair) * nApics);
1446  for (proc = 0; (int)proc < nApics; proc++) {
1447  Address addr(new_depth);
1448  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1449  }
1450  int new_level = 0;
1451  int newPkgLevel = -1;
1452  int newCoreLevel = -1;
1453  int newThreadLevel = -1;
1454  int i;
1455  for (level = 0; level < depth; level++) {
1456  if ((maxCt[level] == 1)
1457  && (level != pkgLevel)) {
1458  //
1459  // Remove this level. Never remove the package level
1460  //
1461  continue;
1462  }
1463  if (level == pkgLevel) {
1464  newPkgLevel = level;
1465  }
1466  if (level == coreLevel) {
1467  newCoreLevel = level;
1468  }
1469  if (level == threadLevel) {
1470  newThreadLevel = level;
1471  }
1472  for (proc = 0; (int)proc < nApics; proc++) {
1473  new_retval[proc].first.labels[new_level]
1474  = retval[proc].first.labels[level];
1475  }
1476  new_level++;
1477  }
1478 
1479  __kmp_free(retval);
1480  retval = new_retval;
1481  depth = new_depth;
1482  pkgLevel = newPkgLevel;
1483  coreLevel = newCoreLevel;
1484  threadLevel = newThreadLevel;
1485  }
1486 
1487  if (__kmp_affinity_gran_levels < 0) {
1488  //
1489  // Set the granularity level based on what levels are modeled
1490  // in the machine topology map.
1491  //
1492  __kmp_affinity_gran_levels = 0;
1493  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1494  __kmp_affinity_gran_levels++;
1495  }
1496  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1497  __kmp_affinity_gran_levels++;
1498  }
1499  if (__kmp_affinity_gran > affinity_gran_package) {
1500  __kmp_affinity_gran_levels++;
1501  }
1502  }
1503 
1504  if (__kmp_affinity_verbose) {
1505  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1506  coreLevel, threadLevel);
1507  }
1508 
1509  __kmp_free(last);
1510  __kmp_free(maxCt);
1511  __kmp_free(counts);
1512  __kmp_free(totals);
1513  KMP_CPU_FREE(oldMask);
1514  *address2os = retval;
1515  return depth;
1516 }
1517 
1518 
1519 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1520 
1521 
1522 #define osIdIndex 0
1523 #define threadIdIndex 1
1524 #define coreIdIndex 2
1525 #define pkgIdIndex 3
1526 #define nodeIdIndex 4
1527 
1528 typedef unsigned *ProcCpuInfo;
1529 static unsigned maxIndex = pkgIdIndex;
1530 
1531 
1532 static int
1533 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1534 {
1535  const unsigned *aa = (const unsigned *)a;
1536  const unsigned *bb = (const unsigned *)b;
1537  if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1538  if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1539  return 0;
1540 };
1541 
1542 
1543 static int
1544 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1545 {
1546  unsigned i;
1547  const unsigned *aa = *((const unsigned **)a);
1548  const unsigned *bb = *((const unsigned **)b);
1549  for (i = maxIndex; ; i--) {
1550  if (aa[i] < bb[i]) return -1;
1551  if (aa[i] > bb[i]) return 1;
1552  if (i == osIdIndex) break;
1553  }
1554  return 0;
1555 }
1556 
1557 
1558 //
1559 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1560 // affinity map.
1561 //
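// The parser below expects records of the (illustrative) form found in
// /proc/cpuinfo, one block per logical processor, separated by blank lines:
//
//     processor       : 0
//     physical id     : 0
//     core id         : 0
//     thread id       : 0
//
// Only the leading tokens recognized below ("processor", "physical id",
// "core id", "thread id", "node_<n> id") are used; other lines are skipped.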
1562 static int
1563 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1564  kmp_i18n_id_t *const msg_id, FILE *f)
1565 {
1566  *address2os = NULL;
1567  *msg_id = kmp_i18n_null;
1568 
1569  //
1570  // Scan the file, count the number of "processor" (osId) fields, and
1571  // find the highest value of <n> for a node_<n> field.
1572  //
1573  char buf[256];
1574  unsigned num_records = 0;
1575  while (! feof(f)) {
1576  buf[sizeof(buf) - 1] = 1;
1577  if (! fgets(buf, sizeof(buf), f)) {
1578  //
1579  // Read errors presumably because of EOF
1580  //
1581  break;
1582  }
1583 
1584  char s1[] = "processor";
1585  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1586  num_records++;
1587  continue;
1588  }
1589 
1590  //
1591  // FIXME - this will match "node_<n> <garbage>"
1592  //
1593  unsigned level;
1594  if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1595  if (nodeIdIndex + level >= maxIndex) {
1596  maxIndex = nodeIdIndex + level;
1597  }
1598  continue;
1599  }
1600  }
1601 
1602  //
1603  // Check for empty file / no valid processor records, or too many.
1604  // The number of records can't exceed the number of valid bits in the
1605  // affinity mask.
1606  //
1607  if (num_records == 0) {
1608  *line = 0;
1609  *msg_id = kmp_i18n_str_NoProcRecords;
1610  return -1;
1611  }
1612  if (num_records > (unsigned)__kmp_xproc) {
1613  *line = 0;
1614  *msg_id = kmp_i18n_str_TooManyProcRecords;
1615  return -1;
1616  }
1617 
1618  //
1619  // Set the file pointer back to the beginning, so that we can scan the
1620  // file again, this time performing a full parse of the data.
1621  // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1622  // Adding an extra element at the end allows us to remove a lot of extra
1623  // checks for termination conditions.
1624  //
1625  if (fseek(f, 0, SEEK_SET) != 0) {
1626  *line = 0;
1627  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1628  return -1;
1629  }
1630 
1631  //
1632  // Allocate the array of records to store the proc info in. The dummy
1633  // element at the end makes the logic in filling them out easier to code.
1634  //
1635  unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1636  * sizeof(unsigned *));
1637  unsigned i;
1638  for (i = 0; i <= num_records; i++) {
1639  threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1640  * sizeof(unsigned));
1641  }
1642 
1643 #define CLEANUP_THREAD_INFO \
1644  for (i = 0; i <= num_records; i++) { \
1645  __kmp_free(threadInfo[i]); \
1646  } \
1647  __kmp_free(threadInfo);
1648 
1649  //
1650  // A value of UINT_MAX means that we didn't find the field
1651  //
1652  unsigned __index;
1653 
1654 #define INIT_PROC_INFO(p) \
1655  for (__index = 0; __index <= maxIndex; __index++) { \
1656  (p)[__index] = UINT_MAX; \
1657  }
1658 
1659  for (i = 0; i <= num_records; i++) {
1660  INIT_PROC_INFO(threadInfo[i]);
1661  }
1662 
1663  unsigned num_avail = 0;
1664  *line = 0;
1665  while (! feof(f)) {
1666  //
1667  // Create an inner scoping level, so that all the goto targets at the
1668  // end of the loop appear in an outer scoping level. This avoids
1669  // warnings about jumping past an initialization to a target in the
1670  // same block.
1671  //
1672  {
1673  buf[sizeof(buf) - 1] = 1;
1674  bool long_line = false;
1675  if (! fgets(buf, sizeof(buf), f)) {
1676  //
1677  // Read errors presumably because of EOF
1678  //
1679  // If there is valid data in threadInfo[num_avail], then fake
1680  // a blank line in ensure that the last address gets parsed.
1681  //
1682  bool valid = false;
1683  for (i = 0; i <= maxIndex; i++) {
1684  if (threadInfo[num_avail][i] != UINT_MAX) {
1685  valid = true;
1686  }
1687  }
1688  if (! valid) {
1689  break;
1690  }
1691  buf[0] = 0;
1692  } else if (!buf[sizeof(buf) - 1]) {
1693  //
1694  // The line is longer than the buffer. Set a flag and don't
1695  // emit an error if we were going to ignore the line, anyway.
1696  //
1697  long_line = true;
1698 
1699 #define CHECK_LINE \
1700  if (long_line) { \
1701  CLEANUP_THREAD_INFO; \
1702  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
1703  return -1; \
1704  }
1705  }
1706  (*line)++;
1707 
1708  char s1[] = "processor";
1709  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1710  CHECK_LINE;
1711  char *p = strchr(buf + sizeof(s1) - 1, ':');
1712  unsigned val;
1713  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1714  if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1715  threadInfo[num_avail][osIdIndex] = val;
1716 #if KMP_OS_LINUX && USE_SYSFS_INFO
1717  char path[256];
1718  KMP_SNPRINTF(path, sizeof(path),
1719  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1720  threadInfo[num_avail][osIdIndex]);
1721  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1722 
1723  KMP_SNPRINTF(path, sizeof(path),
1724  "/sys/devices/system/cpu/cpu%u/topology/core_id",
1725  threadInfo[num_avail][osIdIndex]);
1726  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
1727  continue;
1728 #else
1729  }
1730  char s2[] = "physical id";
1731  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1732  CHECK_LINE;
1733  char *p = strchr(buf + sizeof(s2) - 1, ':');
1734  unsigned val;
1735  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1736  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1737  threadInfo[num_avail][pkgIdIndex] = val;
1738  continue;
1739  }
1740  char s3[] = "core id";
1741  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1742  CHECK_LINE;
1743  char *p = strchr(buf + sizeof(s3) - 1, ':');
1744  unsigned val;
1745  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1746  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
1747  threadInfo[num_avail][coreIdIndex] = val;
1748  continue;
1749 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
1750  }
1751  char s4[] = "thread id";
1752  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
1753  CHECK_LINE;
1754  char *p = strchr(buf + sizeof(s4) - 1, ':');
1755  unsigned val;
1756  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1757  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
1758  threadInfo[num_avail][threadIdIndex] = val;
1759  continue;
1760  }
1761  unsigned level;
1762  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
1763  CHECK_LINE;
1764  char *p = strchr(buf + sizeof(s4) - 1, ':');
1765  unsigned val;
1766  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1767  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
1768  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
1769  threadInfo[num_avail][nodeIdIndex + level] = val;
1770  continue;
1771  }
1772 
1773  //
1774  // We didn't recognize the leading token on the line.
1775  // There are lots of leading tokens that we don't recognize -
1776  // if the line isn't empty, go on to the next line.
1777  //
1778  if ((*buf != 0) && (*buf != '\n')) {
1779  //
1780  // If the line is longer than the buffer, read characters
1781  // until we find a newline.
1782  //
1783  if (long_line) {
1784  int ch;
1785  while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
1786  }
1787  continue;
1788  }
1789 
1790  //
1791  // A newline has signalled the end of the processor record.
1792  // Check that there aren't too many procs specified.
1793  //
1794  if ((int)num_avail == __kmp_xproc) {
1795  CLEANUP_THREAD_INFO;
1796  *msg_id = kmp_i18n_str_TooManyEntries;
1797  return -1;
1798  }
1799 
1800  //
1801  // Check for missing fields. The osId field must be there, and we
1802  // currently require that the physical id field is specified as well.
1803  //
1804  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
1805  CLEANUP_THREAD_INFO;
1806  *msg_id = kmp_i18n_str_MissingProcField;
1807  return -1;
1808  }
1809  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
1810  CLEANUP_THREAD_INFO;
1811  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
1812  return -1;
1813  }
1814 
1815  //
1816  // Skip this proc if it is not included in the machine model.
1817  //
1818  if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
1819  INIT_PROC_INFO(threadInfo[num_avail]);
1820  continue;
1821  }
1822 
1823  //
1824  // We have a successful parse of this proc's info.
1825  // Increment the counter, and prepare for the next proc.
1826  //
1827  num_avail++;
1828  KMP_ASSERT(num_avail <= num_records);
1829  INIT_PROC_INFO(threadInfo[num_avail]);
1830  }
1831  continue;
1832 
1833  no_val:
1834  CLEANUP_THREAD_INFO;
1835  *msg_id = kmp_i18n_str_MissingValCpuinfo;
1836  return -1;
1837 
1838  dup_field:
1839  CLEANUP_THREAD_INFO;
1840  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
1841  return -1;
1842  }
1843  *line = 0;
1844 
1845 # if KMP_MIC && REDUCE_TEAM_SIZE
1846  unsigned teamSize = 0;
1847 # endif // KMP_MIC && REDUCE_TEAM_SIZE
1848 
1849  // check for num_records == __kmp_xproc ???
1850 
1851  //
1852  // If there's only one thread context to bind to, form an Address object
1853  // with depth 1 and return immediately (or, if affinity is off, set
1854  // address2os to NULL and return).
1855  //
1856  // If it is configured to omit the package level when there is only a
1857  // single package, the logic at the end of this routine won't work if
1858  // there is only a single thread - it would try to form an Address
1859  // object with depth 0.
1860  //
1861  KMP_ASSERT(num_avail > 0);
1862  KMP_ASSERT(num_avail <= num_records);
1863  if (num_avail == 1) {
1864  __kmp_ncores = 1;
1865  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1866  if (__kmp_affinity_verbose) {
1867  if (! KMP_AFFINITY_CAPABLE()) {
1868  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
1869  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1870  KMP_INFORM(Uniform, "KMP_AFFINITY");
1871  }
1872  else {
1873  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1874  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
1875  fullMask);
1876  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
1877  if (__kmp_affinity_respect_mask) {
1878  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1879  } else {
1880  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1881  }
1882  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1883  KMP_INFORM(Uniform, "KMP_AFFINITY");
1884  }
1885  int index;
1886  kmp_str_buf_t buf;
1887  __kmp_str_buf_init(&buf);
1888  __kmp_str_buf_print(&buf, "1");
1889  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
1890  __kmp_str_buf_print(&buf, " x 1");
1891  }
1892  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
1893  __kmp_str_buf_free(&buf);
1894  }
1895 
1896  if (__kmp_affinity_type == affinity_none) {
1897  CLEANUP_THREAD_INFO;
1898  return 0;
1899  }
1900 
1901  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1902  Address addr(1);
1903  addr.labels[0] = threadInfo[0][pkgIdIndex];
1904  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
1905 
1906  if (__kmp_affinity_gran_levels < 0) {
1907  __kmp_affinity_gran_levels = 0;
1908  }
1909 
1910  if (__kmp_affinity_verbose) {
1911  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1912  }
1913 
1914  CLEANUP_THREAD_INFO;
1915  return 1;
1916  }
1917 
1918  //
1919  // Sort the threadInfo table by physical Id.
1920  //
1921  qsort(threadInfo, num_avail, sizeof(*threadInfo),
1922  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
1923 
1924  //
1925  // The table is now sorted by pkgId / coreId / threadId, but we really
1926  // don't know the radix of any of the fields. pkgId's may be sparsely
1927  // assigned among the chips on a system. Although coreId's are usually
1928  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1929  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1930  //
1931  // For that matter, we don't know what coresPerPkg and threadsPerCore
1932  // (or the total # packages) are at this point - we want to determine
1933  // that now. We only have an upper bound on the first two figures.
1934  //
1935  unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
1936  * sizeof(unsigned));
1937  unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
1938  * sizeof(unsigned));
1939  unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
1940  * sizeof(unsigned));
1941  unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
1942  * sizeof(unsigned));
1943 
1944  bool assign_thread_ids = false;
1945  unsigned threadIdCt;
1946  unsigned index;
1947 
1948  restart_radix_check:
1949  threadIdCt = 0;
1950 
1951  //
1952  // Initialize the counter arrays with data from threadInfo[0].
1953  //
1954  if (assign_thread_ids) {
1955  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
1956  threadInfo[0][threadIdIndex] = threadIdCt++;
1957  }
1958  else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
1959  threadIdCt = threadInfo[0][threadIdIndex] + 1;
1960  }
1961  }
1962  for (index = 0; index <= maxIndex; index++) {
1963  counts[index] = 1;
1964  maxCt[index] = 1;
1965  totals[index] = 1;
1966  lastId[index] = threadInfo[0][index];
1967  }
1968 
1969  //
1970  // Run through the rest of the OS procs.
1971  //
1972  for (i = 1; i < num_avail; i++) {
1973  //
1974  // Find the most significant index whose id differs
1975  // from the id for the previous OS proc.
1976  //
1977  for (index = maxIndex; index >= threadIdIndex; index--) {
1978  if (assign_thread_ids && (index == threadIdIndex)) {
1979  //
1980  // Auto-assign the thread id field if it wasn't specified.
1981  //
1982  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
1983  threadInfo[i][threadIdIndex] = threadIdCt++;
1984  }
1985 
1986  //
1987  // Apparently the thread id field was specified for some
1988  // entries and not others. Start the thread id counter
1989  // off at the next higher thread id.
1990  //
1991  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
1992  threadIdCt = threadInfo[i][threadIdIndex] + 1;
1993  }
1994  }
1995  if (threadInfo[i][index] != lastId[index]) {
1996  //
1997  // Run through all indices which are less significant,
1998  // and reset the counts to 1.
1999  //
2000  // At all levels up to and including index, we need to
2001  // increment the totals and record the last id.
2002  //
2003  unsigned index2;
2004  for (index2 = threadIdIndex; index2 < index; index2++) {
2005  totals[index2]++;
2006  if (counts[index2] > maxCt[index2]) {
2007  maxCt[index2] = counts[index2];
2008  }
2009  counts[index2] = 1;
2010  lastId[index2] = threadInfo[i][index2];
2011  }
2012  counts[index]++;
2013  totals[index]++;
2014  lastId[index] = threadInfo[i][index];
2015 
2016  if (assign_thread_ids && (index > threadIdIndex)) {
2017 
2018 # if KMP_MIC && REDUCE_TEAM_SIZE
2019  //
2020  // The default team size is the total #threads in the machine
2021  // minus 1 thread for every core that has 3 or more threads.
2022  //
2023  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2024 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2025 
2026  //
2027  // Restart the thread counter, as we are on a new core.
2028  //
2029  threadIdCt = 0;
2030 
2031  //
2032  // Auto-assign the thread id field if it wasn't specified.
2033  //
2034  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2035  threadInfo[i][threadIdIndex] = threadIdCt++;
2036  }
2037 
2038  //
2039  // Apparently the thread id field was specified for some
2040  // entries and not others. Start the thread id counter
2041  // off at the next higher thread id.
2042  //
2043  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2044  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2045  }
2046  }
2047  break;
2048  }
2049  }
2050  if (index < threadIdIndex) {
2051  //
2052  // If thread ids were specified, it is an error if they are not
2053  // unique. Also, check that we haven't already restarted the
2054  // loop (to be safe - shouldn't need to).
2055  //
2056  if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2057  || assign_thread_ids) {
2058  __kmp_free(lastId);
2059  __kmp_free(totals);
2060  __kmp_free(maxCt);
2061  __kmp_free(counts);
2062  CLEANUP_THREAD_INFO;
2063  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2064  return -1;
2065  }
2066 
2067  //
2068  // If the thread ids were not specified and we see entries
2069  // that are duplicates, start the loop over and
2070  // assign the thread ids manually.
2071  //
2072  assign_thread_ids = true;
2073  goto restart_radix_check;
2074  }
2075  }
2076 
2077 # if KMP_MIC && REDUCE_TEAM_SIZE
2078  //
2079  // The default team size is the total #threads in the machine
2080  // minus 1 thread for every core that has 3 or more threads.
2081  //
2082  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2083 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2084 
2085  for (index = threadIdIndex; index <= maxIndex; index++) {
2086  if (counts[index] > maxCt[index]) {
2087  maxCt[index] = counts[index];
2088  }
2089  }
2090 
2091  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2092  nCoresPerPkg = maxCt[coreIdIndex];
2093  nPackages = totals[pkgIdIndex];
2094 
2095  //
2096  // Check to see if the machine topology is uniform
2097  //
2098  unsigned prod = totals[maxIndex];
2099  for (index = threadIdIndex; index < maxIndex; index++) {
2100  prod *= maxCt[index];
2101  }
2102  bool uniform = (prod == totals[threadIdIndex]);
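 // Illustrative note (not part of the original source): totals[threadIdIndex]
 // counts every accepted OS proc, so assuming no node_<n> id levels (maxIndex
 // is the package level), a 2-package x 4-core x 2-thread machine gives
 // prod = totals[pkgIdIndex] * maxCt[coreIdIndex] * maxCt[threadIdIndex]
 //      = 2 * 4 * 2 = 16 == num_avail, hence uniform == true; if one hardware
 // thread were excluded from the mask, num_avail would drop to 15 while prod
 // stayed 16, and the topology would be reported as non-uniform.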
2103 
2104  //
2105  // When affinity is off, this routine will still be called to set
2106  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2107  // nCoresPerPkg, & nPackages. Make sure all these vars are set
2108  // correctly, and return now if affinity is not enabled.
2109  //
2110  __kmp_ncores = totals[coreIdIndex];
2111 
2112  if (__kmp_affinity_verbose) {
2113  if (! KMP_AFFINITY_CAPABLE()) {
2114  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2115  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2116  if (uniform) {
2117  KMP_INFORM(Uniform, "KMP_AFFINITY");
2118  } else {
2119  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2120  }
2121  }
2122  else {
2123  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2124  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2125  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2126  if (__kmp_affinity_respect_mask) {
2127  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2128  } else {
2129  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2130  }
2131  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2132  if (uniform) {
2133  KMP_INFORM(Uniform, "KMP_AFFINITY");
2134  } else {
2135  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2136  }
2137  }
2138  kmp_str_buf_t buf;
2139  __kmp_str_buf_init(&buf);
2140 
2141  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2142  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2143  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2144  }
2145  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2146  maxCt[threadIdIndex], __kmp_ncores);
2147 
2148  __kmp_str_buf_free(&buf);
2149  }
2150 
2151 # if KMP_MIC && REDUCE_TEAM_SIZE
2152  //
2153  // Set the default team size.
2154  //
2155  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2156  __kmp_dflt_team_nth = teamSize;
2157  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2158  __kmp_dflt_team_nth));
2159  }
2160 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2161 
2162  if (__kmp_affinity_type == affinity_none) {
2163  __kmp_free(lastId);
2164  __kmp_free(totals);
2165  __kmp_free(maxCt);
2166  __kmp_free(counts);
2167  CLEANUP_THREAD_INFO;
2168  return 0;
2169  }
2170 
2171  //
2172  // Count the number of levels which have more nodes at that level than
2173  // at the parent's level (with there being an implicit root node of
2174  // the top level). This is equivalent to saying that there is at least
2175  // one node at this level which has a sibling. These levels are in the
2176  // map, and the package level is always in the map.
2177  //
2178  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2179  int level = 0;
2180  for (index = threadIdIndex; index < maxIndex; index++) {
2181  KMP_ASSERT(totals[index] >= totals[index + 1]);
2182  inMap[index] = (totals[index] > totals[index + 1]);
2183  }
2184  inMap[maxIndex] = (totals[maxIndex] > 1);
2185  inMap[pkgIdIndex] = true;
2186 
2187  int depth = 0;
2188  for (index = threadIdIndex; index <= maxIndex; index++) {
2189  if (inMap[index]) {
2190  depth++;
2191  }
2192  }
2193  KMP_ASSERT(depth > 0);
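 // Illustrative note (not part of the original source): on a single-package
 // machine with 4 cores and 1 thread per core, totals[threadIdIndex] ==
 // totals[coreIdIndex] == 4 and totals[pkgIdIndex] == 1, so the thread level
 // is dropped (inMap[threadIdIndex] == false), the core level is kept, the
 // package level is forced in, and depth == 2.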
2194 
2195  //
2196  // Construct the data structure that is to be returned.
2197  //
2198  *address2os = (AddrUnsPair*)
2199  __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2200  int pkgLevel = -1;
2201  int coreLevel = -1;
2202  int threadLevel = -1;
2203 
2204  for (i = 0; i < num_avail; ++i) {
2205  Address addr(depth);
2206  unsigned os = threadInfo[i][osIdIndex];
2207  int src_index;
2208  int dst_index = 0;
2209 
2210  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2211  if (! inMap[src_index]) {
2212  continue;
2213  }
2214  addr.labels[dst_index] = threadInfo[i][src_index];
2215  if (src_index == pkgIdIndex) {
2216  pkgLevel = dst_index;
2217  }
2218  else if (src_index == coreIdIndex) {
2219  coreLevel = dst_index;
2220  }
2221  else if (src_index == threadIdIndex) {
2222  threadLevel = dst_index;
2223  }
2224  dst_index++;
2225  }
2226  (*address2os)[i] = AddrUnsPair(addr, os);
2227  }
2228 
2229  if (__kmp_affinity_gran_levels < 0) {
2230  //
2231  // Set the granularity level based on what levels are modeled
2232  // in the machine topology map.
2233  //
2234  unsigned src_index;
2235  __kmp_affinity_gran_levels = 0;
2236  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2237  if (! inMap[src_index]) {
2238  continue;
2239  }
2240  switch (src_index) {
2241  case threadIdIndex:
2242  if (__kmp_affinity_gran > affinity_gran_thread) {
2243  __kmp_affinity_gran_levels++;
2244  }
2245 
2246  break;
2247  case coreIdIndex:
2248  if (__kmp_affinity_gran > affinity_gran_core) {
2249  __kmp_affinity_gran_levels++;
2250  }
2251  break;
2252 
2253  case pkgIdIndex:
2254  if (__kmp_affinity_gran > affinity_gran_package) {
2255  __kmp_affinity_gran_levels++;
2256  }
2257  break;
2258  }
2259  }
2260  }
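 // Illustrative note (not part of the original source): with thread, core and
 // package levels all present in the map, granularity=core increments the
 // counter only for the thread case (gran_levels == 1), granularity=package
 // increments it for both the thread and core cases (gran_levels == 2), and
 // granularity=fine/thread leaves it at 0.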
2261 
2262  if (__kmp_affinity_verbose) {
2263  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2264  coreLevel, threadLevel);
2265  }
2266 
2267  __kmp_free(inMap);
2268  __kmp_free(lastId);
2269  __kmp_free(totals);
2270  __kmp_free(maxCt);
2271  __kmp_free(counts);
2272  CLEANUP_THREAD_INFO;
2273  return depth;
2274 }
2275 
2276 
2277 //
2278 // Create and return a table of affinity masks, indexed by OS thread ID.
2279 // This routine handles OR'ing together all the affinity masks of threads
2280 // that are sufficiently close, if granularity > fine.
2281 //
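// Illustrative example (not part of the original source): suppose OS procs
// 0,1 share core 0 and OS procs 2,3 share core 1. With granularity=core
// (__kmp_affinity_gran_levels == 1) the routine below ORs the per-thread
// masks of each core, so osId2Mask[0] == osId2Mask[1] == {0,1} and
// osId2Mask[2] == osId2Mask[3] == {2,3}; with granularity=fine/thread
// (gran_levels == 0) every OS proc keeps a singleton mask.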
2282 static kmp_affin_mask_t *
2283 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2284  AddrUnsPair *address2os, unsigned numAddrs)
2285 {
2286  //
2287  // First form a table of affinity masks in order of OS thread id.
2288  //
2289  unsigned depth;
2290  unsigned maxOsId;
2291  unsigned i;
2292 
2293  KMP_ASSERT(numAddrs > 0);
2294  depth = address2os[0].first.depth;
2295 
2296  maxOsId = 0;
2297  for (i = 0; i < numAddrs; i++) {
2298  unsigned osId = address2os[i].second;
2299  if (osId > maxOsId) {
2300  maxOsId = osId;
2301  }
2302  }
2303  kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2304  (maxOsId + 1) * __kmp_affin_mask_size);
2305 
2306  //
2307  // Sort the address2os table according to physical order. Doing so
2308  // will put all threads on the same core/package/node in consecutive
2309  // locations.
2310  //
2311  qsort(address2os, numAddrs, sizeof(*address2os),
2312  __kmp_affinity_cmp_Address_labels);
2313 
2314  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2315  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2316  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2317  }
2318  if (__kmp_affinity_gran_levels >= (int)depth) {
2319  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2320  && (__kmp_affinity_type != affinity_none))) {
2321  KMP_WARNING(AffThreadsMayMigrate);
2322  }
2323  }
2324 
2325  //
2326  // Run through the table, forming the masks for all threads on each
2327  // core. Threads on the same core will have identical "Address"
2328  // objects, not considering the last level, which must be the thread
2329  // id. All threads on a core will appear consecutively.
2330  //
2331  unsigned unique = 0;
2332  unsigned j = 0; // index of 1st thread on core
2333  unsigned leader = 0;
2334  Address *leaderAddr = &(address2os[0].first);
2335  kmp_affin_mask_t *sum
2336  = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
2337  KMP_CPU_ZERO(sum);
2338  KMP_CPU_SET(address2os[0].second, sum);
2339  for (i = 1; i < numAddrs; i++) {
2340  //
2341  // If this thread is sufficiently close to the leader (within the
2342  // granularity setting), then set the bit for this os thread in the
2343  // affinity mask for this group, and go on to the next thread.
2344  //
2345  if (leaderAddr->isClose(address2os[i].first,
2346  __kmp_affinity_gran_levels)) {
2347  KMP_CPU_SET(address2os[i].second, sum);
2348  continue;
2349  }
2350 
2351  //
2352  // For every thread in this group, copy the mask to the thread's
2353  // entry in the osId2Mask table. Mark the first address as a
2354  // leader.
2355  //
2356  for (; j < i; j++) {
2357  unsigned osId = address2os[j].second;
2358  KMP_DEBUG_ASSERT(osId <= maxOsId);
2359  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2360  KMP_CPU_COPY(mask, sum);
2361  address2os[j].first.leader = (j == leader);
2362  }
2363  unique++;
2364 
2365  //
2366  // Start a new mask.
2367  //
2368  leader = i;
2369  leaderAddr = &(address2os[i].first);
2370  KMP_CPU_ZERO(sum);
2371  KMP_CPU_SET(address2os[i].second, sum);
2372  }
2373 
2374  //
2375  // For every thread in last group, copy the mask to the thread's
2376  // entry in the osId2Mask table.
2377  //
2378  for (; j < i; j++) {
2379  unsigned osId = address2os[j].second;
2380  KMP_DEBUG_ASSERT(osId <= maxOsId);
2381  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2382  KMP_CPU_COPY(mask, sum);
2383  address2os[j].first.leader = (j == leader);
2384  }
2385  unique++;
2386 
2387  *maxIndex = maxOsId;
2388  *numUnique = unique;
2389  return osId2Mask;
2390 }
2391 
2392 
2393 //
2394 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2395 // as file-static than to try and pass them through the calling sequence of
2396 // the recursive-descent OMP_PLACES parser.
2397 //
2398 static kmp_affin_mask_t *newMasks;
2399 static int numNewMasks;
2400 static int nextNewMask;
2401 
2402 #define ADD_MASK(_mask) \
2403  { \
2404  if (nextNewMask >= numNewMasks) { \
2405  numNewMasks *= 2; \
2406  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2407  numNewMasks * __kmp_affin_mask_size); \
2408  } \
2409  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2410  nextNewMask++; \
2411  }
2412 
2413 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2414  { \
2415  if (((_osId) > _maxOsId) || \
2416  (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2417  if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2418  && (__kmp_affinity_type != affinity_none))) { \
2419  KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2420  } \
2421  } \
2422  else { \
2423  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2424  } \
2425  }
2426 
2427 
2428 //
2429 // Re-parse the proclist (for the explicit affinity type), and form the list
2430 // of affinity newMasks indexed by gtid.
2431 //
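// Illustrative example (not part of the original source): a setting such as
//     KMP_AFFINITY="explicit,proclist=[0,2-6:2,{8,9}]"
// is re-parsed here into five masks: {0}, {2}, {4}, {6} from the single id
// and the stride-2 range, plus one union mask {8,9} from the braced set
// (assuming those OS proc ids are present in osId2Mask).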
2432 static void
2433 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2434  unsigned int *out_numMasks, const char *proclist,
2435  kmp_affin_mask_t *osId2Mask, int maxOsId)
2436 {
2437  const char *scan = proclist;
2438  const char *next = proclist;
2439 
2440  //
2441  // We use malloc() for the temporary mask vector,
2442  // so that we can use realloc() to extend it.
2443  //
2444  numNewMasks = 2;
2445  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2446  * __kmp_affin_mask_size);
2447  nextNewMask = 0;
2448  kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2449  __kmp_affin_mask_size);
2450  int setSize = 0;
2451 
2452  for (;;) {
2453  int start, end, stride;
2454 
2455  SKIP_WS(scan);
2456  next = scan;
2457  if (*next == '\0') {
2458  break;
2459  }
2460 
2461  if (*next == '{') {
2462  int num;
2463  setSize = 0;
2464  next++; // skip '{'
2465  SKIP_WS(next);
2466  scan = next;
2467 
2468  //
2469  // Read the first integer in the set.
2470  //
2471  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2472  "bad proclist");
2473  SKIP_DIGITS(next);
2474  num = __kmp_str_to_int(scan, *next);
2475  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2476 
2477  //
2478  // Copy the mask for that osId to the sum (union) mask.
2479  //
2480  if ((num > maxOsId) ||
2481  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2482  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2483  && (__kmp_affinity_type != affinity_none))) {
2484  KMP_WARNING(AffIgnoreInvalidProcID, num);
2485  }
2486  KMP_CPU_ZERO(sumMask);
2487  }
2488  else {
2489  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2490  setSize = 1;
2491  }
2492 
2493  for (;;) {
2494  //
2495  // Check for end of set.
2496  //
2497  SKIP_WS(next);
2498  if (*next == '}') {
2499  next++; // skip '}'
2500  break;
2501  }
2502 
2503  //
2504  // Skip optional comma.
2505  //
2506  if (*next == ',') {
2507  next++;
2508  }
2509  SKIP_WS(next);
2510 
2511  //
2512  // Read the next integer in the set.
2513  //
2514  scan = next;
2515  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2516  "bad explicit proc list");
2517 
2518  SKIP_DIGITS(next);
2519  num = __kmp_str_to_int(scan, *next);
2520  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2521 
2522  //
2523  // Add the mask for that osId to the sum mask.
2524  //
2525  if ((num > maxOsId) ||
2526  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2527  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2528  && (__kmp_affinity_type != affinity_none))) {
2529  KMP_WARNING(AffIgnoreInvalidProcID, num);
2530  }
2531  }
2532  else {
2533  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2534  setSize++;
2535  }
2536  }
2537  if (setSize > 0) {
2538  ADD_MASK(sumMask);
2539  }
2540 
2541  SKIP_WS(next);
2542  if (*next == ',') {
2543  next++;
2544  }
2545  scan = next;
2546  continue;
2547  }
2548 
2549  //
2550  // Read the first integer.
2551  //
2552  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2553  SKIP_DIGITS(next);
2554  start = __kmp_str_to_int(scan, *next);
2555  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2556  SKIP_WS(next);
2557 
2558  //
2559  // If this isn't a range, then add a mask to the list and go on.
2560  //
2561  if (*next != '-') {
2562  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2563 
2564  //
2565  // Skip optional comma.
2566  //
2567  if (*next == ',') {
2568  next++;
2569  }
2570  scan = next;
2571  continue;
2572  }
2573 
2574  //
2575  // This is a range. Skip over the '-' and read in the 2nd int.
2576  //
2577  next++; // skip '-'
2578  SKIP_WS(next);
2579  scan = next;
2580  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2581  SKIP_DIGITS(next);
2582  end = __kmp_str_to_int(scan, *next);
2583  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2584 
2585  //
2586  // Check for a stride parameter
2587  //
2588  stride = 1;
2589  SKIP_WS(next);
2590  if (*next == ':') {
2591  //
2592  // A stride is specified. Skip over the ':' and read the 3rd int.
2593  //
2594  int sign = +1;
2595  next++; // skip ':'
2596  SKIP_WS(next);
2597  scan = next;
2598  if (*next == '-') {
2599  sign = -1;
2600  next++;
2601  SKIP_WS(next);
2602  scan = next;
2603  }
2604  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2605  "bad explicit proc list");
2606  SKIP_DIGITS(next);
2607  stride = __kmp_str_to_int(scan, *next);
2608  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2609  stride *= sign;
2610  }
2611 
2612  //
2613  // Do some range checks.
2614  //
2615  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2616  if (stride > 0) {
2617  KMP_ASSERT2(start <= end, "bad explicit proc list");
2618  }
2619  else {
2620  KMP_ASSERT2(start >= end, "bad explicit proc list");
2621  }
2622  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2623 
2624  //
2625  // Add the mask for each OS proc # to the list.
2626  //
2627  if (stride > 0) {
2628  do {
2629  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2630  start += stride;
2631  } while (start <= end);
2632  }
2633  else {
2634  do {
2635  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2636  start += stride;
2637  } while (start >= end);
2638  }
2639 
2640  //
2641  // Skip optional comma.
2642  //
2643  SKIP_WS(next);
2644  if (*next == ',') {
2645  next++;
2646  }
2647  scan = next;
2648  }
2649 
2650  *out_numMasks = nextNewMask;
2651  if (nextNewMask == 0) {
2652  *out_masks = NULL;
2653  KMP_INTERNAL_FREE(newMasks);
2654  return;
2655  }
2656  *out_masks
2657  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2658  KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2659  __kmp_free(sumMask);
2660  KMP_INTERNAL_FREE(newMasks);
2661 }
2662 
2663 
2664 # if OMP_40_ENABLED
2665 
2666 /*-----------------------------------------------------------------------------
2667 
2668 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2669 places. Again, here is the grammar:
2670 
2671 place_list := place
2672 place_list := place , place_list
2673 place := num
2674 place := place : num
2675 place := place : num : signed
2676 place := { subplace_list }
2677 place := ! place // (lowest priority)
2678 subplace_list := subplace
2679 subplace_list := subplace , subplace_list
2680 subplace := num
2681 subplace := num : num
2682 subplace := num : num : signed
2683 signed := num
2684 signed := + signed
2685 signed := - signed
2686 
2687 -----------------------------------------------------------------------------*/
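// Illustrative example (not part of the original source): under this grammar
// a setting such as
//     OMP_PLACES="{0,1},{2,3}"
// yields two places, and the count/stride form "{0:2}:4:2" expands to the
// four places {0,1}, {2,3}, {4,5}, {6,7} (the base place {0:2} shifted by
// the stride 2, four times), assuming those OS procs are available.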
2688 
2689 static void
2690 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2691  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2692 {
2693  const char *next;
2694 
2695  for (;;) {
2696  int start, count, stride, i;
2697 
2698  //
2699  // Read in the starting proc id
2700  //
2701  SKIP_WS(*scan);
2702  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2703  "bad explicit places list");
2704  next = *scan;
2705  SKIP_DIGITS(next);
2706  start = __kmp_str_to_int(*scan, *next);
2707  KMP_ASSERT(start >= 0);
2708  *scan = next;
2709 
2710  //
2711  // valid follow sets are ',' ':' and '}'
2712  //
2713  SKIP_WS(*scan);
2714  if (**scan == '}' || **scan == ',') {
2715  if ((start > maxOsId) ||
2716  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2717  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2718  && (__kmp_affinity_type != affinity_none))) {
2719  KMP_WARNING(AffIgnoreInvalidProcID, start);
2720  }
2721  }
2722  else {
2723  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2724  (*setSize)++;
2725  }
2726  if (**scan == '}') {
2727  break;
2728  }
2729  (*scan)++; // skip ','
2730  continue;
2731  }
2732  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2733  (*scan)++; // skip ':'
2734 
2735  //
2736  // Read count parameter
2737  //
2738  SKIP_WS(*scan);
2739  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2740  "bad explicit places list");
2741  next = *scan;
2742  SKIP_DIGITS(next);
2743  count = __kmp_str_to_int(*scan, *next);
2744  KMP_ASSERT(count >= 0);
2745  *scan = next;
2746 
2747  //
2748  // valid follow sets are ',' ':' and '}'
2749  //
2750  SKIP_WS(*scan);
2751  if (**scan == '}' || **scan == ',') {
2752  for (i = 0; i < count; i++) {
2753  if ((start > maxOsId) ||
2754  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2755  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2756  && (__kmp_affinity_type != affinity_none))) {
2757  KMP_WARNING(AffIgnoreInvalidProcID, start);
2758  }
2759  break; // don't proliferate warnings for large count
2760  }
2761  else {
2762  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2763  start++;
2764  (*setSize)++;
2765  }
2766  }
2767  if (**scan == '}') {
2768  break;
2769  }
2770  (*scan)++; // skip ','
2771  continue;
2772  }
2773  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2774  (*scan)++; // skip ':'
2775 
2776  //
2777  // Read stride parameter
2778  //
2779  int sign = +1;
2780  for (;;) {
2781  SKIP_WS(*scan);
2782  if (**scan == '+') {
2783  (*scan)++; // skip '+'
2784  continue;
2785  }
2786  if (**scan == '-') {
2787  sign *= -1;
2788  (*scan)++; // skip '-'
2789  continue;
2790  }
2791  break;
2792  }
2793  SKIP_WS(*scan);
2794  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2795  "bad explicit places list");
2796  next = *scan;
2797  SKIP_DIGITS(next);
2798  stride = __kmp_str_to_int(*scan, *next);
2799  KMP_ASSERT(stride >= 0);
2800  *scan = next;
2801  stride *= sign;
2802 
2803  //
2804  // valid follow sets are ',' and '}'
2805  //
2806  SKIP_WS(*scan);
2807  if (**scan == '}' || **scan == ',') {
2808  for (i = 0; i < count; i++) {
2809  if ((start > maxOsId) ||
2810  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2811  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2812  && (__kmp_affinity_type != affinity_none))) {
2813  KMP_WARNING(AffIgnoreInvalidProcID, start);
2814  }
2815  break; // don't proliferate warnings for large count
2816  }
2817  else {
2818  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2819  start += stride;
2820  (*setSize)++;
2821  }
2822  }
2823  if (**scan == '}') {
2824  break;
2825  }
2826  (*scan)++; // skip ','
2827  continue;
2828  }
2829 
2830  KMP_ASSERT2(0, "bad explicit places list");
2831  }
2832 }
2833 
2834 
2835 static void
2836 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
2837  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2838 {
2839  const char *next;
2840 
2841  //
2842  // valid follow sets are '{' '!' and num
2843  //
2844  SKIP_WS(*scan);
2845  if (**scan == '{') {
2846  (*scan)++; // skip '{'
2847  __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
2848  setSize);
2849  KMP_ASSERT2(**scan == '}', "bad explicit places list");
2850  (*scan)++; // skip '}'
2851  }
2852  else if (**scan == '!') {
 2853  (*scan)++; // skip '!' before parsing the place to be complemented
 2854  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
 2855  KMP_CPU_COMPLEMENT(tempMask);
2856  }
2857  else if ((**scan >= '0') && (**scan <= '9')) {
2858  next = *scan;
2859  SKIP_DIGITS(next);
2860  int num = __kmp_str_to_int(*scan, *next);
2861  KMP_ASSERT(num >= 0);
2862  if ((num > maxOsId) ||
2863  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2864  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2865  && (__kmp_affinity_type != affinity_none))) {
2866  KMP_WARNING(AffIgnoreInvalidProcID, num);
2867  }
2868  }
2869  else {
2870  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
2871  (*setSize)++;
2872  }
2873  *scan = next; // skip num
2874  }
2875  else {
2876  KMP_ASSERT2(0, "bad explicit places list");
2877  }
2878 }
2879 
2880 
2881 //static void
2882 void
2883 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
2884  unsigned int *out_numMasks, const char *placelist,
2885  kmp_affin_mask_t *osId2Mask, int maxOsId)
2886 {
2887  const char *scan = placelist;
2888  const char *next = placelist;
2889 
2890  numNewMasks = 2;
2891  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2892  * __kmp_affin_mask_size);
2893  nextNewMask = 0;
2894 
2895  kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
2896  __kmp_affin_mask_size);
2897  KMP_CPU_ZERO(tempMask);
2898  int setSize = 0;
2899 
2900  for (;;) {
2901  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
2902 
2903  //
2904  // valid follow sets are ',' ':' and EOL
2905  //
2906  SKIP_WS(scan);
2907  if (*scan == '\0' || *scan == ',') {
2908  if (setSize > 0) {
2909  ADD_MASK(tempMask);
2910  }
2911  KMP_CPU_ZERO(tempMask);
2912  setSize = 0;
2913  if (*scan == '\0') {
2914  break;
2915  }
2916  scan++; // skip ','
2917  continue;
2918  }
2919 
2920  KMP_ASSERT2(*scan == ':', "bad explicit places list");
2921  scan++; // skip ':'
2922 
2923  //
2924  // Read count parameter
2925  //
2926  SKIP_WS(scan);
2927  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
2928  "bad explicit places list");
2929  next = scan;
2930  SKIP_DIGITS(next);
2931  int count = __kmp_str_to_int(scan, *next);
2932  KMP_ASSERT(count >= 0);
2933  scan = next;
2934 
2935  //
2936  // valid follow sets are ',' ':' and EOL
2937  //
2938  SKIP_WS(scan);
2939  int stride;
2940  if (*scan == '\0' || *scan == ',') {
2941  stride = +1;
2942  }
2943  else {
2944  KMP_ASSERT2(*scan == ':', "bad explicit places list");
2945  scan++; // skip ':'
2946 
2947  //
2948  // Read stride parameter
2949  //
2950  int sign = +1;
2951  for (;;) {
2952  SKIP_WS(scan);
2953  if (*scan == '+') {
2954  scan++; // skip '+'
2955  continue;
2956  }
2957  if (*scan == '-') {
2958  sign *= -1;
2959  scan++; // skip '-'
2960  continue;
2961  }
2962  break;
2963  }
2964  SKIP_WS(scan);
2965  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
2966  "bad explicit places list");
2967  next = scan;
2968  SKIP_DIGITS(next);
2969  stride = __kmp_str_to_int(scan, *next);
2970  KMP_DEBUG_ASSERT(stride >= 0);
2971  scan = next;
2972  stride *= sign;
2973  }
2974 
2975  if (stride > 0) {
2976  int i;
2977  for (i = 0; i < count; i++) {
2978  int j;
2979  if (setSize == 0) {
2980  break;
2981  }
2982  ADD_MASK(tempMask);
2983  setSize = 0;
2984  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
2985  if (! KMP_CPU_ISSET(j - stride, tempMask)) {
2986  KMP_CPU_CLR(j, tempMask);
2987  }
2988  else if ((j > maxOsId) ||
2989  (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
2990  if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
2991  && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
2992  KMP_WARNING(AffIgnoreInvalidProcID, j);
2993  }
2994  KMP_CPU_CLR(j, tempMask);
2995  }
2996  else {
2997  KMP_CPU_SET(j, tempMask);
2998  setSize++;
2999  }
3000  }
3001  for (; j >= 0; j--) {
3002  KMP_CPU_CLR(j, tempMask);
3003  }
3004  }
3005  }
3006  else {
3007  int i;
3008  for (i = 0; i < count; i++) {
3009  int j;
3010  if (setSize == 0) {
3011  break;
3012  }
3013  ADD_MASK(tempMask);
3014  setSize = 0;
3015  for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
3016  j++) {
3017  if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3018  KMP_CPU_CLR(j, tempMask);
3019  }
3020  else if ((j > maxOsId) ||
3021  (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3022  if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3023  && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3024  KMP_WARNING(AffIgnoreInvalidProcID, j);
3025  }
3026  KMP_CPU_CLR(j, tempMask);
3027  }
3028  else {
3029  KMP_CPU_SET(j, tempMask);
3030  setSize++;
3031  }
3032  }
3033  for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3034  KMP_CPU_CLR(j, tempMask);
3035  }
3036  }
3037  }
3038  KMP_CPU_ZERO(tempMask);
3039  setSize = 0;
3040 
3041  //
3042  // valid follow sets are ',' and EOL
3043  //
3044  SKIP_WS(scan);
3045  if (*scan == '\0') {
3046  break;
3047  }
3048  if (*scan == ',') {
3049  scan++; // skip ','
3050  continue;
3051  }
3052 
3053  KMP_ASSERT2(0, "bad explicit places list");
3054  }
3055 
3056  *out_numMasks = nextNewMask;
3057  if (nextNewMask == 0) {
3058  *out_masks = NULL;
3059  KMP_INTERNAL_FREE(newMasks);
3060  return;
3061  }
3062  *out_masks
3063  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3064  KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3065  __kmp_free(tempMask);
3066  KMP_INTERNAL_FREE(newMasks);
3067 }
3068 
3069 # endif /* OMP_40_ENABLED */
3070 
3071 #undef ADD_MASK
3072 #undef ADD_MASK_OSID
3073 
3074 static void
3075 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3076 {
3077  if (__kmp_place_num_sockets == 0 &&
3078  __kmp_place_num_cores == 0 &&
3079  __kmp_place_num_threads_per_core == 0 )
3080  return; // no topology limiting actions requested, exit
3081  if (__kmp_place_num_sockets == 0)
3082  __kmp_place_num_sockets = nPackages; // use all available sockets
3083  if (__kmp_place_num_cores == 0)
3084  __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3085  if (__kmp_place_num_threads_per_core == 0 ||
3086  __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore)
3087  __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3088 
3089  if ( !__kmp_affinity_uniform_topology() ) {
3090  KMP_WARNING( AffThrPlaceNonUniform );
3091  return; // don't support non-uniform topology
3092  }
3093  if ( depth != 3 ) {
3094  KMP_WARNING( AffThrPlaceNonThreeLevel );
 3095  return; // don't support non-3-level topology
3096  }
3097  if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) {
3098  KMP_WARNING(AffThrPlaceManySockets);
3099  return;
3100  }
3101  if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3102  KMP_WARNING( AffThrPlaceManyCores );
3103  return;
3104  }
3105 
3106  AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3107  __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3108 
3109  int i, j, k, n_old = 0, n_new = 0;
3110  for (i = 0; i < nPackages; ++i)
3111  if (i < __kmp_place_socket_offset ||
3112  i >= __kmp_place_socket_offset + __kmp_place_num_sockets)
3113  n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket
3114  else
3115  for (j = 0; j < nCoresPerPkg; ++j) // walk through requested socket
3116  if (j < __kmp_place_core_offset ||
3117  j >= __kmp_place_core_offset + __kmp_place_num_cores)
3118  n_old += __kmp_nThreadsPerCore; // skip not-requested core
3119  else
3120  for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core
3121  if (k < __kmp_place_num_threads_per_core) {
3122  newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data
3123  n_new++;
3124  }
3125  n_old++;
3126  }
3127  KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3128  KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores *
3129  __kmp_place_num_threads_per_core);
3130 
3131  nPackages = __kmp_place_num_sockets; // correct nPackages
3132  nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3133  __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3134  __kmp_avail_proc = n_new; // correct avail_proc
3135  __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3136 
3137  __kmp_free( *pAddr );
3138  *pAddr = newAddr; // replace old topology with new one
3139 }
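// Illustrative example (not part of the original source): on a uniform
// 2-socket x 8-core x 2-thread topology, KMP_PLACE_THREADS-style settings of
// __kmp_place_num_sockets = 1, __kmp_place_num_cores = 4 and
// __kmp_place_num_threads_per_core = 1 (offsets 0) make the routine above
// keep only the first hardware thread of the first 4 cores of socket 0, so
// n_new == 4, __kmp_avail_proc == 4, nPackages == 1, nCoresPerPkg == 4 and
// __kmp_ncores == 4.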
3140 
3141 
3142 static AddrUnsPair *address2os = NULL;
3143 static int * procarr = NULL;
3144 static int __kmp_aff_depth = 0;
3145 
3146 static void
3147 __kmp_aux_affinity_initialize(void)
3148 {
3149  if (__kmp_affinity_masks != NULL) {
3150  KMP_ASSERT(fullMask != NULL);
3151  return;
3152  }
3153 
3154  //
3155  // Create the "full" mask - this defines all of the processors that we
3156  // consider to be in the machine model. If respect is set, then it is
3157  // the initialization thread's affinity mask. Otherwise, it is all
3158  // processors that we know about on the machine.
3159  //
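 // Illustrative example (not part of the original source): if the process was
 // launched with its affinity restricted to OS procs 0-3 (e.g. via taskset or
 // a batch scheduler) and __kmp_affinity_respect_mask is set, fullMask ends up
 // as {0,1,2,3} and __kmp_avail_proc == 4; otherwise fullMask covers every
 // proc the OS reports and __kmp_avail_proc == __kmp_xproc.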
3160  if (fullMask == NULL) {
3161  fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3162  }
3163  if (KMP_AFFINITY_CAPABLE()) {
3164  if (__kmp_affinity_respect_mask) {
3165  __kmp_get_system_affinity(fullMask, TRUE);
3166 
3167  //
3168  // Count the number of available processors.
3169  //
3170  unsigned i;
3171  __kmp_avail_proc = 0;
3172  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3173  if (! KMP_CPU_ISSET(i, fullMask)) {
3174  continue;
3175  }
3176  __kmp_avail_proc++;
3177  }
3178  if (__kmp_avail_proc > __kmp_xproc) {
3179  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3180  && (__kmp_affinity_type != affinity_none))) {
3181  KMP_WARNING(ErrorInitializeAffinity);
3182  }
3183  __kmp_affinity_type = affinity_none;
3184  KMP_AFFINITY_DISABLE();
3185  return;
3186  }
3187  }
3188  else {
3189  __kmp_affinity_entire_machine_mask(fullMask);
3190  __kmp_avail_proc = __kmp_xproc;
3191  }
3192  }
3193 
3194  int depth = -1;
3195  kmp_i18n_id_t msg_id = kmp_i18n_null;
3196 
3197  //
3198  // For backward compatibility, setting KMP_CPUINFO_FILE =>
3199  // KMP_TOPOLOGY_METHOD=cpuinfo
3200  //
3201  if ((__kmp_cpuinfo_file != NULL) &&
3202  (__kmp_affinity_top_method == affinity_top_method_all)) {
3203  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3204  }
3205 
3206  if (__kmp_affinity_top_method == affinity_top_method_all) {
3207  //
3208  // In the default code path, errors are not fatal - we just try using
3209  // another method. We only emit a warning message if affinity is on,
 3210  // or the verbose flag is set, and the nowarnings flag was not set.
3211  //
3212  const char *file_name = NULL;
3213  int line = 0;
3214 
3215 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3216 
3217  if (__kmp_affinity_verbose) {
3218  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3219  }
3220 
3221  file_name = NULL;
3222  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3223  if (depth == 0) {
3224  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3225  KMP_ASSERT(address2os == NULL);
3226  return;
3227  }
3228 
3229  if (depth < 0) {
3230  if (__kmp_affinity_verbose) {
3231  if (msg_id != kmp_i18n_null) {
3232  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3233  KMP_I18N_STR(DecodingLegacyAPIC));
3234  }
3235  else {
3236  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3237  }
3238  }
3239 
3240  file_name = NULL;
3241  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3242  if (depth == 0) {
3243  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3244  KMP_ASSERT(address2os == NULL);
3245  return;
3246  }
3247  }
3248 
3249 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3250 
3251 # if KMP_OS_LINUX
3252 
3253  if (depth < 0) {
3254  if (__kmp_affinity_verbose) {
3255  if (msg_id != kmp_i18n_null) {
3256  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3257  }
3258  else {
3259  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3260  }
3261  }
3262 
3263  FILE *f = fopen("/proc/cpuinfo", "r");
3264  if (f == NULL) {
3265  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3266  }
3267  else {
3268  file_name = "/proc/cpuinfo";
3269  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3270  fclose(f);
3271  if (depth == 0) {
3272  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3273  KMP_ASSERT(address2os == NULL);
3274  return;
3275  }
3276  }
3277  }
3278 
3279 # endif /* KMP_OS_LINUX */
3280 
3281 # if KMP_GROUP_AFFINITY
3282 
3283  if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3284  if (__kmp_affinity_verbose) {
3285  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3286  }
3287 
3288  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3289  KMP_ASSERT(depth != 0);
3290  }
3291 
3292 # endif /* KMP_GROUP_AFFINITY */
3293 
3294  if (depth < 0) {
3295  if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3296  if (file_name == NULL) {
3297  KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3298  }
3299  else if (line == 0) {
3300  KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3301  }
3302  else {
3303  KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3304  }
3305  }
3306  // FIXME - print msg if msg_id = kmp_i18n_null ???
3307 
3308  file_name = "";
3309  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3310  if (depth == 0) {
3311  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3312  KMP_ASSERT(address2os == NULL);
3313  return;
3314  }
3315  KMP_ASSERT(depth > 0);
3316  KMP_ASSERT(address2os != NULL);
3317  }
3318  }
3319 
3320  //
 3321  // If the user has specified that a particular topology discovery method
3322  // is to be used, then we abort if that method fails. The exception is
3323  // group affinity, which might have been implicitly set.
3324  //
3325 
3326 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3327 
3328  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3329  if (__kmp_affinity_verbose) {
3330  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3331  KMP_I18N_STR(Decodingx2APIC));
3332  }
3333 
3334  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3335  if (depth == 0) {
3336  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3337  KMP_ASSERT(address2os == NULL);
3338  return;
3339  }
3340  if (depth < 0) {
3341  KMP_ASSERT(msg_id != kmp_i18n_null);
3342  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3343  }
3344  }
3345  else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3346  if (__kmp_affinity_verbose) {
3347  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3348  KMP_I18N_STR(DecodingLegacyAPIC));
3349  }
3350 
3351  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3352  if (depth == 0) {
3353  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3354  KMP_ASSERT(address2os == NULL);
3355  return;
3356  }
3357  if (depth < 0) {
3358  KMP_ASSERT(msg_id != kmp_i18n_null);
3359  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3360  }
3361  }
3362 
3363 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3364 
3365  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3366  const char *filename;
3367  if (__kmp_cpuinfo_file != NULL) {
3368  filename = __kmp_cpuinfo_file;
3369  }
3370  else {
3371  filename = "/proc/cpuinfo";
3372  }
3373 
3374  if (__kmp_affinity_verbose) {
3375  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3376  }
3377 
3378  FILE *f = fopen(filename, "r");
3379  if (f == NULL) {
3380  int code = errno;
3381  if (__kmp_cpuinfo_file != NULL) {
3382  __kmp_msg(
3383  kmp_ms_fatal,
3384  KMP_MSG(CantOpenFileForReading, filename),
3385  KMP_ERR(code),
3386  KMP_HNT(NameComesFrom_CPUINFO_FILE),
3387  __kmp_msg_null
3388  );
3389  }
3390  else {
3391  __kmp_msg(
3392  kmp_ms_fatal,
3393  KMP_MSG(CantOpenFileForReading, filename),
3394  KMP_ERR(code),
3395  __kmp_msg_null
3396  );
3397  }
3398  }
3399  int line = 0;
3400  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3401  fclose(f);
3402  if (depth < 0) {
3403  KMP_ASSERT(msg_id != kmp_i18n_null);
3404  if (line > 0) {
3405  KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3406  }
3407  else {
3408  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3409  }
3410  }
3411  if (__kmp_affinity_type == affinity_none) {
3412  KMP_ASSERT(depth == 0);
3413  KMP_ASSERT(address2os == NULL);
3414  return;
3415  }
3416  }
3417 
3418 # if KMP_GROUP_AFFINITY
3419 
3420  else if (__kmp_affinity_top_method == affinity_top_method_group) {
3421  if (__kmp_affinity_verbose) {
3422  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3423  }
3424 
3425  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3426  KMP_ASSERT(depth != 0);
3427  if (depth < 0) {
3428  KMP_ASSERT(msg_id != kmp_i18n_null);
3429  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3430  }
3431  }
3432 
3433 # endif /* KMP_GROUP_AFFINITY */
3434 
3435  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3436  if (__kmp_affinity_verbose) {
3437  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3438  }
3439 
3440  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3441  if (depth == 0) {
3442  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3443  KMP_ASSERT(address2os == NULL);
3444  return;
3445  }
3446  // should not fail
3447  KMP_ASSERT(depth > 0);
3448  KMP_ASSERT(address2os != NULL);
3449  }
3450 
3451  if (address2os == NULL) {
3452  if (KMP_AFFINITY_CAPABLE()
3453  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3454  && (__kmp_affinity_type != affinity_none)))) {
3455  KMP_WARNING(ErrorInitializeAffinity);
3456  }
3457  __kmp_affinity_type = affinity_none;
3458  KMP_AFFINITY_DISABLE();
3459  return;
3460  }
3461 
3462  __kmp_apply_thread_places(&address2os, depth);
3463 
3464  //
3465  // Create the table of masks, indexed by thread Id.
3466  //
3467  unsigned maxIndex;
3468  unsigned numUnique;
3469  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3470  address2os, __kmp_avail_proc);
3471  if (__kmp_affinity_gran_levels == 0) {
3472  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3473  }
3474 
3475  //
3476  // Set the childNums vector in all Address objects. This must be done
3477  // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3478  // which takes into account the setting of __kmp_affinity_compact.
3479  //
3480  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3481 
3482  switch (__kmp_affinity_type) {
3483 
3484  case affinity_explicit:
3485  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3486 # if OMP_40_ENABLED
3487  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3488 # endif
3489  {
3490  __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3491  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3492  maxIndex);
3493  }
3494 # if OMP_40_ENABLED
3495  else {
3496  __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3497  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3498  maxIndex);
3499  }
3500 # endif
3501  if (__kmp_affinity_num_masks == 0) {
3502  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3503  && (__kmp_affinity_type != affinity_none))) {
3504  KMP_WARNING(AffNoValidProcID);
3505  }
3506  __kmp_affinity_type = affinity_none;
3507  return;
3508  }
3509  break;
3510 
3511  //
3512  // The other affinity types rely on sorting the Addresses according
3513  // to some permutation of the machine topology tree. Set
3514  // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3515  // then jump to a common code fragment to do the sort and create
3516  // the array of affinity masks.
3517  //
3518 
3519  case affinity_logical:
3520  __kmp_affinity_compact = 0;
3521  if (__kmp_affinity_offset) {
3522  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3523  % __kmp_avail_proc;
3524  }
3525  goto sortAddresses;
3526 
3527  case affinity_physical:
3528  if (__kmp_nThreadsPerCore > 1) {
3529  __kmp_affinity_compact = 1;
3530  if (__kmp_affinity_compact >= depth) {
3531  __kmp_affinity_compact = 0;
3532  }
3533  } else {
3534  __kmp_affinity_compact = 0;
3535  }
3536  if (__kmp_affinity_offset) {
3537  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3538  % __kmp_avail_proc;
3539  }
3540  goto sortAddresses;
3541 
3542  case affinity_scatter:
3543  if (__kmp_affinity_compact >= depth) {
3544  __kmp_affinity_compact = 0;
3545  }
3546  else {
3547  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3548  }
3549  goto sortAddresses;
3550 
3551  case affinity_compact:
3552  if (__kmp_affinity_compact >= depth) {
3553  __kmp_affinity_compact = depth - 1;
3554  }
3555  goto sortAddresses;
3556 
3557  case affinity_balanced:
3558  // Balanced works only for the case of a single package
3559  if( nPackages > 1 ) {
3560  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3561  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3562  }
3563  __kmp_affinity_type = affinity_none;
3564  return;
3565  } else if( __kmp_affinity_uniform_topology() ) {
3566  break;
3567  } else { // Non-uniform topology
3568 
3569  // Save the depth for further usage
3570  __kmp_aff_depth = depth;
3571 
3572  // Number of hyper threads per core in HT machine
3573  int nth_per_core = __kmp_nThreadsPerCore;
3574 
3575  int core_level;
3576  if( nth_per_core > 1 ) {
3577  core_level = depth - 2;
3578  } else {
3579  core_level = depth - 1;
3580  }
3581  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3582  int nproc = nth_per_core * ncores;
3583 
3584  procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3585  for( int i = 0; i < nproc; i++ ) {
3586  procarr[ i ] = -1;
3587  }
3588 
3589  for( int i = 0; i < __kmp_avail_proc; i++ ) {
3590  int proc = address2os[ i ].second;
3591  // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3592  // If there is only one thread per core then depth == 2: level 0 - package,
3593  // level 1 - core.
3594  int level = depth - 1;
3595 
 3596  // Default when there is only one thread context per core (__kmp_nth_per_core == 1)
3597  int thread = 0;
3598  int core = address2os[ i ].first.labels[ level ];
3599  // If the thread level exists, that is we have more than one thread context per core
3600  if( nth_per_core > 1 ) {
3601  thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3602  core = address2os[ i ].first.labels[ level - 1 ];
3603  }
3604  procarr[ core * nth_per_core + thread ] = proc;
3605  }
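 // Illustrative note (not part of the original source): procarr is laid out
 // as procarr[core * nth_per_core + thread]; e.g. with 4 cores and 2 hardware
 // threads per core it has 8 slots, slot 2*c + t holding the OS proc id bound
 // to thread context t of core c, or -1 if that context is not in the
 // available set.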
3606 
3607  break;
3608  }
3609 
3610  sortAddresses:
3611  //
3612  // Allocate the gtid->affinity mask table.
3613  //
3614  if (__kmp_affinity_dups) {
3615  __kmp_affinity_num_masks = __kmp_avail_proc;
3616  }
3617  else {
3618  __kmp_affinity_num_masks = numUnique;
3619  }
3620 
3621 # if OMP_40_ENABLED
3622  if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3623  && ( __kmp_affinity_num_places > 0 )
3624  && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3625  __kmp_affinity_num_masks = __kmp_affinity_num_places;
3626  }
3627 # endif
3628 
3629  __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3630  __kmp_affinity_num_masks * __kmp_affin_mask_size);
3631 
3632  //
3633  // Sort the address2os table according to the current setting of
3634  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3635  //
3636  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3637  __kmp_affinity_cmp_Address_child_num);
3638  {
3639  int i;
3640  unsigned j;
3641  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3642  if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3643  continue;
3644  }
3645  unsigned osId = address2os[i].second;
3646  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3647  kmp_affin_mask_t *dest
3648  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3649  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3650  KMP_CPU_COPY(dest, src);
3651  if (++j >= __kmp_affinity_num_masks) {
3652  break;
3653  }
3654  }
3655  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3656  }
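 // At this point __kmp_affinity_masks[0 .. __kmp_affinity_num_masks-1] holds
 // one affinity mask per place, in the order produced by the sort above;
 // threads are later mapped onto these places round-robin in
 // __kmp_affinity_set_init_mask.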
3657  break;
3658 
3659  default:
3660  KMP_ASSERT2(0, "Unexpected affinity setting");
3661  }
3662 
3663  __kmp_free(osId2Mask);
3664  machine_hierarchy.init(address2os, __kmp_avail_proc);
3665 }
3666 
3667 
3668 void
3669 __kmp_affinity_initialize(void)
3670 {
3671  //
3672  // Much of the code above was written assuming that if a machine was not
3673  // affinity capable, then __kmp_affinity_type == affinity_none. We now
3674  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3675  //
3676  // There are too many checks for __kmp_affinity_type == affinity_none
3677  // in this code. Instead of trying to change them all, check if
3678  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3679  // affinity_none, call the real initialization routine, then restore
3680  // __kmp_affinity_type to affinity_disabled.
3681  //
3682  int disabled = (__kmp_affinity_type == affinity_disabled);
3683  if (! KMP_AFFINITY_CAPABLE()) {
3684  KMP_ASSERT(disabled);
3685  }
3686  if (disabled) {
3687  __kmp_affinity_type = affinity_none;
3688  }
3689  __kmp_aux_affinity_initialize();
3690  if (disabled) {
3691  __kmp_affinity_type = affinity_disabled;
3692  }
3693 }
3694 
3695 
3696 void
3697 __kmp_affinity_uninitialize(void)
3698 {
3699  if (__kmp_affinity_masks != NULL) {
3700  __kmp_free(__kmp_affinity_masks);
3701  __kmp_affinity_masks = NULL;
3702  }
3703  if (fullMask != NULL) {
3704  KMP_CPU_FREE(fullMask);
3705  fullMask = NULL;
3706  }
3707  __kmp_affinity_num_masks = 0;
3708 # if OMP_40_ENABLED
3709  __kmp_affinity_num_places = 0;
3710 # endif
3711  if (__kmp_affinity_proclist != NULL) {
3712  __kmp_free(__kmp_affinity_proclist);
3713  __kmp_affinity_proclist = NULL;
3714  }
3715  if( address2os != NULL ) {
3716  __kmp_free( address2os );
3717  address2os = NULL;
3718  }
3719  if( procarr != NULL ) {
3720  __kmp_free( procarr );
3721  procarr = NULL;
3722  }
3723 }
3724 
3725 
3726 void
3727 __kmp_affinity_set_init_mask(int gtid, int isa_root)
3728 {
3729  if (! KMP_AFFINITY_CAPABLE()) {
3730  return;
3731  }
3732 
3733  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3734  if (th->th.th_affin_mask == NULL) {
3735  KMP_CPU_ALLOC(th->th.th_affin_mask);
3736  }
3737  else {
3738  KMP_CPU_ZERO(th->th.th_affin_mask);
3739  }
3740 
3741  //
3742  // Copy the thread mask to the kmp_info_t structure.
3743  // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
3744  // that has all of the OS proc ids set; or, if __kmp_affinity_respect_mask
3745  // is set, the full mask is the same as the mask of the initialization
3746  // thread.
3747  //
3748  kmp_affin_mask_t *mask;
3749  int i;
3750 
3751 # if OMP_40_ENABLED
3752  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3753 # endif
3754  {
3755  if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
3756  ) {
3757 # if KMP_GROUP_AFFINITY
3758  if (__kmp_num_proc_groups > 1) {
3759  return;
3760  }
3761 # endif
3762  KMP_ASSERT(fullMask != NULL);
3763  i = KMP_PLACE_ALL;
3764  mask = fullMask;
3765  }
3766  else {
3767  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
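 // The modulo walk assigns threads to masks round-robin; for example, with
 // 4 masks and __kmp_affinity_offset == 1, gtids 0, 1, 2, 3 are bound to
 // masks 1, 2, 3, 0 respectively.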
3768  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3769  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3770  }
3771  }
3772 # if OMP_40_ENABLED
3773  else {
3774  if ((! isa_root)
3775  || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
3776 # if KMP_GROUP_AFFINITY
3777  if (__kmp_num_proc_groups > 1) {
3778  return;
3779  }
3780 # endif
3781  KMP_ASSERT(fullMask != NULL);
3782  i = KMP_PLACE_ALL;
3783  mask = fullMask;
3784  }
3785  else {
3786  //
3787  // int i = some hash function or just a counter that doesn't
3788  // always start at 0. Use gtid for now.
3789  //
3790  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3791  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3792  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3793  }
3794  }
3795 # endif
3796 
3797 # if OMP_40_ENABLED
3798  th->th.th_current_place = i;
3799  if (isa_root) {
3800  th->th.th_new_place = i;
3801  th->th.th_first_place = 0;
3802  th->th.th_last_place = __kmp_affinity_num_masks - 1;
3803  }
3804 
3805  if (i == KMP_PLACE_ALL) {
3806  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
3807  gtid));
3808  }
3809  else {
3810  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
3811  gtid, i));
3812  }
3813 # else
3814  if (i == -1) {
3815  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
3816  gtid));
3817  }
3818  else {
3819  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
3820  gtid, i));
3821  }
3822 # endif /* OMP_40_ENABLED */
3823 
3824  KMP_CPU_COPY(th->th.th_affin_mask, mask);
3825 
3826  if (__kmp_affinity_verbose) {
3827  char buf[KMP_AFFIN_MASK_PRINT_LEN];
3828  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3829  th->th.th_affin_mask);
3830  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
3831  buf);
3832  }
3833 
3834 # if KMP_OS_WINDOWS
3835  //
3836  // On Windows* OS, the process affinity mask might have changed.
3837  // If the user didn't request affinity and this call fails,
3838  // just continue silently. See CQ171393.
3839  //
3840  if ( __kmp_affinity_type == affinity_none ) {
3841  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
3842  }
3843  else
3844 # endif
3845  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
3846 }
3847 
3848 
3849 # if OMP_40_ENABLED
3850 
3851 void
3852 __kmp_affinity_set_place(int gtid)
3853 {
3854  int retval;
3855 
3856  if (! KMP_AFFINITY_CAPABLE()) {
3857  return;
3858  }
3859 
3860  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3861 
3862  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
3863  gtid, th->th.th_new_place, th->th.th_current_place));
3864 
3865  //
3866  // Check that the new place is within this thread's partition.
3867  //
3868  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3869  KMP_ASSERT(th->th.th_new_place >= 0);
3870  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
3871  if (th->th.th_first_place <= th->th.th_last_place) {
3872  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
3873  && (th->th.th_new_place <= th->th.th_last_place));
3874  }
3875  else {
3876  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
3877  || (th->th.th_new_place >= th->th.th_last_place));
3878  }
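 // th_first_place > th_last_place denotes a place partition that wraps around
 // the end of the place list (e.g. places {6, 7, 0, 1} on an 8-place machine),
 // hence the inverted comparison in the else branch.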
3879 
3880  //
3881  // Copy the thread mask to the kmp_info_t structure,
3882  // and set this thread's affinity.
3883  //
3884  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
3885  th->th.th_new_place);
3886  KMP_CPU_COPY(th->th.th_affin_mask, mask);
3887  th->th.th_current_place = th->th.th_new_place;
3888 
3889  if (__kmp_affinity_verbose) {
3890  char buf[KMP_AFFIN_MASK_PRINT_LEN];
3891  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3892  th->th.th_affin_mask);
3893  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
3894  gtid, buf);
3895  }
3896  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
3897 }
3898 
3899 # endif /* OMP_40_ENABLED */
3900 
3901 
3902 int
3903 __kmp_aux_set_affinity(void **mask)
3904 {
3905  int gtid;
3906  kmp_info_t *th;
3907  int retval;
3908 
3909  if (! KMP_AFFINITY_CAPABLE()) {
3910  return -1;
3911  }
3912 
3913  gtid = __kmp_entry_gtid();
3914  KA_TRACE(1000, ;{
3915  char buf[KMP_AFFIN_MASK_PRINT_LEN];
3916  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3917  (kmp_affin_mask_t *)(*mask));
3918  __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
3919  gtid, buf);
3920  });
3921 
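 // When consistency checking is enabled, reject a NULL or empty mask, any proc
 // outside the machine's full mask and, when processor groups are in use, a
 // mask that cannot be mapped to a single processor group.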
3922  if (__kmp_env_consistency_check) {
3923  if ((mask == NULL) || (*mask == NULL)) {
3924  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3925  }
3926  else {
3927  unsigned proc;
3928  int num_procs = 0;
3929 
3930  for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
3931  if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
3932  continue;
3933  }
3934  num_procs++;
3935  if (! KMP_CPU_ISSET(proc, fullMask)) {
3936  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3937  break;
3938  }
3939  }
3940  if (num_procs == 0) {
3941  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3942  }
3943 
3944 # if KMP_GROUP_AFFINITY
3945  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
3946  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3947  }
3948 # endif /* KMP_GROUP_AFFINITY */
3949 
3950  }
3951  }
3952 
3953  th = __kmp_threads[gtid];
3954  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3955  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
3956  if (retval == 0) {
3957  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
3958  }
3959 
3960 # if OMP_40_ENABLED
3961  th->th.th_current_place = KMP_PLACE_UNDEFINED;
3962  th->th.th_new_place = KMP_PLACE_UNDEFINED;
3963  th->th.th_first_place = 0;
3964  th->th.th_last_place = __kmp_affinity_num_masks - 1;
3965 
3966  //
3967  // Turn off 4.0 affinity for the current thread at this parallel level.
3968  //
3969  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
3970 # endif
3971 
3972  return retval;
3973 }
3974 
3975 
3976 int
3977 __kmp_aux_get_affinity(void **mask)
3978 {
3979  int gtid;
3980  int retval;
3981  kmp_info_t *th;
3982 
3983  if (! KMP_AFFINITY_CAPABLE()) {
3984  return -1;
3985  }
3986 
3987  gtid = __kmp_entry_gtid();
3988  th = __kmp_threads[gtid];
3989  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3990 
3991  KA_TRACE(1000, ;{
3992  char buf[KMP_AFFIN_MASK_PRINT_LEN];
3993  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3994  th->th.th_affin_mask);
3995  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
3996  });
3997 
3998  if (__kmp_env_consistency_check) {
3999  if ((mask == NULL) || (*mask == NULL)) {
4000  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4001  }
4002  }
4003 
4004 # if !KMP_OS_WINDOWS
4005 
4006  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4007  KA_TRACE(1000, ;{
4008  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4009  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4010  (kmp_affin_mask_t *)(*mask));
4011  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4012  });
4013  return retval;
4014 
4015 # else
4016 
4017  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4018  return 0;
4019 
4020 # endif /* KMP_OS_WINDOWS */
4021 
4022 }
4023 
4024 int
4025 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4026 {
4027  int retval;
4028 
4029  if (! KMP_AFFINITY_CAPABLE()) {
4030  return -1;
4031  }
4032 
4033  KA_TRACE(1000, ;{
4034  int gtid = __kmp_entry_gtid();
4035  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4036  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4037  (kmp_affin_mask_t *)(*mask));
4038  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4039  proc, gtid, buf);
4040  });
4041 
4042  if (__kmp_env_consistency_check) {
4043  if ((mask == NULL) || (*mask == NULL)) {
4044  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4045  }
4046  }
4047 
4048  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4049  return -1;
4050  }
4051  if (! KMP_CPU_ISSET(proc, fullMask)) {
4052  return -2;
4053  }
4054 
4055  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4056  return 0;
4057 }
4058 
4059 
4060 int
4061 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4062 {
4063  int retval;
4064 
4065  if (! KMP_AFFINITY_CAPABLE()) {
4066  return -1;
4067  }
4068 
4069  KA_TRACE(1000, ;{
4070  int gtid = __kmp_entry_gtid();
4071  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4072  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4073  (kmp_affin_mask_t *)(*mask));
4074  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4075  proc, gtid, buf);
4076  });
4077 
4078  if (__kmp_env_consistency_check) {
4079  if ((mask == NULL) || (*mask == NULL)) {
4080  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4081  }
4082  }
4083 
4084  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4085  return -1;
4086  }
4087  if (! KMP_CPU_ISSET(proc, fullMask)) {
4088  return -2;
4089  }
4090 
4091  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4092  return 0;
4093 }
4094 
4095 
4096 int
4097 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4098 {
4099  int retval;
4100 
4101  if (! KMP_AFFINITY_CAPABLE()) {
4102  return -1;
4103  }
4104 
4105  KA_TRACE(1000, ;{
4106  int gtid = __kmp_entry_gtid();
4107  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4108  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4109  (kmp_affin_mask_t *)(*mask));
4110  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4111  proc, gtid, buf);
4112  });
4113 
4114  if (__kmp_env_consistency_check) {
4115  if ((mask == NULL) || (*mask == NULL)) {
4116  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4117  }
4118  }
4119 
4120  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4121  return 0;
4122  }
4123  if (! KMP_CPU_ISSET(proc, fullMask)) {
4124  return 0;
4125  }
4126 
4127  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4128 }
4129 
4130 
4131 // Dynamic affinity settings - Affinity balanced
4132 void __kmp_balanced_affinity( int tid, int nthreads )
4133 {
4134  if( __kmp_affinity_uniform_topology() ) {
4135  int coreID;
4136  int threadID;
4137  // Number of hyper-threads per core on an HT machine
4138  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4139  // Number of cores
4140  int ncores = __kmp_ncores;
4141  // How many threads will be bound to each core
4142  int chunk = nthreads / ncores;
4143  // How many cores will have an additional thread bound to them - the "big cores"
4144  int big_cores = nthreads % ncores;
4145  // Number of threads on the big cores
4146  int big_nth = ( chunk + 1 ) * big_cores;
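 // Worked example (purely illustrative): nthreads == 10 on ncores == 4 gives
 // chunk == 2, big_cores == 2 and big_nth == 6, so tids 0..5 go to the two
 // "big" cores (3 threads each) and tids 6..9 to the remaining two cores
 // (2 threads each).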
4147  if( tid < big_nth ) {
4148  coreID = tid / (chunk + 1 );
4149  threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4150  } else { //tid >= big_nth
4151  coreID = ( tid - big_cores ) / chunk;
4152  threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4153  }
4154 
4155  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4156  "Illegal set affinity operation when not capable");
4157 
4158  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4159  KMP_CPU_ZERO(mask);
4160 
4161  // Granularity == thread
4162  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4163  int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4164  KMP_CPU_SET( osID, mask);
4165  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4166  for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4167  int osID;
4168  osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4169  KMP_CPU_SET( osID, mask);
4170  }
4171  }
4172  if (__kmp_affinity_verbose) {
4173  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4174  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4175  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4176  tid, buf);
4177  }
4178  __kmp_set_system_affinity( mask, TRUE );
4179  } else { // Non-uniform topology
4180 
4181  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4182  KMP_CPU_ZERO(mask);
4183 
4184  // Number of hyper-threads per core on an HT machine
4185  int nth_per_core = __kmp_nThreadsPerCore;
4186  int core_level;
4187  if( nth_per_core > 1 ) {
4188  core_level = __kmp_aff_depth - 2;
4189  } else {
4190  core_level = __kmp_aff_depth - 1;
4191  }
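 // With hyper-threading the innermost level of the address labels is the
 // thread context, so the core index sits one level up; otherwise the
 // innermost level is the core itself.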
4192 
4193  // Number of cores - maximum value; it does not count trailing cores with 0 processors
4194  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4195 
4196  // For a performance gain, handle the special case nthreads == __kmp_avail_proc separately
4197  if( nthreads == __kmp_avail_proc ) {
4198  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4199  int osID = address2os[ tid ].second;
4200  KMP_CPU_SET( osID, mask);
4201  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4202  int coreID = address2os[ tid ].first.labels[ core_level ];
4203  // Count the osIDs found for the current core; there can be at most nth_per_core of them.
4204  // Since address2os is sorted, we can break once cnt == nth_per_core.
4205  int cnt = 0;
4206  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4207  int osID = address2os[ i ].second;
4208  int core = address2os[ i ].first.labels[ core_level ];
4209  if( core == coreID ) {
4210  KMP_CPU_SET( osID, mask);
4211  cnt++;
4212  if( cnt == nth_per_core ) {
4213  break;
4214  }
4215  }
4216  }
4217  }
4218  } else if( nthreads <= __kmp_ncores ) {
4219 
4220  int core = 0;
4221  for( int i = 0; i < ncores; i++ ) {
4222  // Check if this core from procarr[] is in the mask
4223  int in_mask = 0;
4224  for( int j = 0; j < nth_per_core; j++ ) {
4225  if( procarr[ i * nth_per_core + j ] != - 1 ) {
4226  in_mask = 1;
4227  break;
4228  }
4229  }
4230  if( in_mask ) {
4231  if( tid == core ) {
4232  for( int j = 0; j < nth_per_core; j++ ) {
4233  int osID = procarr[ i * nth_per_core + j ];
4234  if( osID != -1 ) {
4235  KMP_CPU_SET( osID, mask );
4236  // For granularity=thread it is enough to set the first available osID for this core
4237  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4238  break;
4239  }
4240  }
4241  }
4242  break;
4243  } else {
4244  core++;
4245  }
4246  }
4247  }
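 // In this branch each thread owns a whole core: thread tid is bound to the
 // tid-th core that has at least one available processor, and for
 // granularity=thread only that core's first available context is set.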
4248 
4249  } else { // nthreads > __kmp_ncores
4250 
4251  // Array to save the number of processors at each core
4252  int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
4253  // Array to save the number of cores with "x" available processors;
4254  int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4255  // Array to save the number of cores with at least x available processors (x to nth_per_core)
4256  int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4257 
4258  for( int i = 0; i <= nth_per_core; i++ ) {
4259  ncores_with_x_procs[ i ] = 0;
4260  ncores_with_x_to_max_procs[ i ] = 0;
4261  }
4262 
4263  for( int i = 0; i < ncores; i++ ) {
4264  int cnt = 0;
4265  for( int j = 0; j < nth_per_core; j++ ) {
4266  if( procarr[ i * nth_per_core + j ] != -1 ) {
4267  cnt++;
4268  }
4269  }
4270  nproc_at_core[ i ] = cnt;
4271  ncores_with_x_procs[ cnt ]++;
4272  }
4273 
4274  for( int i = 0; i <= nth_per_core; i++ ) {
4275  for( int j = i; j <= nth_per_core; j++ ) {
4276  ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4277  }
4278  }
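 // Illustration: with nth_per_core == 2 and per-core proc counts { 2, 1, 2, 0 },
 // ncores_with_x_procs becomes { 1, 1, 2 } and ncores_with_x_to_max_procs
 // becomes { 4, 3, 2 } (cores with at least 0, 1 and 2 available procs).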
4279 
4280  // Max number of processors
4281  int nproc = nth_per_core * ncores;
4282  // An array to keep the number of threads assigned to each hardware context
4283  int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4284  for( int i = 0; i < nproc; i++ ) {
4285  newarr[ i ] = 0;
4286  }
4287 
4288  int nth = nthreads;
4289  int flag = 0;
4290  while( nth > 0 ) {
4291  for( int j = 1; j <= nth_per_core; j++ ) {
4292  int cnt = ncores_with_x_to_max_procs[ j ];
4293  for( int i = 0; i < ncores; i++ ) {
4294  // Skip the core with 0 processors
4295  if( nproc_at_core[ i ] == 0 ) {
4296  continue;
4297  }
4298  for( int k = 0; k < nth_per_core; k++ ) {
4299  if( procarr[ i * nth_per_core + k ] != -1 ) {
4300  if( newarr[ i * nth_per_core + k ] == 0 ) {
4301  newarr[ i * nth_per_core + k ] = 1;
4302  cnt--;
4303  nth--;
4304  break;
4305  } else {
4306  if( flag != 0 ) {
4307  newarr[ i * nth_per_core + k ] ++;
4308  cnt--;
4309  nth--;
4310  break;
4311  }
4312  }
4313  }
4314  }
4315  if( cnt == 0 || nth == 0 ) {
4316  break;
4317  }
4318  }
4319  if( nth == 0 ) {
4320  break;
4321  }
4322  }
4323  flag = 1;
4324  }
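 // newarr[ i ] now holds how many of the nthreads threads were assigned to
 // hardware context i: each available context receives one thread first, and
 // only on later passes (flag != 0) do contexts start to accumulate extras.
 // The prefix-sum loop below then binds thread tid to the first context whose
 // cumulative count exceeds tid.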
4325  int sum = 0;
4326  for( int i = 0; i < nproc; i++ ) {
4327  sum += newarr[ i ];
4328  if( sum > tid ) {
4329  // Granularity == thread
4330  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4331  int osID = procarr[ i ];
4332  KMP_CPU_SET( osID, mask);
4333  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4334  int coreID = i / nth_per_core;
4335  for( int ii = 0; ii < nth_per_core; ii++ ) {
4336  int osID = procarr[ coreID * nth_per_core + ii ];
4337  if( osID != -1 ) {
4338  KMP_CPU_SET( osID, mask);
4339  }
4340  }
4341  }
4342  break;
4343  }
4344  }
4345  __kmp_free( newarr );
4346  }
4347 
4348  if (__kmp_affinity_verbose) {
4349  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4350  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4351  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4352  tid, buf);
4353  }
4354  __kmp_set_system_affinity( mask, TRUE );
4355  }
4356 }
4357 
4358 #endif // KMP_AFFINITY_SUPPORTED