[PATCH] sched: sched tuning
/*
 * arch/ia64/kernel/domain.c
 * Architecture specific sched-domains builder.
 *
 * Copyright (C) 2004 Jesse Barnes
 * Copyright (C) 2004 Silicon Graphics, Inc.
 */

#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/nodemask.h>

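/*
 * Maximum number of nodes spanned by a node-level sched_domain.  Larger
 * systems get an additional "allnodes" domain level on top.
 */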
#define SD_NODES_PER_DOMAIN 16

#ifdef CONFIG_NUMA
/**
 * find_next_best_node - find the next node to include in a sched_domain
 * @node: node whose sched_domain we're building
 * @used_nodes: nodes already in the sched_domain
 *
 * Find the next node to include in a given scheduling domain.  Simply
 * finds the closest node not already in the @used_nodes map.
 *
 * Should use nodemask_t.
 */
static int __devinit find_next_best_node(int node, unsigned long *used_nodes)
{
        int i, n, val, min_val, best_node = 0;

        min_val = INT_MAX;

        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Start at @node */
                n = (node + i) % MAX_NUMNODES;

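                /* Skip nodes without any CPUs */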
                if (!nr_cpus_node(n))
                        continue;

                /* Skip already used nodes */
                if (test_bit(n, used_nodes))
                        continue;

                /* Simple min distance search */
                val = node_distance(node, n);

                if (val < min_val) {
                        min_val = val;
                        best_node = n;
                }
        }

        set_bit(best_node, used_nodes);
        return best_node;
}

/**
 * sched_domain_node_span - get a cpumask for a node's sched_domain
 * @node: node whose cpumask we're constructing
 *
 * Given a node, construct a good cpumask for its sched_domain to span.  It
 * should be one that prevents unnecessary balancing, but also spreads tasks
 * out optimally.
 */
static cpumask_t __devinit sched_domain_node_span(int node)
{
        int i;
        cpumask_t span, nodemask;
        DECLARE_BITMAP(used_nodes, MAX_NUMNODES);

        cpus_clear(span);
        bitmap_zero(used_nodes, MAX_NUMNODES);

        nodemask = node_to_cpumask(node);
        cpus_or(span, span, nodemask);
        set_bit(node, used_nodes);

        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, used_nodes);
                nodemask = node_to_cpumask(next_node);
                cpus_or(span, span, nodemask);
        }

        return span;
}
#endif

/*
 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
 * can switch it on easily if needed.
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
static int __devinit cpu_to_cpu_group(int cpu)
{
        return cpu;
}
#endif

static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
static int __devinit cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
        return first_cpu(cpu_sibling_map[cpu]);
#else
        return cpu;
#endif
}

#ifdef CONFIG_NUMA
/*
 * init_sched_build_groups() can't handle what we want to do with node
 * groups, so roll our own.  Each node gets its own dynamically allocated
 * list of groups.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group *sched_group_nodes[MAX_NUMNODES];

static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];

static int __devinit cpu_to_allnodes_group(int cpu)
{
        return cpu_to_node(cpu);
}
#endif

/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 */
void __devinit arch_init_sched_domains(void)
{
        int i;
        cpumask_t cpu_default_map;

        /*
         * Set up the mask of cpus without special case scheduling
         * requirements.  For now this just excludes isolated cpus, but could
         * be used to exclude other special cases in the future.
         */
        cpus_complement(cpu_default_map, cpu_isolated_map);
        cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);

        /*
         * Set up domains.  Isolated domains just stay on the dummy domain.
         */
        for_each_cpu_mask(i, cpu_default_map) {
                int group;
                struct sched_domain *sd = NULL, *p;
                cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));

                cpus_and(nodemask, nodemask, cpu_default_map);

#ifdef CONFIG_NUMA
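                /*
                 * When the machine has more CPUs than a single node-level
                 * domain can span (SD_NODES_PER_DOMAIN nodes' worth), add a
                 * top-level domain covering all non-isolated online CPUs.
                 */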
                if (num_online_cpus()
                                > SD_NODES_PER_DOMAIN * cpus_weight(nodemask)) {
                        sd = &per_cpu(allnodes_domains, i);
                        *sd = SD_ALLNODES_INIT;
                        sd->span = cpu_default_map;
                        group = cpu_to_allnodes_group(i);
                        sd->groups = &sched_group_allnodes[group];
                        p = sd;
                } else
                        p = NULL;

                sd = &per_cpu(node_domains, i);
                *sd = SD_NODE_INIT;
                sd->span = sched_domain_node_span(cpu_to_node(i));
                sd->parent = p;
                cpus_and(sd->span, sd->span, cpu_default_map);
#endif

                p = sd;
                sd = &per_cpu(phys_domains, i);
                group = cpu_to_phys_group(i);
                *sd = SD_CPU_INIT;
                sd->span = nodemask;
                sd->parent = p;
                sd->groups = &sched_group_phys[group];

#ifdef CONFIG_SCHED_SMT
                p = sd;
                sd = &per_cpu(cpu_domains, i);
                group = cpu_to_cpu_group(i);
                *sd = SD_SIBLING_INIT;
                sd->span = cpu_sibling_map[i];
                cpus_and(sd->span, sd->span, cpu_default_map);
                sd->parent = p;
                sd->groups = &sched_group_cpus[group];
#endif
        }

#ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
        for_each_cpu_mask(i, cpu_default_map) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
                cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
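                /* Build each sibling group set only once, from its first CPU */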
                if (i != first_cpu(this_sibling_map))
                        continue;

                init_sched_build_groups(sched_group_cpus, this_sibling_map,
                                                &cpu_to_cpu_group);
        }
#endif

        /* Set up physical groups */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);

                cpus_and(nodemask, nodemask, cpu_default_map);
                if (cpus_empty(nodemask))
                        continue;

                init_sched_build_groups(sched_group_phys, nodemask,
                                                &cpu_to_phys_group);
        }

#ifdef CONFIG_NUMA
        init_sched_build_groups(sched_group_allnodes, cpu_default_map,
                                &cpu_to_allnodes_group);

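        /*
         * Each node gets a circular list of sched_groups: the first group
         * covers the node's own CPUs and the rest cover one additional node
         * each from the node's domain span.
         */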
        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
                cpumask_t nodemask = node_to_cpumask(i);
                cpumask_t domainspan;
                cpumask_t covered = CPU_MASK_NONE;
                int j;

                cpus_and(nodemask, nodemask, cpu_default_map);
                if (cpus_empty(nodemask))
                        continue;

                domainspan = sched_domain_node_span(i);
                cpus_and(domainspan, domainspan, cpu_default_map);

                sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
                sched_group_nodes[i] = sg;
                for_each_cpu_mask(j, nodemask) {
                        struct sched_domain *sd;
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                        if (sd->groups == NULL) {
                                /* Turn off balancing if we have no groups */
                                sd->flags = 0;
                        }
                }
                if (!sg) {
                        printk(KERN_WARNING
                        "Can not alloc domain group for node %d\n", i);
                        continue;
                }
                sg->cpu_power = 0;
                sg->cpumask = nodemask;
                cpus_or(covered, covered, nodemask);
                prev = sg;

                for (j = 0; j < MAX_NUMNODES; j++) {
                        cpumask_t tmp, notcovered;
                        int n = (i + j) % MAX_NUMNODES;

                        cpus_complement(notcovered, covered);
                        cpus_and(tmp, notcovered, cpu_default_map);
                        cpus_and(tmp, tmp, domainspan);
                        if (cpus_empty(tmp))
                                break;

                        nodemask = node_to_cpumask(n);
                        cpus_and(tmp, tmp, nodemask);
                        if (cpus_empty(tmp))
                                continue;

                        sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
                        if (!sg) {
                                printk(KERN_WARNING
                                "Can not alloc domain group for node %d\n", n);
                                break;
                        }
                        sg->cpu_power = 0;
                        sg->cpumask = tmp;
                        cpus_or(covered, covered, tmp);
                        prev->next = sg;
                        prev = sg;
                }
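                /* Link the last group back to the first, closing the circle */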
                prev->next = sched_group_nodes[i];
        }
#endif

        /* Calculate CPU power for physical packages and nodes */
        for_each_cpu_mask(i, cpu_default_map) {
                int power;
                struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
                power = SCHED_LOAD_SCALE;
                sd->groups->cpu_power = power;
#endif

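                /*
                 * A physical package contributes SCHED_LOAD_SCALE plus 10%
                 * of SCHED_LOAD_SCALE for each additional CPU it contains.
                 */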
                sd = &per_cpu(phys_domains, i);
                power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                sd->groups->cpu_power = power;

#ifdef CONFIG_NUMA
                sd = &per_cpu(allnodes_domains, i);
                if (sd->groups) {
                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                        sd->groups->cpu_power = power;
                }
#endif
        }

#ifdef CONFIG_NUMA
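        /*
         * A node group's power is the sum of the power of each distinct
         * physical package in its span.
         */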
        for (i = 0; i < MAX_NUMNODES; i++) {
                struct sched_group *sg = sched_group_nodes[i];
                int j;

                if (sg == NULL)
                        continue;
next_sg:
                for_each_cpu_mask(j, sg->cpumask) {
                        struct sched_domain *sd;
                        int power;

                        sd = &per_cpu(phys_domains, j);
                        if (j != first_cpu(sd->groups->cpumask)) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
                                 */
                                continue;
                        }
                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;

                        sg->cpu_power += power;
                }
                sg = sg->next;
                if (sg != sched_group_nodes[i])
                        goto next_sg;
        }
#endif

        /* Attach the domains */
        for_each_online_cpu(i) {
                struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
#else
                sd = &per_cpu(phys_domains, i);
#endif
                cpu_attach_domain(sd, i);
        }
}

void __devinit arch_destroy_sched_domains(void)
{
#ifdef CONFIG_NUMA
        int i;
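
        /* Free each node's circular list of dynamically allocated groups */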
        for (i = 0; i < MAX_NUMNODES; i++) {
                struct sched_group *oldsg, *sg = sched_group_nodes[i];
                if (sg == NULL)
                        continue;
                sg = sg->next;
next_sg:
                oldsg = sg;
                sg = sg->next;
                kfree(oldsg);
                if (oldsg != sched_group_nodes[i])
                        goto next_sg;
                sched_group_nodes[i] = NULL;
        }
#endif
}