[PATCH] sched: sched tuning
/*
 * arch/ia64/kernel/domain.c
 * Architecture specific sched-domains builder.
 *
 * Copyright (C) 2004 Jesse Barnes
 * Copyright (C) 2004 Silicon Graphics, Inc.
 */

#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/nodemask.h>

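/*
 * Maximum number of nodes spanned by a node-level sched_domain.  Larger
 * systems get an additional "allnodes" domain level on top.
 */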
#define SD_NODES_PER_DOMAIN 16

#ifdef CONFIG_NUMA
/**
 * find_next_best_node - find the next node to include in a sched_domain
 * @node: node whose sched_domain we're building
 * @used_nodes: nodes already in the sched_domain
 *
 * Find the next node to include in a given scheduling domain.  Simply
 * finds the closest node not already in the @used_nodes map.
 *
 * Should use nodemask_t.
 */
static int __devinit find_next_best_node(int node, unsigned long *used_nodes)
{
        int i, n, val, min_val, best_node = 0;

        min_val = INT_MAX;

        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Start at @node */
                n = (node + i) % MAX_NUMNODES;

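                /* Skip nodes without any CPUs */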
                if (!nr_cpus_node(n))
                        continue;

                /* Skip already used nodes */
                if (test_bit(n, used_nodes))
                        continue;

                /* Simple min distance search */
                val = node_distance(node, n);

                if (val < min_val) {
                        min_val = val;
                        best_node = n;
                }
        }

        set_bit(best_node, used_nodes);
        return best_node;
}

/**
 * sched_domain_node_span - get a cpumask for a node's sched_domain
 * @node: node whose cpumask we're constructing
 *
 * Given a node, construct a good cpumask for its sched_domain to span.  It
 * should be one that prevents unnecessary balancing, but also spreads tasks
 * out optimally.
 */
static cpumask_t __devinit sched_domain_node_span(int node)
{
        int i;
        cpumask_t span, nodemask;
        DECLARE_BITMAP(used_nodes, MAX_NUMNODES);

        cpus_clear(span);
        bitmap_zero(used_nodes, MAX_NUMNODES);

        nodemask = node_to_cpumask(node);
        cpus_or(span, span, nodemask);
        set_bit(node, used_nodes);

        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, used_nodes);
                nodemask = node_to_cpumask(next_node);
                cpus_or(span, span, nodemask);
        }

        return span;
}
#endif

/*
 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
 * can switch it on easily if needed.
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
static int __devinit cpu_to_cpu_group(int cpu)
{
        return cpu;
}
#endif

static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
static int __devinit cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
        return first_cpu(cpu_sibling_map[cpu]);
#else
        return cpu;
#endif
}

#ifdef CONFIG_NUMA
/*
 * init_sched_build_groups() can't handle what we want to do with node
 * groups, so roll our own.  Each node gets its own dynamically allocated
 * list of groups.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group *sched_group_nodes[MAX_NUMNODES];

static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];

static int __devinit cpu_to_allnodes_group(int cpu)
{
        return cpu_to_node(cpu);
}
#endif

/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 */
void __devinit arch_init_sched_domains(void)
{
        int i;
        cpumask_t cpu_default_map;

        /*
         * Set up the mask of cpus without special case scheduling
         * requirements.  For now this just excludes isolated cpus, but could
         * be used to exclude other special cases in the future.
         */
        cpus_complement(cpu_default_map, cpu_isolated_map);
        cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);

        /*
         * Set up domains.  Isolated domains just stay on the dummy domain.
         */
        for_each_cpu_mask(i, cpu_default_map) {
                int group;
                struct sched_domain *sd = NULL, *p;
                cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));

                cpus_and(nodemask, nodemask, cpu_default_map);

#ifdef CONFIG_NUMA
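                /*
                 * When the machine has more CPUs than a single node-level
                 * domain can span (SD_NODES_PER_DOMAIN nodes' worth), add a
                 * top-level domain covering all non-isolated online CPUs.
                 */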
                if (num_online_cpus()
                                > SD_NODES_PER_DOMAIN * cpus_weight(nodemask)) {
                        sd = &per_cpu(allnodes_domains, i);
                        *sd = SD_ALLNODES_INIT;
                        sd->span = cpu_default_map;
                        group = cpu_to_allnodes_group(i);
                        sd->groups = &sched_group_allnodes[group];
                        p = sd;
                } else
                        p = NULL;

                sd = &per_cpu(node_domains, i);
                *sd = SD_NODE_INIT;
                sd->span = sched_domain_node_span(cpu_to_node(i));
                sd->parent = p;
                cpus_and(sd->span, sd->span, cpu_default_map);
#endif

                p = sd;
                sd = &per_cpu(phys_domains, i);
                group = cpu_to_phys_group(i);
                *sd = SD_CPU_INIT;
                sd->span = nodemask;
                sd->parent = p;
                sd->groups = &sched_group_phys[group];

#ifdef CONFIG_SCHED_SMT
                p = sd;
                sd = &per_cpu(cpu_domains, i);
                group = cpu_to_cpu_group(i);
                *sd = SD_SIBLING_INIT;
                sd->span = cpu_sibling_map[i];
                cpus_and(sd->span, sd->span, cpu_default_map);
                sd->parent = p;
                sd->groups = &sched_group_cpus[group];
#endif
        }

#ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
        for_each_cpu_mask(i, cpu_default_map) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
                cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
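                /* Build each sibling group set only once, from its first CPU */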
                if (i != first_cpu(this_sibling_map))
                        continue;

                init_sched_build_groups(sched_group_cpus, this_sibling_map,
                                                &cpu_to_cpu_group);
        }
#endif

        /* Set up physical groups */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);

                cpus_and(nodemask, nodemask, cpu_default_map);
                if (cpus_empty(nodemask))
                        continue;

                init_sched_build_groups(sched_group_phys, nodemask,
                                                &cpu_to_phys_group);
        }

#ifdef CONFIG_NUMA
        init_sched_build_groups(sched_group_allnodes, cpu_default_map,
                                &cpu_to_allnodes_group);

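        /*
         * Each node gets a circular list of sched_groups: the first group
         * covers the node's own CPUs and the rest cover one additional node
         * each from the node's domain span.
         */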
        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
                cpumask_t nodemask = node_to_cpumask(i);
                cpumask_t domainspan;
                cpumask_t covered = CPU_MASK_NONE;
                int j;

                cpus_and(nodemask, nodemask, cpu_default_map);
                if (cpus_empty(nodemask))
                        continue;

                domainspan = sched_domain_node_span(i);
                cpus_and(domainspan, domainspan, cpu_default_map);

                sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
                sched_group_nodes[i] = sg;
                for_each_cpu_mask(j, nodemask) {
                        struct sched_domain *sd;
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                        if (sd->groups == NULL) {
                                /* Turn off balancing if we have no groups */
                                sd->flags = 0;
                        }
                }
                if (!sg) {
                        printk(KERN_WARNING
                        "Can not alloc domain group for node %d\n", i);
                        continue;
                }
                sg->cpu_power = 0;
                sg->cpumask = nodemask;
                cpus_or(covered, covered, nodemask);
                prev = sg;

                for (j = 0; j < MAX_NUMNODES; j++) {
                        cpumask_t tmp, notcovered;
                        int n = (i + j) % MAX_NUMNODES;

                        cpus_complement(notcovered, covered);
                        cpus_and(tmp, notcovered, cpu_default_map);
                        cpus_and(tmp, tmp, domainspan);
                        if (cpus_empty(tmp))
                                break;

                        nodemask = node_to_cpumask(n);
                        cpus_and(tmp, tmp, nodemask);
                        if (cpus_empty(tmp))
                                continue;

                        sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
                        if (!sg) {
                                printk(KERN_WARNING
                                "Can not alloc domain group for node %d\n", n);
                                break;
                        }
                        sg->cpu_power = 0;
                        sg->cpumask = tmp;
                        cpus_or(covered, covered, tmp);
                        prev->next = sg;
                        prev = sg;
                }
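                /* Link the last group back to the first, closing the circle */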
                prev->next = sched_group_nodes[i];
        }
#endif

        /* Calculate CPU power for physical packages and nodes */
        for_each_cpu_mask(i, cpu_default_map) {
                int power;
                struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
                power = SCHED_LOAD_SCALE;
                sd->groups->cpu_power = power;
#endif

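                /*
                 * A physical package contributes SCHED_LOAD_SCALE plus 10%
                 * of SCHED_LOAD_SCALE for each additional CPU it contains.
                 */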
                sd = &per_cpu(phys_domains, i);
                power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                sd->groups->cpu_power = power;

#ifdef CONFIG_NUMA
                sd = &per_cpu(allnodes_domains, i);
                if (sd->groups) {
                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                        sd->groups->cpu_power = power;
                }
#endif
        }

#ifdef CONFIG_NUMA
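        /*
         * A node group's power is the sum of the power of each distinct
         * physical package in its span.
         */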
        for (i = 0; i < MAX_NUMNODES; i++) {
                struct sched_group *sg = sched_group_nodes[i];
                int j;

                if (sg == NULL)
                        continue;
next_sg:
                for_each_cpu_mask(j, sg->cpumask) {
                        struct sched_domain *sd;
                        int power;

                        sd = &per_cpu(phys_domains, j);
                        if (j != first_cpu(sd->groups->cpumask)) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
                                 */
                                continue;
                        }
                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;

                        sg->cpu_power += power;
                }
                sg = sg->next;
                if (sg != sched_group_nodes[i])
                        goto next_sg;
        }
#endif

        /* Attach the domains */
        for_each_online_cpu(i) {
                struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
#else
                sd = &per_cpu(phys_domains, i);
#endif
                cpu_attach_domain(sd, i);
        }
}

void __devinit arch_destroy_sched_domains(void)
{
#ifdef CONFIG_NUMA
        int i;
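
        /* Free each node's circular list of dynamically allocated groups */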
        for (i = 0; i < MAX_NUMNODES; i++) {
                struct sched_group *oldsg, *sg = sched_group_nodes[i];
                if (sg == NULL)
                        continue;
                sg = sg->next;
next_sg:
                oldsg = sg;
                sg = sg->next;
                kfree(oldsg);
                if (oldsg != sched_group_nodes[i])
                        goto next_sg;
                sched_group_nodes[i] = NULL;
        }
#endif
}