oprofile/x86: protect cpu hotplug sections
[pandora-kernel.git] / arch/x86/oprofile/nmi_int.c
/**
 * @file nmi_int.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Robert Richter <robert.richter@amd.com>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Jason Yeh <jason.yeh@amd.com>
 * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/oprofile.h>
#include <linux/sysdev.h>
#include <linux/slab.h>
#include <linux/moduleparam.h>
#include <linux/kdebug.h>
#include <linux/cpu.h>
#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/apic.h>

#include "op_counter.h"
#include "op_x86_model.h"

static struct op_x86_model_spec *model;
static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
static DEFINE_PER_CPU(unsigned long, saved_lvtpc);

/* must be protected with get_online_cpus()/put_online_cpus(): */
static int nmi_enabled;
static int ctr_running;

struct op_counter_config counter_config[OP_MAX_COUNTER];

/* common functions */

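/*
 * Build the event-select MSR value for one counter from its user
 * configuration: interrupt enable, user/kernel mode bits, unit mask,
 * and the (possibly model-masked) event code split across the low and
 * extended event-select fields.
 */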
u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
                    struct op_counter_config *counter_config)
{
        u64 val = 0;
        u16 event = (u16)counter_config->event;

        val |= ARCH_PERFMON_EVENTSEL_INT;
        val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
        val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
        val |= (counter_config->unit_mask & 0xFF) << 8;
        event &= model->event_mask ? model->event_mask : 0xFF;
        val |= event & 0xFF;
        val |= (event & 0x0F00) << 24;

        return val;
}


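/*
 * NMI die-notifier callback: hand performance-counter NMIs to the
 * active model's check_ctrs() and claim them with NOTIFY_STOP so they
 * are not treated as unknown NMIs further down the chain.
 */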
static int profile_exceptions_notify(struct notifier_block *self,
                                     unsigned long val, void *data)
{
        struct die_args *args = (struct die_args *)data;
        int ret = NOTIFY_DONE;
        int cpu = smp_processor_id();

        switch (val) {
        case DIE_NMI:
        case DIE_NMI_IPI:
                model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
                ret = NOTIFY_STOP;
                break;
        default:
                break;
        }
        return ret;
}

static void nmi_cpu_save_registers(struct op_msrs *msrs)
{
        struct op_msr *counters = msrs->counters;
        struct op_msr *controls = msrs->controls;
        unsigned int i;

        for (i = 0; i < model->num_counters; ++i) {
                if (counters[i].addr)
                        rdmsrl(counters[i].addr, counters[i].saved);
        }

        for (i = 0; i < model->num_controls; ++i) {
                if (controls[i].addr)
                        rdmsrl(controls[i].addr, controls[i].saved);
        }
}

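/*
 * Per-CPU start/stop handlers and their system-wide wrappers.  The
 * wrappers run the per-CPU handler on every online CPU and update
 * ctr_running inside get_online_cpus()/put_online_cpus() so the state
 * cannot change underneath a concurrent CPU hotplug operation.
 */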
static void nmi_cpu_start(void *dummy)
{
        struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
        if (!msrs->controls)
                WARN_ON_ONCE(1);
        else
                model->start(msrs);
}

static int nmi_start(void)
{
        get_online_cpus();
        on_each_cpu(nmi_cpu_start, NULL, 1);
        ctr_running = 1;
        put_online_cpus();
        return 0;
}

static void nmi_cpu_stop(void *dummy)
{
        struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
        if (!msrs->controls)
                WARN_ON_ONCE(1);
        else
                model->stop(msrs);
}

static void nmi_stop(void)
{
        get_online_cpus();
        on_each_cpu(nmi_cpu_stop, NULL, 1);
        ctr_running = 0;
        put_online_cpus();
}

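/*
 * Event multiplexing: when more virtual counters are configured than
 * the hardware provides, the physical counters are periodically
 * rotated through the configured virtual counters.  switch_index
 * tracks, per CPU, which group of virtual counters is currently live.
 */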
#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX

static DEFINE_PER_CPU(int, switch_index);

static inline int has_mux(void)
{
        return !!model->switch_ctrl;
}

inline int op_x86_phys_to_virt(int phys)
{
        return __get_cpu_var(switch_index) + phys;
}

inline int op_x86_virt_to_phys(int virt)
{
        return virt % model->num_counters;
}

static void nmi_shutdown_mux(void)
{
        int i;

        if (!has_mux())
                return;

        for_each_possible_cpu(i) {
                kfree(per_cpu(cpu_msrs, i).multiplex);
                per_cpu(cpu_msrs, i).multiplex = NULL;
                per_cpu(switch_index, i) = 0;
        }
}

static int nmi_setup_mux(void)
{
        size_t multiplex_size =
                sizeof(struct op_msr) * model->num_virt_counters;
        int i;

        if (!has_mux())
                return 1;

        for_each_possible_cpu(i) {
                per_cpu(cpu_msrs, i).multiplex =
                        kzalloc(multiplex_size, GFP_KERNEL);
                if (!per_cpu(cpu_msrs, i).multiplex)
                        return 0;
        }

        return 1;
}

static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
{
        int i;
        struct op_msr *multiplex = msrs->multiplex;

        if (!has_mux())
                return;

        for (i = 0; i < model->num_virt_counters; ++i) {
                if (counter_config[i].enabled) {
                        multiplex[i].saved = -(u64)counter_config[i].count;
                } else {
                        multiplex[i].saved = 0;
                }
        }

        per_cpu(switch_index, cpu) = 0;
}

static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
{
        struct op_msr *counters = msrs->counters;
        struct op_msr *multiplex = msrs->multiplex;
        int i;

        for (i = 0; i < model->num_counters; ++i) {
                int virt = op_x86_phys_to_virt(i);
                if (counters[i].addr)
                        rdmsrl(counters[i].addr, multiplex[virt].saved);
        }
}

static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
{
        struct op_msr *counters = msrs->counters;
        struct op_msr *multiplex = msrs->multiplex;
        int i;

        for (i = 0; i < model->num_counters; ++i) {
                int virt = op_x86_phys_to_virt(i);
                if (counters[i].addr)
                        wrmsrl(counters[i].addr, multiplex[virt].saved);
        }
}

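/*
 * Rotate this CPU to the next group of virtual counters: stop the
 * counters, save the current group's counts, advance switch_index
 * (wrapping to 0 at the end or when the next slot is unused),
 * reprogram the control registers, restore the new group's counts and
 * restart.  Runs on each CPU via on_each_cpu() from nmi_switch_event().
 */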
static void nmi_cpu_switch(void *dummy)
{
        int cpu = smp_processor_id();
        int si = per_cpu(switch_index, cpu);
        struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);

        nmi_cpu_stop(NULL);
        nmi_cpu_save_mpx_registers(msrs);

        /* move to next set */
        si += model->num_counters;
        if ((si >= model->num_virt_counters) || (counter_config[si].count == 0))
                per_cpu(switch_index, cpu) = 0;
        else
                per_cpu(switch_index, cpu) = si;

        model->switch_ctrl(model, msrs);
        nmi_cpu_restore_mpx_registers(msrs);

        nmi_cpu_start(NULL);
}


/*
 * Quick check to see if multiplexing is necessary.
 * The check should be sufficient since counters are used
 * in order.
 */
static int nmi_multiplex_on(void)
{
        return counter_config[model->num_counters].count ? 0 : -EINVAL;
}

static int nmi_switch_event(void)
{
        if (!has_mux())
                return -ENOSYS;         /* not implemented */
        if (nmi_multiplex_on() < 0)
                return -EINVAL;         /* not necessary */

        get_online_cpus();
        if (ctr_running)
                on_each_cpu(nmi_cpu_switch, NULL, 1);
        put_online_cpus();

        return 0;
}

static inline void mux_init(struct oprofile_operations *ops)
{
        if (has_mux())
                ops->switch_events = nmi_switch_event;
}

static void mux_clone(int cpu)
{
        if (!has_mux())
                return;

        memcpy(per_cpu(cpu_msrs, cpu).multiplex,
               per_cpu(cpu_msrs, 0).multiplex,
               sizeof(struct op_msr) * model->num_virt_counters);
}

#else

inline int op_x86_phys_to_virt(int phys) { return phys; }
inline int op_x86_virt_to_phys(int virt) { return virt; }
static inline void nmi_shutdown_mux(void) { }
static inline int nmi_setup_mux(void) { return 1; }
static inline void
nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { }
static inline void mux_init(struct oprofile_operations *ops) { }
static void mux_clone(int cpu) { }

#endif

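/*
 * Allocation and teardown of the per-CPU MSR shadow buffers.  Buffers
 * are allocated for every possible CPU so that a CPU which comes
 * online later already has storage for its counter and control state.
 */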
static void free_msrs(void)
{
        int i;
        for_each_possible_cpu(i) {
                kfree(per_cpu(cpu_msrs, i).counters);
                per_cpu(cpu_msrs, i).counters = NULL;
                kfree(per_cpu(cpu_msrs, i).controls);
                per_cpu(cpu_msrs, i).controls = NULL;
        }
        nmi_shutdown_mux();
}

static int allocate_msrs(void)
{
        size_t controls_size = sizeof(struct op_msr) * model->num_controls;
        size_t counters_size = sizeof(struct op_msr) * model->num_counters;

        int i;
        for_each_possible_cpu(i) {
                per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
                                                        GFP_KERNEL);
                if (!per_cpu(cpu_msrs, i).counters)
                        goto fail;
                per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
                                                        GFP_KERNEL);
                if (!per_cpu(cpu_msrs, i).controls)
                        goto fail;
        }

        if (!nmi_setup_mux())
                goto fail;

        return 1;

fail:
        free_msrs();
        return 0;
}

static void nmi_cpu_setup(void *dummy)
{
        int cpu = smp_processor_id();
        struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
        nmi_cpu_save_registers(msrs);
        spin_lock(&oprofilefs_lock);
        model->setup_ctrs(model, msrs);
        nmi_cpu_setup_mux(cpu, msrs);
        spin_unlock(&oprofilefs_lock);
        per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
        apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static struct notifier_block profile_exceptions_nb = {
        .notifier_call = profile_exceptions_notify,
        .next = NULL,
        .priority = 2
};

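/*
 * Bring profiling up: allocate the per-CPU MSR buffers, resolve the
 * counter/control MSR addresses once on CPU 0 and copy them to every
 * other CPU, register the NMI die notifier, and finally program each
 * online CPU under get_online_cpus() so a hotplug operation cannot
 * race with the setup.
 */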
static int nmi_setup(void)
{
        int err = 0;
        int cpu;

        if (!allocate_msrs())
                return -ENOMEM;

        /* We need to serialize save and setup for HT because the subsets
         * of MSRs are distinct for the save and setup operations.
         */

        /* Assume saved/restored counters are the same on all CPUs */
        err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
        if (err)
                goto fail;

        for_each_possible_cpu(cpu) {
                if (!cpu)
                        continue;

                memcpy(per_cpu(cpu_msrs, cpu).counters,
                       per_cpu(cpu_msrs, 0).counters,
                       sizeof(struct op_msr) * model->num_counters);

                memcpy(per_cpu(cpu_msrs, cpu).controls,
                       per_cpu(cpu_msrs, 0).controls,
                       sizeof(struct op_msr) * model->num_controls);

                mux_clone(cpu);
        }

        err = register_die_notifier(&profile_exceptions_nb);
        if (err)
                goto fail;

        get_online_cpus();
        on_each_cpu(nmi_cpu_setup, NULL, 1);
        nmi_enabled = 1;
        put_online_cpus();

        return 0;
fail:
        free_msrs();
        return err;
}

static void nmi_cpu_restore_registers(struct op_msrs *msrs)
{
        struct op_msr *counters = msrs->counters;
        struct op_msr *controls = msrs->controls;
        unsigned int i;

        for (i = 0; i < model->num_controls; ++i) {
                if (controls[i].addr)
                        wrmsrl(controls[i].addr, controls[i].saved);
        }

        for (i = 0; i < model->num_counters; ++i) {
                if (counters[i].addr)
                        wrmsrl(counters[i].addr, counters[i].saved);
        }
}

static void nmi_cpu_shutdown(void *dummy)
{
        unsigned int v;
        int cpu = smp_processor_id();
        struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);

        /* Restoring APIC_LVTPC can trigger an APIC error because the delivery
         * mode and vector number combination can be illegal. That's by design:
         * on power-on the APIC LVT entries contain a zero vector number, which
         * is legal only for NMI delivery mode. So mask APIC errors before
         * restoring the LVTPC.
         */
        v = apic_read(APIC_LVTERR);
        apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
        apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
        apic_write(APIC_LVTERR, v);
        nmi_cpu_restore_registers(msrs);
}

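/*
 * Tear profiling down: restore the LVTPC and counter/control MSRs on
 * every online CPU and clear nmi_enabled/ctr_running under
 * get_online_cpus(), then unregister the die notifier and release the
 * per-CPU buffers.
 */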
static void nmi_shutdown(void)
{
        struct op_msrs *msrs;

        get_online_cpus();
        on_each_cpu(nmi_cpu_shutdown, NULL, 1);
        nmi_enabled = 0;
        ctr_running = 0;
        put_online_cpus();
        unregister_die_notifier(&profile_exceptions_nb);
        msrs = &get_cpu_var(cpu_msrs);
        model->shutdown(msrs);
        free_msrs();
        put_cpu_var(cpu_msrs);
}

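/*
 * CPU hotplug helpers: bring a newly onlined CPU to the same state as
 * the rest of the system (set up and, if profiling is running, start
 * its counters), and undo that before a CPU goes offline.
 */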
static void nmi_cpu_up(void *dummy)
{
        if (nmi_enabled)
                nmi_cpu_setup(dummy);
        if (ctr_running)
                nmi_cpu_start(dummy);
}

static void nmi_cpu_down(void *dummy)
{
        if (ctr_running)
                nmi_cpu_stop(dummy);
        if (nmi_enabled)
                nmi_cpu_shutdown(dummy);
}

static int nmi_create_files(struct super_block *sb, struct dentry *root)
{
        unsigned int i;

        for (i = 0; i < model->num_virt_counters; ++i) {
                struct dentry *dir;
                char buf[4];

                /* quick little hack to _not_ expose a counter if it is not
                 * available for use.  This should protect userspace apps.
                 * NOTE:  assumes a 1:1 mapping here (that counters are organized
                 *        sequentially in their struct assignment).
                 */
                if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i)))
                        continue;

                snprintf(buf,  sizeof(buf), "%d", i);
                dir = oprofilefs_mkdir(sb, root, buf);
                oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
                oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
                oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
                oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
                oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
                oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
        }

        return 0;
}

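/*
 * CPU hotplug notifier.  A CPU coming online (or failing to go down)
 * is brought up asynchronously; a CPU about to go offline is torn
 * down synchronously, since it must stop using the counters before it
 * disappears.
 */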
static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
                                 void *data)
{
        int cpu = (unsigned long)data;
        switch (action) {
        case CPU_DOWN_FAILED:
        case CPU_ONLINE:
                smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
                break;
        case CPU_DOWN_PREPARE:
                smp_call_function_single(cpu, nmi_cpu_down, NULL, 1);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block oprofile_cpu_nb = {
        .notifier_call = oprofile_cpu_notifier
};

#ifdef CONFIG_PM

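/*
 * Power management hooks.  These run on the sysdev suspend/resume
 * path, after the non-boot CPUs have been taken offline, so acting on
 * the local CPU's counters is sufficient.
 */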
static int nmi_suspend(struct sys_device *dev, pm_message_t state)
{
        /* Only one CPU left, just stop that one */
        if (nmi_enabled == 1)
                nmi_cpu_stop(NULL);
        return 0;
}

static int nmi_resume(struct sys_device *dev)
{
        if (nmi_enabled == 1)
                nmi_cpu_start(NULL);
        return 0;
}

static struct sysdev_class oprofile_sysclass = {
        .name           = "oprofile",
        .resume         = nmi_resume,
        .suspend        = nmi_suspend,
};

static struct sys_device device_oprofile = {
        .id     = 0,
        .cls    = &oprofile_sysclass,
};

static int __init init_sysfs(void)
{
        int error;

        error = sysdev_class_register(&oprofile_sysclass);
        if (!error)
                error = sysdev_register(&device_oprofile);
        return error;
}

static void exit_sysfs(void)
{
        sysdev_unregister(&device_oprofile);
        sysdev_class_unregister(&oprofile_sysclass);
}

#else
#define init_sysfs() do { } while (0)
#define exit_sysfs() do { } while (0)
#endif /* CONFIG_PM */

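/*
 * Pick the Pentium 4 model.  With SMP enabled the choice depends on
 * the number of hardware threads per core; more than two threads is
 * not supported and oprofile falls back to timer mode.
 */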
static int __init p4_init(char **cpu_type)
{
        __u8 cpu_model = boot_cpu_data.x86_model;

        if (cpu_model > 6 || cpu_model == 5)
                return 0;

#ifndef CONFIG_SMP
        *cpu_type = "i386/p4";
        model = &op_p4_spec;
        return 1;
#else
        switch (smp_num_siblings) {
        case 1:
                *cpu_type = "i386/p4";
                model = &op_p4_spec;
                return 1;

        case 2:
                *cpu_type = "i386/p4-ht";
                model = &op_p4_ht2_spec;
                return 1;
        }
#endif

        printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n");
        printk(KERN_INFO "oprofile: Reverting to timer mode.\n");
        return 0;
}

static int force_arch_perfmon;
static int force_cpu_type(const char *str, struct kernel_param *kp)
{
        if (!strcmp(str, "arch_perfmon")) {
                force_arch_perfmon = 1;
                printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
        }

        return 0;
}
module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);

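/*
 * Pick the P6-family model from the CPU model number.  If the user
 * forced arch_perfmon via the cpu_type module parameter and the CPU
 * supports it, return 0 so the generic architectural perfmon fallback
 * is used instead.
 */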
static int __init ppro_init(char **cpu_type)
{
        __u8 cpu_model = boot_cpu_data.x86_model;
        struct op_x86_model_spec *spec = &op_ppro_spec; /* default */

        if (force_arch_perfmon && cpu_has_arch_perfmon)
                return 0;

        switch (cpu_model) {
        case 0 ... 2:
                *cpu_type = "i386/ppro";
                break;
        case 3 ... 5:
                *cpu_type = "i386/pii";
                break;
        case 6 ... 8:
        case 10 ... 11:
                *cpu_type = "i386/piii";
                break;
        case 9:
        case 13:
                *cpu_type = "i386/p6_mobile";
                break;
        case 14:
                *cpu_type = "i386/core";
                break;
        case 15: case 23:
                *cpu_type = "i386/core_2";
                break;
        case 0x2e:
        case 26:
                spec = &op_arch_perfmon_spec;
                *cpu_type = "i386/core_i7";
                break;
        case 28:
                *cpu_type = "i386/atom";
                break;
        default:
                /* Unknown */
                return 0;
        }

        model = spec;
        return 1;
}

/* in order to get sysfs right */
static int using_nmi;

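/*
 * Main entry point: select the vendor/family specific model, register
 * the CPU hotplug notifier (with nmi_enabled/ctr_running cleared under
 * get_online_cpus()), and wire the NMI callbacks into the generic
 * oprofile_operations.
 */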
int __init op_nmi_init(struct oprofile_operations *ops)
{
        __u8 vendor = boot_cpu_data.x86_vendor;
        __u8 family = boot_cpu_data.x86;
        char *cpu_type = NULL;
        int ret = 0;

        if (!cpu_has_apic)
                return -ENODEV;

        switch (vendor) {
        case X86_VENDOR_AMD:
                /* Needs to be at least an Athlon (or hammer in 32bit mode) */

                switch (family) {
                case 6:
                        cpu_type = "i386/athlon";
                        break;
                case 0xf:
                        /*
                         * Actually it could be i386/hammer too, but
                         * give user space a consistent name.
                         */
                        cpu_type = "x86-64/hammer";
                        break;
                case 0x10:
                        cpu_type = "x86-64/family10";
                        break;
                case 0x11:
                        cpu_type = "x86-64/family11h";
                        break;
                default:
                        return -ENODEV;
                }
                model = &op_amd_spec;
                break;

        case X86_VENDOR_INTEL:
                switch (family) {
                        /* Pentium IV */
                case 0xf:
                        p4_init(&cpu_type);
                        break;

                        /* A P6-class processor */
                case 6:
                        ppro_init(&cpu_type);
                        break;

                default:
                        break;
                }

                if (cpu_type)
                        break;

                if (!cpu_has_arch_perfmon)
                        return -ENODEV;

                /* use arch perfmon as fallback */
                cpu_type = "i386/arch_perfmon";
                model = &op_arch_perfmon_spec;
                break;

        default:
                return -ENODEV;
        }

        get_online_cpus();
        register_cpu_notifier(&oprofile_cpu_nb);
        nmi_enabled = 0;
        ctr_running = 0;
        put_online_cpus();

        /* default values, can be overwritten by model */
        ops->create_files       = nmi_create_files;
        ops->setup              = nmi_setup;
        ops->shutdown           = nmi_shutdown;
        ops->start              = nmi_start;
        ops->stop               = nmi_stop;
        ops->cpu_type           = cpu_type;

        if (model->init)
                ret = model->init(ops);
        if (ret)
                return ret;

        if (!model->num_virt_counters)
                model->num_virt_counters = model->num_counters;

        mux_init(ops);

        init_sysfs();
        using_nmi = 1;
        printk(KERN_INFO "oprofile: using NMI interrupt.\n");
        return 0;
}

void op_nmi_exit(void)
{
        if (using_nmi) {
                exit_sysfs();
                get_online_cpus();
                unregister_cpu_notifier(&oprofile_cpu_nb);
                nmi_enabled = 0;
                ctr_running = 0;
                put_online_cpus();
        }
        if (model->exit)
                model->exit();
}