cpuidle: Single/Global registration of idle states
[pandora-kernel.git] / drivers / idle / intel_idle.c
1 /*
2  * intel_idle.c - native hardware idle loop for modern Intel processors
3  *
4  * Copyright (c) 2010, Intel Corporation.
5  * Len Brown <len.brown@intel.com>
6  *
7  * This program is free software; you can redistribute it and/or modify it
8  * under the terms and conditions of the GNU General Public License,
9  * version 2, as published by the Free Software Foundation.
10  *
11  * This program is distributed in the hope it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14  * more details.
15  *
16  * You should have received a copy of the GNU General Public License along with
17  * this program; if not, write to the Free Software Foundation, Inc.,
18  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19  */
20
21 /*
22  * intel_idle is a cpuidle driver that loads on specific Intel processors
23  * in lieu of the legacy ACPI processor_idle driver.  The intent is to
24  * make Linux more efficient on these processors, as intel_idle knows
25  * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs.
26  */
27
28 /*
29  * Design Assumptions
30  *
31  * All CPUs have same idle states as boot CPU
32  *
33  * Chipset BM_STS (bus master status) bit is a NOP
 *      for preventing entry into deep C-states
35  */
36
37 /*
38  * Known limitations
39  *
40  * The driver currently initializes for_each_online_cpu() upon modprobe.
 * It is unaware of subsequent processors hot-added to the system.
42  * This means that if you boot with maxcpus=n and later online
43  * processors above n, those processors will use C1 only.
44  *
 * ACPI has a .suspend hack to turn off deep c-states during suspend
46  * to avoid complications with the lapic timer workaround.
47  * Have not seen issues with suspend, but may need same workaround here.
48  *
49  * There is currently no kernel-based automatic probing/loading mechanism
50  * if the driver is built as a module.
51  */
52
53 /* un-comment DEBUG to enable pr_debug() statements */
54 #define DEBUG
55
56 #include <linux/kernel.h>
57 #include <linux/cpuidle.h>
58 #include <linux/clockchips.h>
59 #include <linux/hrtimer.h>      /* ktime_get_real() */
60 #include <trace/events/power.h>
61 #include <linux/sched.h>
62 #include <linux/notifier.h>
63 #include <linux/cpu.h>
64 #include <asm/mwait.h>
65 #include <asm/msr.h>
66
#define INTEL_IDLE_VERSION "0.4"
#define PREFIX "intel_idle: "

static struct cpuidle_driver intel_idle_driver = {
	.name = "intel_idle",
	.owner = THIS_MODULE,
};
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1;

/* CPUID.MWAIT EDX output: 4 bits of sub-state count per C-state */
static unsigned int mwait_substates;

#define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
/* Reliable LAPIC Timer States, bit 1 for C1 etc.  */
static unsigned int lapic_timer_reliable_states = (1 << 1);	/* Default to only C1 */

/* Per-cpu cpuidle devices, allocated in intel_idle_cpuidle_devices_init() */
static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
static int intel_idle(struct cpuidle_device *dev,
			struct cpuidle_driver *drv, int index);

/* C-state table for the detected CPU model; chosen by intel_idle_probe() */
static struct cpuidle_state *cpuidle_state_table;

/*
 * Hardware C-state auto-demotion may not always be optimal.
 * Indicate which enable bits to clear here.
 */
static unsigned long long auto_demotion_disable_flags;

/*
 * Set this flag for states where the HW flushes the TLB for us
 * and so we don't need cross-calls to keep it consistent.
 * If this flag is set, SW flushes the TLB, so even if the
 * HW doesn't do the flushing, this flag is safe to use.
 */
#define CPUIDLE_FLAG_TLB_FLUSHED	0x10000
102
/*
 * States are indexed by the cstate number,
 * which is also the index into the MWAIT hint array.
 * Thus C0 is a dummy.
 *
 * exit_latency and target_residency values are in microseconds.
 */
static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "C1-NHM",
		.desc = "MWAIT 0x00",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 3,
		.target_residency = 6,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "C3-NHM",
		.desc = "MWAIT 0x10",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 20,
		.target_residency = 80,
		.enter = &intel_idle },
	{ /* MWAIT C3 */
		.name = "C6-NHM",
		.desc = "MWAIT 0x20",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 200,
		.target_residency = 800,
		.enter = &intel_idle },
};
132
/* Sandy Bridge idle states, indexed by MWAIT C-state number (C0 is a dummy) */
static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "C1-SNB",
		.desc = "MWAIT 0x00",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "C3-SNB",
		.desc = "MWAIT 0x10",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 80,
		.target_residency = 211,
		.enter = &intel_idle },
	{ /* MWAIT C3 */
		.name = "C6-SNB",
		.desc = "MWAIT 0x20",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 104,
		.target_residency = 345,
		.enter = &intel_idle },
	{ /* MWAIT C4 */
		.name = "C7-SNB",
		.desc = "MWAIT 0x30",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 109,
		.target_residency = 345,
		.enter = &intel_idle },
};
164
/*
 * Atom idle states, indexed by MWAIT C-state number (C0 is a dummy).
 * The empty C3 and C5 slots have no .enter and are skipped at init time.
 */
static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "C1-ATM",
		.desc = "MWAIT 0x00",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 1,
		.target_residency = 4,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "C2-ATM",
		.desc = "MWAIT 0x10",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 20,
		.target_residency = 80,
		.enter = &intel_idle },
	{ /* MWAIT C3 */ },
	{ /* MWAIT C4 */
		.name = "C4-ATM",
		.desc = "MWAIT 0x30",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 100,
		.target_residency = 400,
		.enter = &intel_idle },
	{ /* MWAIT C5 */ },
	{ /* MWAIT C6 */
		.name = "C6-ATM",
		.desc = "MWAIT 0x52",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 140,
		.target_residency = 560,
		.enter = &intel_idle },
};
198
/*
 * get_driver_data - map a C-state number to its MWAIT hint (EAX value).
 * @cstate: MWAIT C-state number (1..6)
 *
 * Out-of-range values fall back to 0x00, the C1 hint, matching the
 * hints encoded in the .desc strings of the state tables above.
 */
static int get_driver_data(int cstate)
{
	static const int mwait_hint[] = {
		[1] = 0x00,	/* MWAIT C1 */
		[2] = 0x10,	/* MWAIT C2 */
		[3] = 0x20,	/* MWAIT C3 */
		[4] = 0x30,	/* MWAIT C4 */
		[5] = 0x40,	/* MWAIT C5 */
		[6] = 0x52,	/* MWAIT C6 */
	};

	if (cstate < 1 || cstate > 6)
		return 0x00;

	return mwait_hint[cstate];
}
227
/**
 * intel_idle
 * @dev: cpuidle_device
 * @drv: cpuidle driver
 * @index: index of cpuidle state
 *
 * Enter the MWAIT C-state whose hint was stashed in this state's
 * states_usage driver_data, measure the time spent idle, store it
 * in dev->last_residency (microseconds), and return @index.
 */
static int intel_idle(struct cpuidle_device *dev,
		struct cpuidle_driver *drv, int index)
{
	unsigned long ecx = 1; /* break on interrupt flag */
	struct cpuidle_state *state = &drv->states[index];
	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
	unsigned long eax = (unsigned long)cpuidle_get_statedata(state_usage);
	unsigned int cstate;
	ktime_t kt_before, kt_after;
	s64 usec_delta;
	int cpu = smp_processor_id();

	/* Recover the C-state number from the MWAIT hint's cstate field */
	cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;

	local_irq_disable();

	/*
	 * leave_mm() to avoid costly and often unnecessary wakeups
	 * for flushing the user TLB's associated with the active mm.
	 */
	if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
		leave_mm(cpu);

	/* LAPIC timer may stop in this state: arm the broadcast timer */
	if (!(lapic_timer_reliable_states & (1 << (cstate))))
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

	kt_before = ktime_get_real();

	stop_critical_timings();
	if (!need_resched()) {

		/* MONITOR the task flags so a wakeup breaks MWAIT */
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		/* re-check after arming the monitor to avoid a lost wakeup */
		if (!need_resched())
			__mwait(eax, ecx);
	}

	start_critical_timings();

	kt_after = ktime_get_real();
	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));

	local_irq_enable();

	if (!(lapic_timer_reliable_states & (1 << (cstate))))
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);

	/* Update cpuidle counters */
	dev->last_residency = (int)usec_delta;

	return index;
}
287
288 static void __setup_broadcast_timer(void *arg)
289 {
290         unsigned long reason = (unsigned long)arg;
291         int cpu = smp_processor_id();
292
293         reason = reason ?
294                 CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF;
295
296         clockevents_notify(reason, &cpu);
297 }
298
299 static int setup_broadcast_cpuhp_notify(struct notifier_block *n,
300                 unsigned long action, void *hcpu)
301 {
302         int hotcpu = (unsigned long)hcpu;
303
304         switch (action & 0xf) {
305         case CPU_ONLINE:
306                 smp_call_function_single(hotcpu, __setup_broadcast_timer,
307                         (void *)true, 1);
308                 break;
309         }
310         return NOTIFY_OK;
311 }
312
/* Hooks __setup_broadcast_timer() into CPU-online hotplug events */
static struct notifier_block setup_broadcast_notifier = {
	.notifier_call = setup_broadcast_cpuhp_notify,
};
316
317 static void auto_demotion_disable(void *dummy)
318 {
319         unsigned long long msr_bits;
320
321         rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
322         msr_bits &= ~auto_demotion_disable_flags;
323         wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
324 }
325
/*
 * intel_idle_probe()
 * Check that this is a supported Intel family-6 CPU with usable MWAIT,
 * pick the per-model C-state table, and set up the LAPIC broadcast
 * workaround if the APIC timer is not always reliable.
 * Returns 0 on success, -EPERM if disabled via max_cstate=0,
 * -ENODEV if the CPU is unsupported.
 */
static int intel_idle_probe(void)
{
	unsigned int eax, ebx, ecx;

	if (max_cstate == 0) {
		pr_debug(PREFIX "disabled\n");
		return -EPERM;
	}

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return -ENODEV;

	if (!boot_cpu_has(X86_FEATURE_MWAIT))
		return -ENODEV;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return -ENODEV;

	/* EDX output enumerates the sub-state counts per C-state */
	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);

	/* Need MWAIT extensions and break-on-interrupt capability */
	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
		!(ecx & CPUID5_ECX_INTERRUPT_BREAK))
			return -ENODEV;

	pr_debug(PREFIX "MWAIT substates: 0x%x\n", mwait_substates);


	if (boot_cpu_data.x86 != 6)	/* family 6 */
		return -ENODEV;

	switch (boot_cpu_data.x86_model) {

	case 0x1A:	/* Core i7, Xeon 5500 series */
	case 0x1E:	/* Core i7 and i5 Processor - Lynnfield Jasper Forest */
	case 0x1F:	/* Core i7 and i5 Processor - Nehalem */
	case 0x2E:	/* Nehalem-EX Xeon */
	case 0x2F:	/* Westmere-EX Xeon */
	case 0x25:	/* Westmere */
	case 0x2C:	/* Westmere */
		cpuidle_state_table = nehalem_cstates;
		auto_demotion_disable_flags =
			(NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE);
		break;

	case 0x1C:	/* 28 - Atom Processor */
		cpuidle_state_table = atom_cstates;
		break;

	case 0x26:	/* 38 - Lincroft Atom Processor */
		cpuidle_state_table = atom_cstates;
		auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE;
		break;

	case 0x2A:	/* SNB */
	case 0x2D:	/* SNB Xeon */
		cpuidle_state_table = snb_cstates;
		break;

	default:
		pr_debug(PREFIX "does not run on family %d model %d\n",
			boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}

	if (boot_cpu_has(X86_FEATURE_ARAT))	/* Always Reliable APIC Timer */
		lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
	else {
		/*
		 * LAPIC timer is unreliable in deep C-states: enable the
		 * broadcast timer on all current CPUs now and on any CPU
		 * that comes online later (via the hotplug notifier).
		 */
		smp_call_function(__setup_broadcast_timer, (void *)true, 1);
		register_cpu_notifier(&setup_broadcast_notifier);
	}

	pr_debug(PREFIX "v" INTEL_IDLE_VERSION
		" model 0x%X\n", boot_cpu_data.x86_model);

	pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n",
		lapic_timer_reliable_states);
	return 0;
}
407
408 /*
409  * intel_idle_cpuidle_devices_uninit()
410  * unregister, free cpuidle_devices
411  */
412 static void intel_idle_cpuidle_devices_uninit(void)
413 {
414         int i;
415         struct cpuidle_device *dev;
416
417         for_each_online_cpu(i) {
418                 dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);
419                 cpuidle_unregister_device(dev);
420         }
421
422         free_percpu(intel_idle_cpuidle_devices);
423         return;
424 }
/*
 * intel_idle_cpuidle_driver_init()
 * allocate, initialize cpuidle_states
 *
 * Copy into intel_idle_driver.states each state from the model table
 * that CPUID.MWAIT reports as implemented, capped at max_cstate.
 * States are packed, so a drv->states index need not equal its MWAIT
 * C-state number.  NOTE: the filtering here must stay in sync with
 * intel_idle_cpuidle_devices_init().
 */
static int intel_idle_cpuidle_driver_init(void)
{
	int cstate;
	struct cpuidle_driver *drv = &intel_idle_driver;

	drv->state_count = 1;	/* slot 0 is the C0 dummy */

	for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
		int num_substates;

		if (cstate > max_cstate) {
			printk(PREFIX "max_cstate %d reached\n",
				max_cstate);
			break;
		}

		/* does the state exist in CPUID.MWAIT? */
		num_substates = (mwait_substates >> ((cstate) * 4))
					& MWAIT_SUBSTATE_MASK;
		if (num_substates == 0)
			continue;
		/* is the state not enabled? */
		if (cpuidle_state_table[cstate].enter == NULL) {
			/* does the driver not know about the state? */
			if (*cpuidle_state_table[cstate].name == '\0')
				pr_debug(PREFIX "unaware of model 0x%x"
					" MWAIT %d please"
					" contact lenb@kernel.org",
				boot_cpu_data.x86_model, cstate);
			continue;
		}

		/* TSC may halt in C3 and deeper without NONSTOP_TSC */
		if ((cstate > 2) &&
			!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
			mark_tsc_unstable("TSC halts in idle"
					" states deeper than C2");

		drv->states[drv->state_count] =	/* structure copy */
			cpuidle_state_table[cstate];

		drv->state_count += 1;
	}

	if (auto_demotion_disable_flags)
		smp_call_function(auto_demotion_disable, NULL, 1);

	return 0;
}
477
478
/*
 * intel_idle_cpuidle_devices_init()
 * allocate, initialize, register cpuidle_devices
 *
 * Mirrors the state filtering in intel_idle_cpuidle_driver_init() so
 * that each per-cpu device stores the MWAIT hint (driver_data) for the
 * same packed set of states the driver registered.
 * Returns 0 on success, -ENOMEM or -EIO on failure.
 */
static int intel_idle_cpuidle_devices_init(void)
{
	int i, cstate;
	struct cpuidle_device *dev;

	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
	if (intel_idle_cpuidle_devices == NULL)
		return -ENOMEM;

	for_each_online_cpu(i) {
		dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);

		dev->state_count = 1;	/* slot 0 is the C0 dummy */

		for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
			int num_substates;

			if (cstate > max_cstate) {
				printk(PREFIX "max_cstate %d reached\n",
					max_cstate);
				break;
			}

			/* does the state exist in CPUID.MWAIT? */
			num_substates = (mwait_substates >> ((cstate) * 4))
						& MWAIT_SUBSTATE_MASK;
			if (num_substates == 0)
				continue;
			/* is the state not enabled? */
			if (cpuidle_state_table[cstate].enter == NULL) {
				continue;
			}

			/* stash the MWAIT hint for intel_idle() to retrieve */
			dev->states_usage[dev->state_count].driver_data =
				(void *)get_driver_data(cstate);

			dev->state_count += 1;
		}

		dev->cpu = i;
		if (cpuidle_register_device(dev)) {
			pr_debug(PREFIX "cpuidle_register_device %d failed!\n",
				 i);
			intel_idle_cpuidle_devices_uninit();
			return -EIO;
		}
	}

	return 0;
}
533
534
535 static int __init intel_idle_init(void)
536 {
537         int retval;
538
539         /* Do not load intel_idle at all for now if idle= is passed */
540         if (boot_option_idle_override != IDLE_NO_OVERRIDE)
541                 return -ENODEV;
542
543         retval = intel_idle_probe();
544         if (retval)
545                 return retval;
546
547         intel_idle_cpuidle_driver_init();
548         retval = cpuidle_register_driver(&intel_idle_driver);
549         if (retval) {
550                 printk(KERN_DEBUG PREFIX "intel_idle yielding to %s",
551                         cpuidle_get_driver()->name);
552                 return retval;
553         }
554
555         retval = intel_idle_cpuidle_devices_init();
556         if (retval) {
557                 cpuidle_unregister_driver(&intel_idle_driver);
558                 return retval;
559         }
560
561         return 0;
562 }
563
564 static void __exit intel_idle_exit(void)
565 {
566         intel_idle_cpuidle_devices_uninit();
567         cpuidle_unregister_driver(&intel_idle_driver);
568
569         if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) {
570                 smp_call_function(__setup_broadcast_timer, (void *)false, 1);
571                 unregister_cpu_notifier(&setup_broadcast_notifier);
572         }
573
574         return;
575 }
576
module_init(intel_idle_init);
module_exit(intel_idle_exit);

/* intel_idle.max_cstate=n caps the deepest C-state used (0 disables) */
module_param(max_cstate, int, 0444);

MODULE_AUTHOR("Len Brown <len.brown@intel.com>");
MODULE_DESCRIPTION("Cpuidle driver for Intel Hardware v" INTEL_IDLE_VERSION);
MODULE_LICENSE("GPL");