3aa8d4cb6dca699c88464834b3a953766b36b4f2
[linux-2.6.git] / drivers / idle / intel_idle.c
1 /*
2  * intel_idle.c - native hardware idle loop for modern Intel processors
3  *
4  * Copyright (c) 2010, Intel Corporation.
5  * Len Brown <len.brown@intel.com>
6  *
7  * This program is free software; you can redistribute it and/or modify it
8  * under the terms and conditions of the GNU General Public License,
9  * version 2, as published by the Free Software Foundation.
10  *
11  * This program is distributed in the hope it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14  * more details.
15  *
16  * You should have received a copy of the GNU General Public License along with
17  * this program; if not, write to the Free Software Foundation, Inc.,
18  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19  */
20
21 /*
22  * intel_idle is a cpuidle driver that loads on specific Intel processors
23  * in lieu of the legacy ACPI processor_idle driver.  The intent is to
24  * make Linux more efficient on these processors, as intel_idle knows
25  * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs.
26  */
27
28 /*
29  * Design Assumptions
30  *
31  * All CPUs have same idle states as boot CPU
32  *
33  * Chipset BM_STS (bus master status) bit is a NOP
 *      for preventing entry into deep C-states
35  */
36
37 /*
38  * Known limitations
39  *
40  * The driver currently initializes for_each_online_cpu() upon modprobe.
 * It is unaware of subsequent processors hot-added to the system.
42  * This means that if you boot with maxcpus=n and later online
43  * processors above n, those processors will use C1 only.
44  *
 * ACPI has a .suspend hack to turn off deep c-states during suspend
46  * to avoid complications with the lapic timer workaround.
47  * Have not seen issues with suspend, but may need same workaround here.
48  *
49  * There is currently no kernel-based automatic probing/loading mechanism
50  * if the driver is built as a module.
51  */
52
53 /* un-comment DEBUG to enable pr_debug() statements */
54 #define DEBUG
55
56 #include <linux/kernel.h>
57 #include <linux/cpuidle.h>
58 #include <linux/clockchips.h>
59 #include <linux/hrtimer.h>      /* ktime_get_real() */
60 #include <trace/events/power.h>
61 #include <linux/sched.h>
62 #include <linux/notifier.h>
63 #include <linux/cpu.h>
64 #include <asm/mwait.h>
65 #include <asm/msr.h>
66
67 #define INTEL_IDLE_VERSION "0.4"
68 #define PREFIX "intel_idle: "
69
/*
 * Driver object registered with the cpuidle core; .name shows up in
 * /sys/devices/system/cpu/cpuidle/current_driver.
 */
static struct cpuidle_driver intel_idle_driver = {
	.name = "intel_idle",
	.owner = THIS_MODULE,
};
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1;	/* deepest C-state allowed */

/* CPUID.MWAIT_LEAF EDX: one 4-bit sub-state count per C-state */
static unsigned int mwait_substates;

#define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
/* Reliable LAPIC Timer States, bit 1 for C1 etc.  */
static unsigned int lapic_timer_reliable_states = (1 << 1);	 /* Default to only C1 */

/* per-CPU devices, allocated in intel_idle_cpuidle_devices_init() */
static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
static int intel_idle(struct cpuidle_device *dev, int index);

/* per-model C-state table, selected by intel_idle_probe() */
static struct cpuidle_state *cpuidle_state_table;

/*
 * Hardware C-state auto-demotion may not always be optimal.
 * Indicate which enable bits to clear here.
 */
static unsigned long long auto_demotion_disable_flags;

/*
 * Set this flag for states where the HW flushes the TLB for us
 * and so we don't need cross-calls to keep it consistent.
 * If this flag is set, SW flushes the TLB, so even if the
 * HW doesn't do the flushing, this flag is safe to use.
 */
#define CPUIDLE_FLAG_TLB_FLUSHED	0x10000
101
/*
 * States are indexed by the cstate number,
 * which is also the index into the MWAIT hint array.
 * Thus C0 is a dummy.
 *
 * exit_latency and target_residency are in microseconds
 * (standard cpuidle units).
 */
static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "C1-NHM",
		.desc = "MWAIT 0x00",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 3,
		.target_residency = 6,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "C3-NHM",
		.desc = "MWAIT 0x10",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 20,
		.target_residency = 80,
		.enter = &intel_idle },
	{ /* MWAIT C3 */
		.name = "C6-NHM",
		.desc = "MWAIT 0x20",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 200,
		.target_residency = 800,
		.enter = &intel_idle },
};
131
/* Sandy Bridge C-state table; same layout rules as nehalem_cstates above. */
static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "C1-SNB",
		.desc = "MWAIT 0x00",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "C3-SNB",
		.desc = "MWAIT 0x10",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 80,
		.target_residency = 211,
		.enter = &intel_idle },
	{ /* MWAIT C3 */
		.name = "C6-SNB",
		.desc = "MWAIT 0x20",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 104,
		.target_residency = 345,
		.enter = &intel_idle },
	{ /* MWAIT C4 */
		.name = "C7-SNB",
		.desc = "MWAIT 0x30",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 109,
		.target_residency = 345,
		.enter = &intel_idle },
};
163
/*
 * Atom C-state table.  C3 and C5 are deliberately empty: this table is
 * indexed by MWAIT C-state number, and those states are not exported.
 */
static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "C1-ATM",
		.desc = "MWAIT 0x00",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 1,
		.target_residency = 4,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "C2-ATM",
		.desc = "MWAIT 0x10",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 20,
		.target_residency = 80,
		.enter = &intel_idle },
	{ /* MWAIT C3 */ },
	{ /* MWAIT C4 */
		.name = "C4-ATM",
		.desc = "MWAIT 0x30",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 100,
		.target_residency = 400,
		.enter = &intel_idle },
	{ /* MWAIT C5 */ },
	{ /* MWAIT C6 */
		.name = "C6-ATM",
		.desc = "MWAIT 0x52",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 140,
		.target_residency = 560,
		.enter = &intel_idle },
};
197
/*
 * get_driver_data - map a C-state number to its MWAIT hint (EAX) value.
 * @cstate: C-state number, 1..6
 *
 * Out-of-range values fall back to the C1 hint (0x00).
 * Note C6 uses sub-state 2, hence 0x52 rather than 0x50.
 */
static int get_driver_data(int cstate)
{
	static const int mwait_hint[] = {
		0x00,	/* C0 - unused dummy */
		0x00,	/* MWAIT C1 */
		0x10,	/* MWAIT C2 */
		0x20,	/* MWAIT C3 */
		0x30,	/* MWAIT C4 */
		0x40,	/* MWAIT C5 */
		0x52,	/* MWAIT C6 */
	};

	if (cstate < 1 || cstate > 6)
		return 0x00;

	return mwait_hint[cstate];
}
226
/**
 * intel_idle
 * @dev: cpuidle_device
 * @index: index of cpuidle state
 *
 * Enter the idle state selected by @index via MONITOR/MWAIT, then
 * report the measured residency (in usec) back through
 * dev->last_residency.  Interrupts are disabled around the MWAIT and
 * re-enabled before returning.
 */
static int intel_idle(struct cpuidle_device *dev, int index)
{
	unsigned long ecx = 1; /* break on interrupt flag */
	struct cpuidle_state *state = &dev->states[index];
	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
	/* driver_data carries the MWAIT hint (see get_driver_data()) */
	unsigned long eax = (unsigned long)cpuidle_get_statedata(state_usage);
	unsigned int cstate;
	ktime_t kt_before, kt_after;
	s64 usec_delta;
	int cpu = smp_processor_id();

	/* recover the C-state number from the hint's upper nibble */
	cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;

	local_irq_disable();

	/*
	 * leave_mm() to avoid costly and often unnecessary wakeups
	 * for flushing the user TLB's associated with the active mm.
	 */
	if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
		leave_mm(cpu);

	/* switch to broadcast timer if LAPIC timer stops in this state */
	if (!(lapic_timer_reliable_states & (1 << (cstate))))
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

	kt_before = ktime_get_real();

	stop_critical_timings();
	if (!need_resched()) {

		/*
		 * Re-check need_resched() after arming the monitor:
		 * closes the race with a wakeup arriving between the
		 * first check and the MWAIT.
		 */
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}

	start_critical_timings();

	kt_after = ktime_get_real();
	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));

	local_irq_enable();

	if (!(lapic_timer_reliable_states & (1 << (cstate))))
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);

	/* Update cpuidle counters */
	dev->last_residency = (int)usec_delta;

	return index;
}
284
285 static void __setup_broadcast_timer(void *arg)
286 {
287         unsigned long reason = (unsigned long)arg;
288         int cpu = smp_processor_id();
289
290         reason = reason ?
291                 CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF;
292
293         clockevents_notify(reason, &cpu);
294 }
295
296 static int setup_broadcast_cpuhp_notify(struct notifier_block *n,
297                 unsigned long action, void *hcpu)
298 {
299         int hotcpu = (unsigned long)hcpu;
300
301         switch (action & 0xf) {
302         case CPU_ONLINE:
303                 smp_call_function_single(hotcpu, __setup_broadcast_timer,
304                         (void *)true, 1);
305                 break;
306         }
307         return NOTIFY_OK;
308 }
309
/* hotplug notifier: re-arm broadcast setup on CPUs onlined after load */
static struct notifier_block setup_broadcast_notifier = {
	.notifier_call = setup_broadcast_cpuhp_notify,
};
313
/*
 * Clear the per-model auto-demotion enable bits
 * (auto_demotion_disable_flags) in MSR_NHM_SNB_PKG_CST_CFG_CTL on the
 * CPU this runs on.  Invoked via a cross-CPU function call.
 */
static void auto_demotion_disable(void *dummy)
{
	unsigned long long msr_bits;

	rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
	msr_bits &= ~auto_demotion_disable_flags;
	wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
}
322
323 /*
324  * intel_idle_probe()
325  */
326 static int intel_idle_probe(void)
327 {
328         unsigned int eax, ebx, ecx;
329
330         if (max_cstate == 0) {
331                 pr_debug(PREFIX "disabled\n");
332                 return -EPERM;
333         }
334
335         if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
336                 return -ENODEV;
337
338         if (!boot_cpu_has(X86_FEATURE_MWAIT))
339                 return -ENODEV;
340
341         if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
342                 return -ENODEV;
343
344         cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
345
346         if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
347                 !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
348                         return -ENODEV;
349
350         pr_debug(PREFIX "MWAIT substates: 0x%x\n", mwait_substates);
351
352
353         if (boot_cpu_data.x86 != 6)     /* family 6 */
354                 return -ENODEV;
355
356         switch (boot_cpu_data.x86_model) {
357
358         case 0x1A:      /* Core i7, Xeon 5500 series */
359         case 0x1E:      /* Core i7 and i5 Processor - Lynnfield Jasper Forest */
360         case 0x1F:      /* Core i7 and i5 Processor - Nehalem */
361         case 0x2E:      /* Nehalem-EX Xeon */
362         case 0x2F:      /* Westmere-EX Xeon */
363         case 0x25:      /* Westmere */
364         case 0x2C:      /* Westmere */
365                 cpuidle_state_table = nehalem_cstates;
366                 auto_demotion_disable_flags =
367                         (NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE);
368                 break;
369
370         case 0x1C:      /* 28 - Atom Processor */
371                 cpuidle_state_table = atom_cstates;
372                 break;
373
374         case 0x26:      /* 38 - Lincroft Atom Processor */
375                 cpuidle_state_table = atom_cstates;
376                 auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE;
377                 break;
378
379         case 0x2A:      /* SNB */
380         case 0x2D:      /* SNB Xeon */
381                 cpuidle_state_table = snb_cstates;
382                 break;
383
384         default:
385                 pr_debug(PREFIX "does not run on family %d model %d\n",
386                         boot_cpu_data.x86, boot_cpu_data.x86_model);
387                 return -ENODEV;
388         }
389
390         if (boot_cpu_has(X86_FEATURE_ARAT))     /* Always Reliable APIC Timer */
391                 lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
392         else {
393                 smp_call_function(__setup_broadcast_timer, (void *)true, 1);
394                 register_cpu_notifier(&setup_broadcast_notifier);
395         }
396
397         pr_debug(PREFIX "v" INTEL_IDLE_VERSION
398                 " model 0x%X\n", boot_cpu_data.x86_model);
399
400         pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n",
401                 lapic_timer_reliable_states);
402         return 0;
403 }
404
405 /*
406  * intel_idle_cpuidle_devices_uninit()
407  * unregister, free cpuidle_devices
408  */
409 static void intel_idle_cpuidle_devices_uninit(void)
410 {
411         int i;
412         struct cpuidle_device *dev;
413
414         for_each_online_cpu(i) {
415                 dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);
416                 cpuidle_unregister_device(dev);
417         }
418
419         free_percpu(intel_idle_cpuidle_devices);
420         return;
421 }
422 /*
423  * intel_idle_cpuidle_devices_init()
424  * allocate, initialize, register cpuidle_devices
425  */
426 static int intel_idle_cpuidle_devices_init(void)
427 {
428         int i, cstate;
429         struct cpuidle_device *dev;
430
431         intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
432         if (intel_idle_cpuidle_devices == NULL)
433                 return -ENOMEM;
434
435         for_each_online_cpu(i) {
436                 dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);
437
438                 dev->state_count = 1;
439
440                 for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
441                         int num_substates;
442
443                         if (cstate > max_cstate) {
444                                 printk(PREFIX "max_cstate %d reached\n",
445                                         max_cstate);
446                                 break;
447                         }
448
449                         /* does the state exist in CPUID.MWAIT? */
450                         num_substates = (mwait_substates >> ((cstate) * 4))
451                                                 & MWAIT_SUBSTATE_MASK;
452                         if (num_substates == 0)
453                                 continue;
454                         /* is the state not enabled? */
455                         if (cpuidle_state_table[cstate].enter == NULL) {
456                                 /* does the driver not know about the state? */
457                                 if (*cpuidle_state_table[cstate].name == '\0')
458                                         pr_debug(PREFIX "unaware of model 0x%x"
459                                                 " MWAIT %d please"
460                                                 " contact lenb@kernel.org",
461                                         boot_cpu_data.x86_model, cstate);
462                                 continue;
463                         }
464
465                         if ((cstate > 2) &&
466                                 !boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
467                                 mark_tsc_unstable("TSC halts in idle"
468                                         " states deeper than C2");
469
470                         dev->states[dev->state_count] = /* structure copy */
471                                 cpuidle_state_table[cstate];
472
473                         dev->states_usage[dev->state_count].driver_data =
474                                 (void *)get_driver_data(cstate);
475
476                         dev->state_count += 1;
477                 }
478
479                 dev->cpu = i;
480                 if (cpuidle_register_device(dev)) {
481                         pr_debug(PREFIX "cpuidle_register_device %d failed!\n",
482                                  i);
483                         intel_idle_cpuidle_devices_uninit();
484                         return -EIO;
485                 }
486         }
487         if (auto_demotion_disable_flags)
488                 smp_call_function(auto_demotion_disable, NULL, 1);
489
490         return 0;
491 }
492
493
494 static int __init intel_idle_init(void)
495 {
496         int retval;
497
498         /* Do not load intel_idle at all for now if idle= is passed */
499         if (boot_option_idle_override != IDLE_NO_OVERRIDE)
500                 return -ENODEV;
501
502         retval = intel_idle_probe();
503         if (retval)
504                 return retval;
505
506         retval = cpuidle_register_driver(&intel_idle_driver);
507         if (retval) {
508                 printk(KERN_DEBUG PREFIX "intel_idle yielding to %s",
509                         cpuidle_get_driver()->name);
510                 return retval;
511         }
512
513         retval = intel_idle_cpuidle_devices_init();
514         if (retval) {
515                 cpuidle_unregister_driver(&intel_idle_driver);
516                 return retval;
517         }
518
519         return 0;
520 }
521
522 static void __exit intel_idle_exit(void)
523 {
524         intel_idle_cpuidle_devices_uninit();
525         cpuidle_unregister_driver(&intel_idle_driver);
526
527         if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) {
528                 smp_call_function(__setup_broadcast_timer, (void *)false, 1);
529                 unregister_cpu_notifier(&setup_broadcast_notifier);
530         }
531
532         return;
533 }
534
module_init(intel_idle_init);
module_exit(intel_idle_exit);

/* read-only module parameter; 0 disables the driver (see definition) */
module_param(max_cstate, int, 0444);

MODULE_AUTHOR("Len Brown <len.brown@intel.com>");
MODULE_DESCRIPTION("Cpuidle driver for Intel Hardware v" INTEL_IDLE_VERSION);
MODULE_LICENSE("GPL");