arch/x86/kernel/cpu/perf_counter.c

   1 /*
   2  * Performance counter x86 architecture code
   3  *
   4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
   6  *
   7  *  For licencing details see kernel-base/COPYING
   8  */
   9
  10 #include <linux/perf_counter.h>
  11 #include <linux/capability.h>
  12 #include <linux/notifier.h>
  13 #include <linux/hardirq.h>
  14 #include <linux/kprobes.h>
  15 #include <linux/module.h>
  16 #include <linux/kdebug.h>
  17 #include <linux/sched.h>
  18
  19 #include <asm/intel_arch_perfmon.h>
  20 #include <asm/apic.h>
  21
  22 static bool perf_counters_initialized __read_mostly;
  23
  24 /*
  25  * Number of (generic) HW counters:
  26  */
  27 static int nr_hw_counters __read_mostly;
  28 static u32 perf_counter_mask __read_mostly;
  29
  30 /* No support for fixed function counters yet */
  31
  32 #define MAX_HW_COUNTERS         8
  33
  34 struct cpu_hw_counters {
  35         struct perf_counter     *counters[MAX_HW_COUNTERS];
  36         unsigned long           used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
  37 };
  38
  39 /*
  40  * Intel PerfMon v3. Used on Core2 and later.
  41  */
  42 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
  43
  44 const int intel_perfmon_event_map[] =
  45 {
  46   [PERF_COUNT_CYCLES]                   = 0x003c,
  47   [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  48   [PERF_COUNT_CACHE_REFERENCES]         = 0x4f2e,
  49   [PERF_COUNT_CACHE_MISSES]             = 0x412e,
  50   [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  51   [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
  52 };
  53
  54 const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
  55
  56 /*
  57  * Propagate counter elapsed time into the generic counter.
  58  * Can only be executed on the CPU where the counter is active.
  59  * Returns the delta events processed.
  60  */
  61 static void
  62 x86_perf_counter_update(struct perf_counter *counter,
  63                         struct hw_perf_counter *hwc, int idx)
  64 {
  65         u64 prev_raw_count, new_raw_count, delta;
  66
  67         WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
  68         /*
  69          * Careful: an NMI might modify the previous counter value.
  70          *
  71          * Our tactic to handle this is to first atomically read and
  72          * exchange a new raw count - then add that new-prev delta
  73          * count to the generic counter atomically:
  74          */
  75 again:
  76         prev_raw_count = atomic64_read(&hwc->prev_count);
  77         rdmsrl(hwc->counter_base + idx, new_raw_count);
  78
  79         if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
  80                                         new_raw_count) != prev_raw_count)
  81                 goto again;
  82
  83         /*
  84          * Now we have the new raw value and have updated the prev
  85          * timestamp already. We can now calculate the elapsed delta
  86          * (counter-)time and add that to the generic counter.
  87          *
  88          * Careful, not all hw sign-extends above the physical width
  89          * of the count, so we do that by clipping the delta to 32 bits:
  90          */
  91         delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
  92         WARN_ON_ONCE((int)delta < 0);
  93
  94         atomic64_add(delta, &counter->count);
  95         atomic64_sub(delta, &hwc->period_left);
  96 }
  97
  98 /*
  99  * Setup the hardware configuration for a given hw_event_type
 100  */
 101 static int __hw_perf_counter_init(struct perf_counter *counter)
 102 {
 103         struct perf_counter_hw_event *hw_event = &counter->hw_event;
 104         struct hw_perf_counter *hwc = &counter->hw;
 105
 106         if (unlikely(!perf_counters_initialized))
 107                 return -EINVAL;
 108
 109         /*
 110          * Count user events, and generate PMC IRQs:
 111          * (keep 'enabled' bit clear for now)
 112          */
 113         hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
 114
 115         /*
 116          * If privileged enough, count OS events too, and allow
 117          * NMI events as well:
 118          */
 119         hwc->nmi = 0;
 120         if (capable(CAP_SYS_ADMIN)) {
 121                 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 122                 if (hw_event->nmi)
 123                         hwc->nmi = 1;
 124         }
 125
 126         hwc->config_base        = MSR_ARCH_PERFMON_EVENTSEL0;
 127         hwc->counter_base       = MSR_ARCH_PERFMON_PERFCTR0;
 128
 129         hwc->irq_period         = hw_event->irq_period;
 130         /*
 131          * Intel PMCs cannot be accessed sanely above 32 bit width,
 132          * so we install an artificial 1<<31 period regardless of
 133          * the generic counter period:
 134          */
 135         if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
 136                 hwc->irq_period = 0x7FFFFFFF;
 137
 138         atomic64_set(&hwc->period_left, hwc->irq_period);
 139
 140         /*
 141          * Raw event type provide the config in the event structure
 142          */
 143         if (hw_event->raw) {
 144                 hwc->config |= hw_event->type;
 145         } else {
 146                 if (hw_event->type >= max_intel_perfmon_events)
 147                         return -EINVAL;
 148                 /*
 149                  * The generic map:
 150                  */
 151                 hwc->config |= intel_perfmon_event_map[hw_event->type];
 152         }
 153         counter->wakeup_pending = 0;
 154
 155         return 0;
 156 }
 157
 158 void hw_perf_enable_all(void)
 159 {
 160         if (unlikely(!perf_counters_initialized))
 161                 return;
 162
 163         wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 164 }
 165
 166 u64 hw_perf_save_disable(void)
 167 {
 168         u64 ctrl;
 169
 170         if (unlikely(!perf_counters_initialized))
 171                 return 0;
 172
 173         rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 174         wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 175
 176         return ctrl;
 177 }
 178 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 179
 180 void hw_perf_restore(u64 ctrl)
 181 {
 182         if (unlikely(!perf_counters_initialized))
 183                 return;
 184
 185         wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
 186 }
 187 EXPORT_SYMBOL_GPL(hw_perf_restore);
 188
 189 static inline void
 190 __x86_perf_counter_disable(struct perf_counter *counter,
 191                            struct hw_perf_counter *hwc, unsigned int idx)
 192 {
 193         int err;
 194
 195         err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 196         WARN_ON_ONCE(err);
 197 }
 198
 199 static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
 200
 201 /*
 202  * Set the next IRQ period, based on the hwc->period_left value.
 203  * To be called with the counter disabled in hw:
 204  */
 205 static void
 206 __hw_perf_counter_set_period(struct perf_counter *counter,
 207                              struct hw_perf_counter *hwc, int idx)
 208 {
 209         s32 left = atomic64_read(&hwc->period_left);
 210         s32 period = hwc->irq_period;
 211
 212         WARN_ON_ONCE(period <= 0);
 213
 214         /*
 215          * If we are way outside a reasoable range then just skip forward:
 216          */
 217         if (unlikely(left <= -period)) {
 218                 left = period;
 219                 atomic64_set(&hwc->period_left, left);
 220         }
 221
 222         if (unlikely(left <= 0)) {
 223                 left += period;
 224                 atomic64_set(&hwc->period_left, left);
 225         }
 226
 227         WARN_ON_ONCE(left <= 0);
 228
 229         per_cpu(prev_left[idx], smp_processor_id()) = left;
 230
 231         /*
 232          * The hw counter starts counting from this counter offset,
 233          * mark it to be able to extra future deltas:
 234          */
 235         atomic64_set(&hwc->prev_count, (u64)(s64)-left);
 236
 237         wrmsr(hwc->counter_base + idx, -left, 0);
 238 }
 239
 240 static void
 241 __x86_perf_counter_enable(struct perf_counter *counter,
 242                           struct hw_perf_counter *hwc, int idx)
 243 {
 244         wrmsr(hwc->config_base + idx,
 245               hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 246 }
 247
 248 /*
 249  * Find a PMC slot for the freshly enabled / scheduled in counter:
 250  */
 251 static void x86_perf_counter_enable(struct perf_counter *counter)
 252 {
 253         struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 254         struct hw_perf_counter *hwc = &counter->hw;
 255         int idx = hwc->idx;
 256
 257         /* Try to get the previous counter again */
 258         if (test_and_set_bit(idx, cpuc->used)) {
 259                 idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
 260                 set_bit(idx, cpuc->used);
 261                 hwc->idx = idx;
 262         }
 263
 264         perf_counters_lapic_init(hwc->nmi);
 265
 266         __x86_perf_counter_disable(counter, hwc, idx);
 267
 268         cpuc->counters[idx] = counter;
 269
 270         __hw_perf_counter_set_period(counter, hwc, idx);
 271         __x86_perf_counter_enable(counter, hwc, idx);
 272 }
 273
 274 void perf_counter_print_debug(void)
 275 {
 276         u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
 277         int cpu, idx;
 278
 279         if (!nr_hw_counters)
 280                 return;
 281
 282         local_irq_disable();
 283
 284         cpu = smp_processor_id();
 285
 286         rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 287         rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 288         rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 289
 290         printk(KERN_INFO "\n");
 291         printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 292         printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 293         printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 294
 295         for (idx = 0; idx < nr_hw_counters; idx++) {
 296                 rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 297                 rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 298
 299                 prev_left = per_cpu(prev_left[idx], cpu);
 300
 301                 printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
 302                         cpu, idx, pmc_ctrl);
 303                 printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
 304                         cpu, idx, pmc_count);
 305                 printk(KERN_INFO "CPU#%d: PMC%d left:  %016llx\n",
 306                         cpu, idx, prev_left);
 307         }
 308         local_irq_enable();
 309 }
 310
 311 static void x86_perf_counter_disable(struct perf_counter *counter)
 312 {
 313         struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 314         struct hw_perf_counter *hwc = &counter->hw;
 315         unsigned int idx = hwc->idx;
 316
 317         __x86_perf_counter_disable(counter, hwc, idx);
 318
 319         clear_bit(idx, cpuc->used);
 320         cpuc->counters[idx] = NULL;
 321
 322         /*
 323          * Drain the remaining delta count out of a counter
 324          * that we are disabling:
 325          */
 326         x86_perf_counter_update(counter, hwc, idx);
 327 }
 328
 329 static void perf_store_irq_data(struct perf_counter *counter, u64 data)
 330 {
 331         struct perf_data *irqdata = counter->irqdata;
 332
 333         if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
 334                 irqdata->overrun++;
 335         } else {
 336                 u64 *p = (u64 *) &irqdata->data[irqdata->len];
 337
 338                 *p = data;
 339                 irqdata->len += sizeof(u64);
 340         }
 341 }
 342
 343 /*
 344  * Save and restart an expired counter. Called by NMI contexts,
 345  * so it has to be careful about preempting normal counter ops:
 346  */
 347 static void perf_save_and_restart(struct perf_counter *counter)
 348 {
 349         struct hw_perf_counter *hwc = &counter->hw;
 350         int idx = hwc->idx;
 351         u64 pmc_ctrl;
 352
 353         rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 354
 355         x86_perf_counter_update(counter, hwc, idx);
 356         __hw_perf_counter_set_period(counter, hwc, idx);
 357
 358         if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
 359                 __x86_perf_counter_enable(counter, hwc, idx);
 360 }
 361
 362 static void
 363 perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 364 {
 365         struct perf_counter *counter, *group_leader = sibling->group_leader;
 366
 367         /*
 368          * Store sibling timestamps (if any):
 369          */
 370         list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
 371                 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 372                 perf_store_irq_data(sibling, counter->hw_event.type);
 373                 perf_store_irq_data(sibling, atomic64_read(&counter->count));
 374         }
 375 }
 376
 377 /*
 378  * This handler is triggered by the local APIC, so the APIC IRQ handling
 379  * rules apply:
 380  */
 381 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 382 {
 383         int bit, cpu = smp_processor_id();
 384         u64 ack, status, saved_global;
 385         struct cpu_hw_counters *cpuc;
 386
 387         rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 388
 389         /* Disable counters globally */
 390         wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 391         ack_APIC_irq();
 392
 393         cpuc = &per_cpu(cpu_hw_counters, cpu);
 394
 395         rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 396         if (!status)
 397                 goto out;
 398
 399 again:
 400         ack = status;
 401         for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
 402                 struct perf_counter *counter = cpuc->counters[bit];
 403
 404                 clear_bit(bit, (unsigned long *) &status);
 405                 if (!counter)
 406                         continue;
 407
 408                 perf_save_and_restart(counter);
 409
 410                 switch (counter->hw_event.record_type) {
 411                 case PERF_RECORD_SIMPLE:
 412                         continue;
 413                 case PERF_RECORD_IRQ:
 414                         perf_store_irq_data(counter, instruction_pointer(regs));
 415                         break;
 416                 case PERF_RECORD_GROUP:
 417                         perf_handle_group(counter, &status, &ack);
 418                         break;
 419                 }
 420                 /*
 421                  * From NMI context we cannot call into the scheduler to
 422                  * do a task wakeup - but we mark these counters as
 423                  * wakeup_pending and initate a wakeup callback:
 424                  */
 425                 if (nmi) {
 426                         counter->wakeup_pending = 1;
 427                         set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
 428                 } else {
 429                         wake_up(&counter->waitq);
 430                 }
 431         }
 432
 433         wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);
 434
 435         /*
 436          * Repeat if there is more work to be done:
 437          */
 438         rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 439         if (status)
 440                 goto again;
 441 out:
 442         /*
 443          * Restore - do not reenable when global enable is off:
 444          */
 445         wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
 446 }
 447
 448 void smp_perf_counter_interrupt(struct pt_regs *regs)
 449 {
 450         irq_enter();
 451         inc_irq_stat(apic_perf_irqs);
 452         apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 453         __smp_perf_counter_interrupt(regs, 0);
 454
 455         irq_exit();
 456 }
 457
 458 /*
 459  * This handler is triggered by NMI contexts:
 460  */
 461 void perf_counter_notify(struct pt_regs *regs)
 462 {
 463         struct cpu_hw_counters *cpuc;
 464         unsigned long flags;
 465         int bit, cpu;
 466
 467         local_irq_save(flags);
 468         cpu = smp_processor_id();
 469         cpuc = &per_cpu(cpu_hw_counters, cpu);
 470
 471         for_each_bit(bit, cpuc->used, nr_hw_counters) {
 472                 struct perf_counter *counter = cpuc->counters[bit];
 473
 474                 if (!counter)
 475                         continue;
 476
 477                 if (counter->wakeup_pending) {
 478                         counter->wakeup_pending = 0;
 479                         wake_up(&counter->waitq);
 480                 }
 481         }
 482
 483         local_irq_restore(flags);
 484 }
 485
 486 void __cpuinit perf_counters_lapic_init(int nmi)
 487 {
 488         u32 apic_val;
 489
 490         if (!perf_counters_initialized)
 491                 return;
 492         /*
 493          * Enable the performance counter vector in the APIC LVT:
 494          */
 495         apic_val = apic_read(APIC_LVTERR);
 496
 497         apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
 498         if (nmi)
 499                 apic_write(APIC_LVTPC, APIC_DM_NMI);
 500         else
 501                 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 502         apic_write(APIC_LVTERR, apic_val);
 503 }
 504
 505 static int __kprobes
 506 perf_counter_nmi_handler(struct notifier_block *self,
 507                          unsigned long cmd, void *__args)
 508 {
 509         struct die_args *args = __args;
 510         struct pt_regs *regs;
 511
 512         if (likely(cmd != DIE_NMI_IPI))
 513                 return NOTIFY_DONE;
 514
 515         regs = args->regs;
 516
 517         apic_write(APIC_LVTPC, APIC_DM_NMI);
 518         __smp_perf_counter_interrupt(regs, 1);
 519
 520         return NOTIFY_STOP;
 521 }
 522
 523 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 524         .notifier_call          = perf_counter_nmi_handler
 525 };
 526
 527 void __init init_hw_perf_counters(void)
 528 {
 529         union cpuid10_eax eax;
 530         unsigned int unused;
 531         unsigned int ebx;
 532
 533         if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 534                 return;
 535
 536         /*
 537          * Check whether the Architectural PerfMon supports
 538          * Branch Misses Retired Event or not.
 539          */
 540         cpuid(10, &(eax.full), &ebx, &unused, &unused);
 541         if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 542                 return;
 543
 544         printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
 545
 546         printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
 547         printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
 548         nr_hw_counters = eax.split.num_counters;
 549         if (nr_hw_counters > MAX_HW_COUNTERS) {
 550                 nr_hw_counters = MAX_HW_COUNTERS;
 551                 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
 552                         nr_hw_counters, MAX_HW_COUNTERS);
 553         }
 554         perf_counter_mask = (1 << nr_hw_counters) - 1;
 555         perf_max_counters = nr_hw_counters;
 556
 557         printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
 558         printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
 559
 560         perf_counters_initialized = true;
 561
 562         perf_counters_lapic_init(0);
 563         register_die_notifier(&perf_counter_nmi_notifier);
 564 }
 565
 566 static void x86_perf_counter_read(struct perf_counter *counter)
 567 {
 568         x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 569 }
 570
 571 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
 572         .hw_perf_counter_enable         = x86_perf_counter_enable,
 573         .hw_perf_counter_disable        = x86_perf_counter_disable,
 574         .hw_perf_counter_read           = x86_perf_counter_read,
 575 };
 576
 577 const struct hw_perf_counter_ops *
 578 hw_perf_counter_init(struct perf_counter *counter)
 579 {
 580         int err;
 581
 582         err = __hw_perf_counter_init(counter);
 583         if (err)
 584                 return NULL;
 585
 586         return &x86_perf_counter_ops;
 587 }