/*
 * Performance counter support - powerpc architecture code
 *
 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/perf_counter.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/reg.h>
#include <asm/pmc.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
struct cpu_hw_counters {
	int n_counters;
	int n_percpu;
	int disabled;
	int n_added;
	struct perf_counter *counter[MAX_HWCOUNTERS];
	unsigned int events[MAX_HWCOUNTERS];
	u64 mmcr[3];
	u8 pmcs_enabled;
};
DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);

struct power_pmu *ppmu;

/*
 * Normally, to ignore kernel events we set the FCS (freeze counters
 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
 * hypervisor bit set in the MSR, or if we are running on a processor
 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
 * then we need to use the FCHV bit to ignore kernel events.
 */
static unsigned int freeze_counters_kernel = MMCR0_FCS;

static void perf_counter_interrupt(struct pt_regs *regs);

void perf_counter_print_debug(void)
{
}

/*
 * Read one performance monitor counter (PMC).
 */
static unsigned long read_pmc(int idx)
{
	unsigned long val;

	switch (idx) {
	case 1:	val = mfspr(SPRN_PMC1); break;
	case 2:	val = mfspr(SPRN_PMC2); break;
	case 3:	val = mfspr(SPRN_PMC3); break;
	case 4:	val = mfspr(SPRN_PMC4); break;
	case 5:	val = mfspr(SPRN_PMC5); break;
	case 6:	val = mfspr(SPRN_PMC6); break;
	case 7:	val = mfspr(SPRN_PMC7); break;
	case 8:	val = mfspr(SPRN_PMC8); break;
	default:
		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
		val = 0;
	}
	return val;
}

/*
 * Write one PMC.
 */
static void write_pmc(int idx, unsigned long val)
{
	switch (idx) {
	case 1:	mtspr(SPRN_PMC1, val); break;
	case 2:	mtspr(SPRN_PMC2, val); break;
	case 3:	mtspr(SPRN_PMC3, val); break;
	case 4:	mtspr(SPRN_PMC4, val); break;
	case 5:	mtspr(SPRN_PMC5, val); break;
	case 6:	mtspr(SPRN_PMC6, val); break;
	case 7:	mtspr(SPRN_PMC7, val); break;
	case 8:	mtspr(SPRN_PMC8, val); break;
	default:
		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
	}
}

/*
 * Check if a set of events can all go on the PMU at once.
 * If they can't, this will look at alternative codes for the events
 * and see if any combination of alternative codes is feasible.
 * The feasible set is returned in event[].
 */
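/*
 * Each event's constraint is a (mask, value) pair.  "Select" fields
 * (e.g. unit or byte-lane selects) must take the same value for every
 * event that uses them, so they are ORed together and checked under
 * the mask.  "Adder" fields count how many events need a resource:
 * add_fields supplies the per-event increment and test_adder is chosen
 * so that exceeding the resource's capacity makes the field carry into
 * bits covered by the mask, which the checks below detect.
 */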
static int power_check_constraints(unsigned int event[], int n_ev)
{
	u64 mask, value, nv;
	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
	int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
	int i, j;
	u64 addf = ppmu->add_fields;
	u64 tadd = ppmu->test_adder;

	if (n_ev > ppmu->n_counter)
		return -1;

	/* First see if the events will go on as-is */
	for (i = 0; i < n_ev; ++i) {
		alternatives[i][0] = event[i];
		if (ppmu->get_constraint(event[i], &amasks[i][0],
					 &avalues[i][0]))
			return -1;
		choice[i] = 0;
	}
	value = mask = 0;
	for (i = 0; i < n_ev; ++i) {
		nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
		if ((((nv + tadd) ^ value) & mask) != 0 ||
		    (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
			break;
		value = nv;
		mask |= amasks[i][0];
	}
	if (i == n_ev)
		return 0;	/* all OK */

	/* doesn't work, gather alternatives... */
	if (!ppmu->get_alternatives)
		return -1;
	for (i = 0; i < n_ev; ++i) {
		n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
		for (j = 1; j < n_alt[i]; ++j)
			ppmu->get_constraint(alternatives[i][j],
					     &amasks[i][j], &avalues[i][j]);
	}

	/* enumerate all possibilities and see if any will work */
	i = 0;
	j = -1;
	value = mask = nv = 0;
	while (1) {
		if (j >= 0) {
			/* we're backtracking, restore context */
			value = svalues[i];
			mask = smasks[i];
			j = choice[i];
		}
		/*
		 * See if any alternative k for event i,
		 * where k > j, will satisfy the constraints.
		 */
		while (++j < n_alt[i]) {
			nv = (value | avalues[i][j]) +
				(value & avalues[i][j] & addf);
			if ((((nv + tadd) ^ value) & mask) == 0 &&
			    (((nv + tadd) ^ avalues[i][j])
			     & amasks[i][j]) == 0)
				break;
		}
		if (j >= n_alt[i]) {
			/*
			 * No feasible alternative, backtrack
			 * to event i-1 and continue enumerating its
			 * alternatives from where we got up to.
			 */
			if (--i < 0)
				return -1;
		} else {
			/*
			 * Found a feasible alternative for event i,
			 * remember where we got up to with this event,
			 * go on to the next event, and start with
			 * the first alternative for it.
			 */
			choice[i] = j;
			svalues[i] = value;
			smasks[i] = mask;
			value = nv;
			mask |= amasks[i][j];
			if (++i == n_ev)
				break;
			j = -1;
		}
	}

	/* OK, we have a feasible combination, tell the caller the solution */
	for (i = 0; i < n_ev; ++i)
		event[i] = alternatives[i][choice[i]];
	return 0;
}

/*
 * Check if newly-added counters have consistent settings for
 * exclude_{user,kernel,hv} with each other and any previously
 * added counters.
 */
static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
{
	int eu, ek, eh;
	int i, n;
	struct perf_counter *counter;

	n = n_prev + n_new;
	if (n <= 1)
		return 0;

	eu = ctrs[0]->hw_event.exclude_user;
	ek = ctrs[0]->hw_event.exclude_kernel;
	eh = ctrs[0]->hw_event.exclude_hv;
	if (n_prev == 0)
		n_prev = 1;
	for (i = n_prev; i < n; ++i) {
		counter = ctrs[i];
		if (counter->hw_event.exclude_user != eu ||
		    counter->hw_event.exclude_kernel != ek ||
		    counter->hw_event.exclude_hv != eh)
			return -EAGAIN;
	}
	return 0;
}

static void power_pmu_read(struct perf_counter *counter)
{
	long val, delta, prev;

	if (!counter->hw.idx)
		return;
	/*
	 * Performance monitor interrupts come even when interrupts
	 * are soft-disabled, as long as interrupts are hard-enabled.
	 * Therefore we treat them like NMIs.
	 */
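	/*
	 * The PMU interrupt path may update prev_count underneath us,
	 * so re-read until the cmpxchg below succeeds.
	 */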
	do {
		prev = atomic64_read(&counter->hw.prev_count);
		barrier();
		val = read_pmc(counter->hw.idx);
	} while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);

	/* The counters are only 32 bits wide */
	delta = (val - prev) & 0xfffffffful;
	atomic64_add(delta, &counter->count);
	atomic64_sub(delta, &counter->hw.period_left);
}

/*
 * Disable all counters to prevent PMU interrupts and to allow
 * counters to be added or removed.
 */
u64 hw_perf_save_disable(void)
{
	struct cpu_hw_counters *cpuhw;
	unsigned long ret;
	unsigned long flags;

	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_counters);

	ret = cpuhw->disabled;
	if (!ret) {
		cpuhw->disabled = 1;
		cpuhw->n_added = 0;

		/*
		 * Check if we ever enabled the PMU on this cpu.
		 */
		if (!cpuhw->pmcs_enabled) {
			if (ppc_md.enable_pmcs)
				ppc_md.enable_pmcs();
			cpuhw->pmcs_enabled = 1;
		}

		/*
		 * Disable instruction sampling if it was enabled
		 */
		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
			mtspr(SPRN_MMCRA,
			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
			mb();
		}

		/*
		 * Set the 'freeze counters' bit.
		 * The barrier is to make sure the mtspr has been
		 * executed and the PMU has frozen the counters
		 * before we return.
		 */
		mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
		mb();
	}
	local_irq_restore(flags);
	return ret;
}

/*
 * Re-enable all counters if disable == 0.
 * If we were previously disabled and counters were added, then
 * put the new config on the PMU.
 */
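/*
 * "disable" is the value returned by the matching hw_perf_save_disable()
 * call, so nested disable/restore pairs only actually re-enable the PMU
 * at the outermost restore.
 */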
void hw_perf_restore(u64 disable)
{
	struct perf_counter *counter;
	struct cpu_hw_counters *cpuhw;
	unsigned long flags;
	long i;
	unsigned long val;
	s64 left;
	unsigned int hwc_index[MAX_HWCOUNTERS];

	if (disable)
		return;
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_counters);
	cpuhw->disabled = 0;

	/*
	 * If we didn't change anything, or only removed counters,
	 * no need to recalculate MMCR* settings and reset the PMCs.
	 * Just reenable the PMU with the current MMCR* settings
	 * (possibly updated for removal of counters).
	 */
	if (!cpuhw->n_added) {
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
		if (cpuhw->n_counters == 0)
			get_lppaca()->pmcregs_in_use = 0;
		goto out_enable;
	}

	/*
	 * Compute MMCR* values for the new set of counters
	 */
	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
			       cpuhw->mmcr)) {
		/* shouldn't ever get here */
		printk(KERN_ERR "oops compute_mmcr failed\n");
		goto out;
	}

	/*
	 * Add in MMCR0 freeze bits corresponding to the
	 * hw_event.exclude_* bits for the first counter.
	 * We have already checked that all counters have the
	 * same values for these bits as the first counter.
	 */
	counter = cpuhw->counter[0];
	if (counter->hw_event.exclude_user)
		cpuhw->mmcr[0] |= MMCR0_FCP;
	if (counter->hw_event.exclude_kernel)
		cpuhw->mmcr[0] |= freeze_counters_kernel;
	if (counter->hw_event.exclude_hv)
		cpuhw->mmcr[0] |= MMCR0_FCHV;

	/*
	 * Write the new configuration to MMCR* with the freeze
	 * bit set and set the hardware counters to their initial values.
	 * Then unfreeze the counters.
	 */
	get_lppaca()->pmcregs_in_use = 1;
	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
		| MMCR0_FC);
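	/*
	 * The counter-negative exception enables (PMC1CE/PMCjCE) are
	 * cleared and FC is set while the PMCs are loaded below, so no
	 * PMU exception can be taken until MMCR0 is rewritten at
	 * out_enable.
	 */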

	/*
	 * Read off any pre-existing counters that need to move
	 * to another PMC.
	 */
	for (i = 0; i < cpuhw->n_counters; ++i) {
		counter = cpuhw->counter[i];
		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
			power_pmu_read(counter);
			write_pmc(counter->hw.idx, 0);
			counter->hw.idx = 0;
		}
	}

	/*
	 * Initialize the PMCs for all the new and moved counters.
	 */
	for (i = 0; i < cpuhw->n_counters; ++i) {
		counter = cpuhw->counter[i];
		if (counter->hw.idx)
			continue;
		val = 0;
		if (counter->hw_event.irq_period) {
			left = atomic64_read(&counter->hw.period_left);
			if (left < 0x80000000L)
				val = 0x80000000L - left;
		}
		atomic64_set(&counter->hw.prev_count, val);
		counter->hw.idx = hwc_index[i] + 1;
		write_pmc(counter->hw.idx, val);
		perf_counter_update_userpage(counter);
	}
	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;

 out_enable:
	mb();
	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);

	/*
	 * Enable instruction sampling if necessary
	 */
	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
		mb();
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
	}

 out:
	local_irq_restore(flags);
}
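
/*
 * Collect the group leader and its sibling counters that are backed by
 * hardware (i.e. not software counters and not OFF) into ctrs[] and
 * events[].  Returns the number collected, or -1 if max_count would be
 * exceeded.
 */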
static int collect_events(struct perf_counter *group, int max_count,
			  struct perf_counter *ctrs[], unsigned int *events)
{
	int n = 0;
	struct perf_counter *counter;

	if (!is_software_counter(group)) {
		if (n >= max_count)
			return -1;
		ctrs[n] = group;
		events[n++] = group->hw.config;
	}
	list_for_each_entry(counter, &group->sibling_list, list_entry) {
		if (!is_software_counter(counter) &&
		    counter->state != PERF_COUNTER_STATE_OFF) {
			if (n >= max_count)
				return -1;
			ctrs[n] = counter;
			events[n++] = counter->hw.config;
		}
	}
	return n;
}

static void counter_sched_in(struct perf_counter *counter, int cpu)
{
	counter->state = PERF_COUNTER_STATE_ACTIVE;
	counter->oncpu = cpu;
	counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
	if (is_software_counter(counter))
		counter->pmu->enable(counter);
}

/*
 * Called to enable a whole group of counters.
 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
 * Assumes the caller has disabled interrupts and has
 * frozen the PMU with hw_perf_save_disable.
 */
int hw_perf_group_sched_in(struct perf_counter *group_leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_counter_context *ctx, int cpu)
{
	struct cpu_hw_counters *cpuhw;
	long i, n, n0;
	struct perf_counter *sub;

	cpuhw = &__get_cpu_var(cpu_hw_counters);
	n0 = cpuhw->n_counters;
	n = collect_events(group_leader, ppmu->n_counter - n0,
			   &cpuhw->counter[n0], &cpuhw->events[n0]);
	if (n < 0)
		return -EAGAIN;
	if (check_excludes(cpuhw->counter, n0, n))
		return -EAGAIN;
	if (power_check_constraints(cpuhw->events, n + n0))
		return -EAGAIN;
	cpuhw->n_counters = n0 + n;
	cpuhw->n_added += n;

	/*
	 * OK, this group can go on; update counter states etc.,
	 * and enable any software counters
	 */
	for (i = n0; i < n0 + n; ++i)
		cpuhw->counter[i]->hw.config = cpuhw->events[i];
	cpuctx->active_oncpu += n;
	n = 1;
	counter_sched_in(group_leader, cpu);
	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
		if (sub->state != PERF_COUNTER_STATE_OFF) {
			counter_sched_in(sub, cpu);
			++n;
		}
	}
	ctx->nr_active += n;

	return 1;
}

/*
 * Add a counter to the PMU.
 * If all counters are not already frozen, then we disable and
 * re-enable the PMU in order to get hw_perf_restore to do the
 * actual work of reconfiguring the PMU.
 */
static int power_pmu_enable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuhw;
	unsigned long flags;
	u64 pmudis;
	int n0;
	int ret = -EAGAIN;

	local_irq_save(flags);
	pmudis = hw_perf_save_disable();

	/*
	 * Add the counter to the list (if there is room)
	 * and check whether the total set is still feasible.
	 */
	cpuhw = &__get_cpu_var(cpu_hw_counters);
	n0 = cpuhw->n_counters;
	if (n0 >= ppmu->n_counter)
		goto out;
	cpuhw->counter[n0] = counter;
	cpuhw->events[n0] = counter->hw.config;
	if (check_excludes(cpuhw->counter, n0, 1))
		goto out;
	if (power_check_constraints(cpuhw->events, n0 + 1))
		goto out;

	counter->hw.config = cpuhw->events[n0];
	++cpuhw->n_counters;
	++cpuhw->n_added;

	ret = 0;
 out:
	hw_perf_restore(pmudis);
	local_irq_restore(flags);
	return ret;
}

/*
 * Remove a counter from the PMU.
 */
static void power_pmu_disable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuhw;
	long i;
	u64 pmudis;
	unsigned long flags;

	local_irq_save(flags);
	pmudis = hw_perf_save_disable();

	power_pmu_read(counter);

	cpuhw = &__get_cpu_var(cpu_hw_counters);
	for (i = 0; i < cpuhw->n_counters; ++i) {
		if (counter == cpuhw->counter[i]) {
			while (++i < cpuhw->n_counters)
				cpuhw->counter[i-1] = cpuhw->counter[i];
			--cpuhw->n_counters;
			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
			write_pmc(counter->hw.idx, 0);
			counter->hw.idx = 0;
			perf_counter_update_userpage(counter);
			break;
		}
	}
	if (cpuhw->n_counters == 0) {
		/* disable exceptions if no counters are running */
		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
	}

	hw_perf_restore(pmudis);
	local_irq_restore(flags);
}

struct pmu power_pmu = {
	.enable		= power_pmu_enable,
	.disable	= power_pmu_disable,
	.read		= power_pmu_read,
};

/* Number of perf_counters counting hardware events */
static atomic_t num_counters;
/* Used to avoid races in calling reserve/release_pmc_hardware */
static DEFINE_MUTEX(pmc_reserve_mutex);

/*
 * Release the PMU if this is the last perf_counter.
 */
static void hw_perf_counter_destroy(struct perf_counter *counter)
{
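	/*
	 * atomic_add_unless() refuses to decrement when num_counters is 1,
	 * i.e. when we are the last user; in that case take the mutex so
	 * that the final decrement and release_pmc_hardware() cannot race
	 * with a concurrent hw_perf_counter_init().
	 */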
	if (!atomic_add_unless(&num_counters, -1, 1)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_dec_return(&num_counters) == 0)
			release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
{
	unsigned long ev;
	struct perf_counter *ctrs[MAX_HWCOUNTERS];
	unsigned int events[MAX_HWCOUNTERS];
	int n;
	int err;

	if (!ppmu)
		return ERR_PTR(-ENXIO);
	if ((s64)counter->hw_event.irq_period < 0)
		return ERR_PTR(-EINVAL);
	if (!perf_event_raw(&counter->hw_event)) {
		ev = perf_event_id(&counter->hw_event);
		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
			return ERR_PTR(-EOPNOTSUPP);
		ev = ppmu->generic_events[ev];
	} else {
		ev = perf_event_config(&counter->hw_event);
	}
	counter->hw.config_base = ev;
	counter->hw.idx = 0;

	/*
	 * If we are not running on a hypervisor, force the
	 * exclude_hv bit to 0 so that we don't care what
	 * the user set it to.
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		counter->hw_event.exclude_hv = 0;

	/*
	 * If this is in a group, check if it can go on with all the
	 * other hardware counters in the group.  We assume the counter
	 * hasn't been linked into its leader's sibling list at this point.
	 */
	n = 0;
	if (counter->group_leader != counter) {
		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
				   ctrs, events);
		if (n < 0)
			return ERR_PTR(-EINVAL);
	}
	events[n] = ev;
	ctrs[n] = counter;
	if (check_excludes(ctrs, n, 1))
		return ERR_PTR(-EINVAL);
	if (power_check_constraints(events, n + 1))
		return ERR_PTR(-EINVAL);

	counter->hw.config = events[n];
	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);

	/*
	 * See if we need to reserve the PMU.
	 * If no counters are currently in use, then we have to take a
	 * mutex to ensure that we don't race with another task doing
	 * reserve_pmc_hardware or release_pmc_hardware.
	 */
	err = 0;
	if (!atomic_inc_not_zero(&num_counters)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&num_counters) == 0 &&
		    reserve_pmc_hardware(perf_counter_interrupt))
			err = -EBUSY;
		else
			atomic_inc(&num_counters);
		mutex_unlock(&pmc_reserve_mutex);
	}
	counter->destroy = hw_perf_counter_destroy;

	if (err)
		return ERR_PTR(err);
	return &power_pmu;
}

/*
 * A counter has overflowed; update its count and record
 * things if requested.  Note that interrupts are hard-disabled
 * here so there is no possibility of being interrupted.
 */
static void record_and_restart(struct perf_counter *counter, long val,
			       struct pt_regs *regs, int nmi)
{
	s64 prev, delta, left;
	int record = 0;

	/* we don't have to worry about interrupts here */
	prev = atomic64_read(&counter->hw.prev_count);
	delta = (val - prev) & 0xfffffffful;
	atomic64_add(delta, &counter->count);

	/*
	 * See if the total period for this counter has expired,
	 * and update for the next period.
	 */
	val = 0;
	left = atomic64_read(&counter->hw.period_left) - delta;
	if (counter->hw_event.irq_period) {
		if (left <= 0) {
			left += counter->hw_event.irq_period;
			if (left <= 0)
				left = counter->hw_event.irq_period;
			record = 1;
		}
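		/*
		 * The PMC raises an exception when the counter goes
		 * negative (bit 31 becomes set), so program it with
		 * 0x80000000 - left to get another interrupt after
		 * "left" more events.
		 */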
		if (left < 0x80000000L)
			val = 0x80000000L - left;
	}
	write_pmc(counter->hw.idx, val);
	atomic64_set(&counter->hw.prev_count, val);
	atomic64_set(&counter->hw.period_left, left);
	perf_counter_update_userpage(counter);

	/*
	 * Finally record data if requested.
	 */
	if (record)
		perf_counter_overflow(counter, nmi, regs, 0);
}

/*
 * Performance monitor interrupt stuff
 */
static void perf_counter_interrupt(struct pt_regs *regs)
{
	int i;
	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
	struct perf_counter *counter;
	long val;
	int found = 0;
	int nmi;

	/*
	 * If interrupts were soft-disabled when this PMU interrupt
	 * occurred, treat it as an NMI.
	 */
	nmi = !regs->softe;
	if (nmi)
		nmi_enter();
	else
		irq_enter();

	for (i = 0; i < cpuhw->n_counters; ++i) {
		counter = cpuhw->counter[i];
		val = read_pmc(counter->hw.idx);
		if ((int)val < 0) {
			/* counter has overflowed */
			found = 1;
			record_and_restart(counter, val, regs, nmi);
		}
	}

	/*
	 * In case we didn't find and reset the counter that caused
	 * the interrupt, scan all counters and reset any that are
	 * negative, to avoid getting continual interrupts.
	 * Any that we processed in the previous loop will not be negative.
	 */
	if (!found) {
		for (i = 0; i < ppmu->n_counter; ++i) {
			val = read_pmc(i + 1);
			if ((int)val < 0)
				write_pmc(i + 1, 0);
		}
	}

	/*
	 * Reset MMCR0 to its normal value.  This will set PMXE and
	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
	 * and thus allow interrupts to occur again.
	 * XXX might want to use MSR.PM to keep the counters frozen until
	 * we get back out of this interrupt.
	 */
	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);

	if (nmi)
		nmi_exit();
	else
		irq_exit();
}

void hw_perf_counter_setup(int cpu)
{
	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);

	memset(cpuhw, 0, sizeof(*cpuhw));
	cpuhw->mmcr[0] = MMCR0_FC;
}

extern struct power_pmu power4_pmu;
extern struct power_pmu ppc970_pmu;
extern struct power_pmu power5_pmu;
extern struct power_pmu power5p_pmu;
extern struct power_pmu power6_pmu;

static int init_perf_counters(void)
{
	unsigned long pvr;

	/* XXX should get this from cputable */
	pvr = mfspr(SPRN_PVR);
	switch (PVR_VER(pvr)) {
	case PV_POWER4:
	case PV_POWER4p:
		ppmu = &power4_pmu;
		break;
	case PV_970:
	case PV_970FX:
	case PV_970MP:
		ppmu = &ppc970_pmu;
		break;
	case PV_POWER5:
		ppmu = &power5_pmu;
		break;
	case PV_POWER5p:
		ppmu = &power5p_pmu;
		break;
	case PV_POWER6:
		ppmu = &power6_pmu;
		break;
	}

	/*
	 * Use FCHV to ignore kernel events if MSR.HV is set.
	 */
	if (mfmsr() & MSR_HV)
		freeze_counters_kernel = MMCR0_FCHV;

	return 0;
}

arch_initcall(init_perf_counters);