kernel/perf_event.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/slab.h>
  19 #include <linux/sysfs.h>
  20 #include <linux/dcache.h>
  21 #include <linux/percpu.h>
  22 #include <linux/ptrace.h>
  23 #include <linux/vmstat.h>
  24 #include <linux/vmalloc.h>
  25 #include <linux/hardirq.h>
  26 #include <linux/rculist.h>
  27 #include <linux/uaccess.h>
  28 #include <linux/syscalls.h>
  29 #include <linux/anon_inodes.h>
  30 #include <linux/kernel_stat.h>
  31 #include <linux/perf_event.h>
  32 #include <linux/ftrace_event.h>
  33 #include <linux/hw_breakpoint.h>
  34
  35 #include <asm/irq_regs.h>
  36
  37 /*
  38  * Each CPU has a list of per CPU events:
  39  */
  40 static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  41
  42 int perf_max_events __read_mostly = 1;
  43 static int perf_reserved_percpu __read_mostly;
  44 static int perf_overcommit __read_mostly = 1;
  45
  46 static atomic_t nr_events __read_mostly;
  47 static atomic_t nr_mmap_events __read_mostly;
  48 static atomic_t nr_comm_events __read_mostly;
  49 static atomic_t nr_task_events __read_mostly;
  50
  51 /*
  52  * perf event paranoia level:
  53  *  -1 - not paranoid at all
  54  *   0 - disallow raw tracepoint access for unpriv
  55  *   1 - disallow cpu events for unpriv
  56  *   2 - disallow kernel profiling for unpriv
  57  */
  58 int sysctl_perf_event_paranoid __read_mostly = 1;
  59
  60 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
  61
  62 /*
  63  * max perf event sample rate
  64  */
  65 int sysctl_perf_event_sample_rate __read_mostly = 100000;
  66
  67 static atomic64_t perf_event_id;
  68
  69 /*
  70  * Lock for (sysadmin-configurable) event reservations:
  71  */
  72 static DEFINE_SPINLOCK(perf_resource_lock);
  73
  74 /*
  75  * Architecture provided APIs - weak aliases:
  76  */
  77 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
  78 {
  79         return NULL;
  80 }
  81
  82 void __weak hw_perf_disable(void)               { barrier(); }
  83 void __weak hw_perf_enable(void)                { barrier(); }
  84
  85 int __weak
  86 hw_perf_group_sched_in(struct perf_event *group_leader,
  87                struct perf_cpu_context *cpuctx,
  88                struct perf_event_context *ctx)
  89 {
  90         return 0;
  91 }
  92
  93 void __weak perf_event_print_debug(void)        { }
  94
  95 static DEFINE_PER_CPU(int, perf_disable_count);
  96
  97 void perf_disable(void)
  98 {
  99         if (!__get_cpu_var(perf_disable_count)++)
 100                 hw_perf_disable();
 101 }
 102
 103 void perf_enable(void)
 104 {
 105         if (!--__get_cpu_var(perf_disable_count))
 106                 hw_perf_enable();
 107 }
 108
 109 static void get_ctx(struct perf_event_context *ctx)
 110 {
 111         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 112 }
 113
 114 static void free_ctx(struct rcu_head *head)
 115 {
 116         struct perf_event_context *ctx;
 117
 118         ctx = container_of(head, struct perf_event_context, rcu_head);
 119         kfree(ctx);
 120 }
 121
 122 static void put_ctx(struct perf_event_context *ctx)
 123 {
 124         if (atomic_dec_and_test(&ctx->refcount)) {
 125                 if (ctx->parent_ctx)
 126                         put_ctx(ctx->parent_ctx);
 127                 if (ctx->task)
 128                         put_task_struct(ctx->task);
 129                 call_rcu(&ctx->rcu_head, free_ctx);
 130         }
 131 }
 132
 133 static void unclone_ctx(struct perf_event_context *ctx)
 134 {
 135         if (ctx->parent_ctx) {
 136                 put_ctx(ctx->parent_ctx);
 137                 ctx->parent_ctx = NULL;
 138         }
 139 }
 140
 141 /*
 142  * If we inherit events we want to return the parent event id
 143  * to userspace.
 144  */
 145 static u64 primary_event_id(struct perf_event *event)
 146 {
 147         u64 id = event->id;
 148
 149         if (event->parent)
 150                 id = event->parent->id;
 151
 152         return id;
 153 }
 154
 155 /*
 156  * Get the perf_event_context for a task and lock it.
 157  * This has to cope with with the fact that until it is locked,
 158  * the context could get moved to another task.
 159  */
 160 static struct perf_event_context *
 161 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 162 {
 163         struct perf_event_context *ctx;
 164
 165         rcu_read_lock();
 166  retry:
 167         ctx = rcu_dereference(task->perf_event_ctxp);
 168         if (ctx) {
 169                 /*
 170                  * If this context is a clone of another, it might
 171                  * get swapped for another underneath us by
 172                  * perf_event_task_sched_out, though the
 173                  * rcu_read_lock() protects us from any context
 174                  * getting freed.  Lock the context and check if it
 175                  * got swapped before we could get the lock, and retry
 176                  * if so.  If we locked the right context, then it
 177                  * can't get swapped on us any more.
 178                  */
 179                 raw_spin_lock_irqsave(&ctx->lock, *flags);
 180                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
 181                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 182                         goto retry;
 183                 }
 184
 185                 if (!atomic_inc_not_zero(&ctx->refcount)) {
 186                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 187                         ctx = NULL;
 188                 }
 189         }
 190         rcu_read_unlock();
 191         return ctx;
 192 }
 193
 194 /*
 195  * Get the context for a task and increment its pin_count so it
 196  * can't get swapped to another task.  This also increments its
 197  * reference count so that the context can't get freed.
 198  */
 199 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
 200 {
 201         struct perf_event_context *ctx;
 202         unsigned long flags;
 203
 204         ctx = perf_lock_task_context(task, &flags);
 205         if (ctx) {
 206                 ++ctx->pin_count;
 207                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
 208         }
 209         return ctx;
 210 }
 211
 212 static void perf_unpin_context(struct perf_event_context *ctx)
 213 {
 214         unsigned long flags;
 215
 216         raw_spin_lock_irqsave(&ctx->lock, flags);
 217         --ctx->pin_count;
 218         raw_spin_unlock_irqrestore(&ctx->lock, flags);
 219         put_ctx(ctx);
 220 }
 221
 222 static inline u64 perf_clock(void)
 223 {
 224         return cpu_clock(raw_smp_processor_id());
 225 }
 226
 227 /*
 228  * Update the record of the current time in a context.
 229  */
 230 static void update_context_time(struct perf_event_context *ctx)
 231 {
 232         u64 now = perf_clock();
 233
 234         ctx->time += now - ctx->timestamp;
 235         ctx->timestamp = now;
 236 }
 237
 238 /*
 239  * Update the total_time_enabled and total_time_running fields for a event.
 240  */
 241 static void update_event_times(struct perf_event *event)
 242 {
 243         struct perf_event_context *ctx = event->ctx;
 244         u64 run_end;
 245
 246         if (event->state < PERF_EVENT_STATE_INACTIVE ||
 247             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 248                 return;
 249
 250         if (ctx->is_active)
 251                 run_end = ctx->time;
 252         else
 253                 run_end = event->tstamp_stopped;
 254
 255         event->total_time_enabled = run_end - event->tstamp_enabled;
 256
 257         if (event->state == PERF_EVENT_STATE_INACTIVE)
 258                 run_end = event->tstamp_stopped;
 259         else
 260                 run_end = ctx->time;
 261
 262         event->total_time_running = run_end - event->tstamp_running;
 263 }
 264
 265 static struct list_head *
 266 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 267 {
 268         if (event->attr.pinned)
 269                 return &ctx->pinned_groups;
 270         else
 271                 return &ctx->flexible_groups;
 272 }
 273
 274 /*
 275  * Add a event from the lists for its context.
 276  * Must be called with ctx->mutex and ctx->lock held.
 277  */
 278 static void
 279 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 280 {
 281         struct perf_event *group_leader = event->group_leader;
 282
 283         /*
 284          * Depending on whether it is a standalone or sibling event,
 285          * add it straight to the context's event list, or to the group
 286          * leader's sibling list:
 287          */
 288         if (group_leader == event) {
 289                 struct list_head *list;
 290
 291                 if (is_software_event(event))
 292                         event->group_flags |= PERF_GROUP_SOFTWARE;
 293
 294                 list = ctx_group_list(event, ctx);
 295                 list_add_tail(&event->group_entry, list);
 296         } else {
 297                 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
 298                     !is_software_event(event))
 299                         group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
 300
 301                 list_add_tail(&event->group_entry, &group_leader->sibling_list);
 302                 group_leader->nr_siblings++;
 303         }
 304
 305         list_add_rcu(&event->event_entry, &ctx->event_list);
 306         ctx->nr_events++;
 307         if (event->attr.inherit_stat)
 308                 ctx->nr_stat++;
 309 }
 310
 311 /*
 312  * Remove a event from the lists for its context.
 313  * Must be called with ctx->mutex and ctx->lock held.
 314  */
 315 static void
 316 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 317 {
 318         struct perf_event *sibling, *tmp;
 319
 320         if (list_empty(&event->group_entry))
 321                 return;
 322         ctx->nr_events--;
 323         if (event->attr.inherit_stat)
 324                 ctx->nr_stat--;
 325
 326         list_del_init(&event->group_entry);
 327         list_del_rcu(&event->event_entry);
 328
 329         if (event->group_leader != event)
 330                 event->group_leader->nr_siblings--;
 331
 332         update_event_times(event);
 333
 334         /*
 335          * If event was in error state, then keep it
 336          * that way, otherwise bogus counts will be
 337          * returned on read(). The only way to get out
 338          * of error state is by explicit re-enabling
 339          * of the event
 340          */
 341         if (event->state > PERF_EVENT_STATE_OFF)
 342                 event->state = PERF_EVENT_STATE_OFF;
 343
 344         /*
 345          * If this was a group event with sibling events then
 346          * upgrade the siblings to singleton events by adding them
 347          * to the context list directly:
 348          */
 349         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 350                 struct list_head *list;
 351
 352                 list = ctx_group_list(event, ctx);
 353                 list_move_tail(&sibling->group_entry, list);
 354                 sibling->group_leader = sibling;
 355
 356                 /* Inherit group flags from the previous leader */
 357                 sibling->group_flags = event->group_flags;
 358         }
 359 }
 360
 361 static void
 362 event_sched_out(struct perf_event *event,
 363                   struct perf_cpu_context *cpuctx,
 364                   struct perf_event_context *ctx)
 365 {
 366         if (event->state != PERF_EVENT_STATE_ACTIVE)
 367                 return;
 368
 369         event->state = PERF_EVENT_STATE_INACTIVE;
 370         if (event->pending_disable) {
 371                 event->pending_disable = 0;
 372                 event->state = PERF_EVENT_STATE_OFF;
 373         }
 374         event->tstamp_stopped = ctx->time;
 375         event->pmu->disable(event);
 376         event->oncpu = -1;
 377
 378         if (!is_software_event(event))
 379                 cpuctx->active_oncpu--;
 380         ctx->nr_active--;
 381         if (event->attr.exclusive || !cpuctx->active_oncpu)
 382                 cpuctx->exclusive = 0;
 383 }
 384
 385 static void
 386 group_sched_out(struct perf_event *group_event,
 387                 struct perf_cpu_context *cpuctx,
 388                 struct perf_event_context *ctx)
 389 {
 390         struct perf_event *event;
 391
 392         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
 393                 return;
 394
 395         event_sched_out(group_event, cpuctx, ctx);
 396
 397         /*
 398          * Schedule out siblings (if any):
 399          */
 400         list_for_each_entry(event, &group_event->sibling_list, group_entry)
 401                 event_sched_out(event, cpuctx, ctx);
 402
 403         if (group_event->attr.exclusive)
 404                 cpuctx->exclusive = 0;
 405 }
 406
 407 /*
 408  * Cross CPU call to remove a performance event
 409  *
 410  * We disable the event on the hardware level first. After that we
 411  * remove it from the context list.
 412  */
 413 static void __perf_event_remove_from_context(void *info)
 414 {
 415         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 416         struct perf_event *event = info;
 417         struct perf_event_context *ctx = event->ctx;
 418
 419         /*
 420          * If this is a task context, we need to check whether it is
 421          * the current task context of this cpu. If not it has been
 422          * scheduled out before the smp call arrived.
 423          */
 424         if (ctx->task && cpuctx->task_ctx != ctx)
 425                 return;
 426
 427         raw_spin_lock(&ctx->lock);
 428         /*
 429          * Protect the list operation against NMI by disabling the
 430          * events on a global level.
 431          */
 432         perf_disable();
 433
 434         event_sched_out(event, cpuctx, ctx);
 435
 436         list_del_event(event, ctx);
 437
 438         if (!ctx->task) {
 439                 /*
 440                  * Allow more per task events with respect to the
 441                  * reservation:
 442                  */
 443                 cpuctx->max_pertask =
 444                         min(perf_max_events - ctx->nr_events,
 445                             perf_max_events - perf_reserved_percpu);
 446         }
 447
 448         perf_enable();
 449         raw_spin_unlock(&ctx->lock);
 450 }
 451
 452
 453 /*
 454  * Remove the event from a task's (or a CPU's) list of events.
 455  *
 456  * Must be called with ctx->mutex held.
 457  *
 458  * CPU events are removed with a smp call. For task events we only
 459  * call when the task is on a CPU.
 460  *
 461  * If event->ctx is a cloned context, callers must make sure that
 462  * every task struct that event->ctx->task could possibly point to
 463  * remains valid.  This is OK when called from perf_release since
 464  * that only calls us on the top-level context, which can't be a clone.
 465  * When called from perf_event_exit_task, it's OK because the
 466  * context has been detached from its task.
 467  */
 468 static void perf_event_remove_from_context(struct perf_event *event)
 469 {
 470         struct perf_event_context *ctx = event->ctx;
 471         struct task_struct *task = ctx->task;
 472
 473         if (!task) {
 474                 /*
 475                  * Per cpu events are removed via an smp call and
 476                  * the removal is always successful.
 477                  */
 478                 smp_call_function_single(event->cpu,
 479                                          __perf_event_remove_from_context,
 480                                          event, 1);
 481                 return;
 482         }
 483
 484 retry:
 485         task_oncpu_function_call(task, __perf_event_remove_from_context,
 486                                  event);
 487
 488         raw_spin_lock_irq(&ctx->lock);
 489         /*
 490          * If the context is active we need to retry the smp call.
 491          */
 492         if (ctx->nr_active && !list_empty(&event->group_entry)) {
 493                 raw_spin_unlock_irq(&ctx->lock);
 494                 goto retry;
 495         }
 496
 497         /*
 498          * The lock prevents that this context is scheduled in so we
 499          * can remove the event safely, if the call above did not
 500          * succeed.
 501          */
 502         if (!list_empty(&event->group_entry))
 503                 list_del_event(event, ctx);
 504         raw_spin_unlock_irq(&ctx->lock);
 505 }
 506
 507 /*
 508  * Update total_time_enabled and total_time_running for all events in a group.
 509  */
 510 static void update_group_times(struct perf_event *leader)
 511 {
 512         struct perf_event *event;
 513
 514         update_event_times(leader);
 515         list_for_each_entry(event, &leader->sibling_list, group_entry)
 516                 update_event_times(event);
 517 }
 518
 519 /*
 520  * Cross CPU call to disable a performance event
 521  */
 522 static void __perf_event_disable(void *info)
 523 {
 524         struct perf_event *event = info;
 525         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 526         struct perf_event_context *ctx = event->ctx;
 527
 528         /*
 529          * If this is a per-task event, need to check whether this
 530          * event's task is the current task on this cpu.
 531          */
 532         if (ctx->task && cpuctx->task_ctx != ctx)
 533                 return;
 534
 535         raw_spin_lock(&ctx->lock);
 536
 537         /*
 538          * If the event is on, turn it off.
 539          * If it is in error state, leave it in error state.
 540          */
 541         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 542                 update_context_time(ctx);
 543                 update_group_times(event);
 544                 if (event == event->group_leader)
 545                         group_sched_out(event, cpuctx, ctx);
 546                 else
 547                         event_sched_out(event, cpuctx, ctx);
 548                 event->state = PERF_EVENT_STATE_OFF;
 549         }
 550
 551         raw_spin_unlock(&ctx->lock);
 552 }
 553
 554 /*
 555  * Disable a event.
 556  *
 557  * If event->ctx is a cloned context, callers must make sure that
 558  * every task struct that event->ctx->task could possibly point to
 559  * remains valid.  This condition is satisifed when called through
 560  * perf_event_for_each_child or perf_event_for_each because they
 561  * hold the top-level event's child_mutex, so any descendant that
 562  * goes to exit will block in sync_child_event.
 563  * When called from perf_pending_event it's OK because event->ctx
 564  * is the current context on this CPU and preemption is disabled,
 565  * hence we can't get into perf_event_task_sched_out for this context.
 566  */
 567 void perf_event_disable(struct perf_event *event)
 568 {
 569         struct perf_event_context *ctx = event->ctx;
 570         struct task_struct *task = ctx->task;
 571
 572         if (!task) {
 573                 /*
 574                  * Disable the event on the cpu that it's on
 575                  */
 576                 smp_call_function_single(event->cpu, __perf_event_disable,
 577                                          event, 1);
 578                 return;
 579         }
 580
 581  retry:
 582         task_oncpu_function_call(task, __perf_event_disable, event);
 583
 584         raw_spin_lock_irq(&ctx->lock);
 585         /*
 586          * If the event is still active, we need to retry the cross-call.
 587          */
 588         if (event->state == PERF_EVENT_STATE_ACTIVE) {
 589                 raw_spin_unlock_irq(&ctx->lock);
 590                 goto retry;
 591         }
 592
 593         /*
 594          * Since we have the lock this context can't be scheduled
 595          * in, so we can change the state safely.
 596          */
 597         if (event->state == PERF_EVENT_STATE_INACTIVE) {
 598                 update_group_times(event);
 599                 event->state = PERF_EVENT_STATE_OFF;
 600         }
 601
 602         raw_spin_unlock_irq(&ctx->lock);
 603 }
 604
 605 static int
 606 event_sched_in(struct perf_event *event,
 607                  struct perf_cpu_context *cpuctx,
 608                  struct perf_event_context *ctx)
 609 {
 610         if (event->state <= PERF_EVENT_STATE_OFF)
 611                 return 0;
 612
 613         event->state = PERF_EVENT_STATE_ACTIVE;
 614         event->oncpu = smp_processor_id();
 615         /*
 616          * The new state must be visible before we turn it on in the hardware:
 617          */
 618         smp_wmb();
 619
 620         if (event->pmu->enable(event)) {
 621                 event->state = PERF_EVENT_STATE_INACTIVE;
 622                 event->oncpu = -1;
 623                 return -EAGAIN;
 624         }
 625
 626         event->tstamp_running += ctx->time - event->tstamp_stopped;
 627
 628         if (!is_software_event(event))
 629                 cpuctx->active_oncpu++;
 630         ctx->nr_active++;
 631
 632         if (event->attr.exclusive)
 633                 cpuctx->exclusive = 1;
 634
 635         return 0;
 636 }
 637
 638 static int
 639 group_sched_in(struct perf_event *group_event,
 640                struct perf_cpu_context *cpuctx,
 641                struct perf_event_context *ctx)
 642 {
 643         struct perf_event *event, *partial_group;
 644         int ret;
 645
 646         if (group_event->state == PERF_EVENT_STATE_OFF)
 647                 return 0;
 648
 649         ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
 650         if (ret)
 651                 return ret < 0 ? ret : 0;
 652
 653         if (event_sched_in(group_event, cpuctx, ctx))
 654                 return -EAGAIN;
 655
 656         /*
 657          * Schedule in siblings as one group (if any):
 658          */
 659         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 660                 if (event_sched_in(event, cpuctx, ctx)) {
 661                         partial_group = event;
 662                         goto group_error;
 663                 }
 664         }
 665
 666         return 0;
 667
 668 group_error:
 669         /*
 670          * Groups can be scheduled in as one unit only, so undo any
 671          * partial group before returning:
 672          */
 673         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 674                 if (event == partial_group)
 675                         break;
 676                 event_sched_out(event, cpuctx, ctx);
 677         }
 678         event_sched_out(group_event, cpuctx, ctx);
 679
 680         return -EAGAIN;
 681 }
 682
 683 /*
 684  * Work out whether we can put this event group on the CPU now.
 685  */
 686 static int group_can_go_on(struct perf_event *event,
 687                            struct perf_cpu_context *cpuctx,
 688                            int can_add_hw)
 689 {
 690         /*
 691          * Groups consisting entirely of software events can always go on.
 692          */
 693         if (event->group_flags & PERF_GROUP_SOFTWARE)
 694                 return 1;
 695         /*
 696          * If an exclusive group is already on, no other hardware
 697          * events can go on.
 698          */
 699         if (cpuctx->exclusive)
 700                 return 0;
 701         /*
 702          * If this group is exclusive and there are already
 703          * events on the CPU, it can't go on.
 704          */
 705         if (event->attr.exclusive && cpuctx->active_oncpu)
 706                 return 0;
 707         /*
 708          * Otherwise, try to add it if all previous groups were able
 709          * to go on.
 710          */
 711         return can_add_hw;
 712 }
 713
 714 static void add_event_to_ctx(struct perf_event *event,
 715                                struct perf_event_context *ctx)
 716 {
 717         list_add_event(event, ctx);
 718         event->tstamp_enabled = ctx->time;
 719         event->tstamp_running = ctx->time;
 720         event->tstamp_stopped = ctx->time;
 721 }
 722
 723 /*
 724  * Cross CPU call to install and enable a performance event
 725  *
 726  * Must be called with ctx->mutex held
 727  */
 728 static void __perf_install_in_context(void *info)
 729 {
 730         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 731         struct perf_event *event = info;
 732         struct perf_event_context *ctx = event->ctx;
 733         struct perf_event *leader = event->group_leader;
 734         int err;
 735
 736         /*
 737          * If this is a task context, we need to check whether it is
 738          * the current task context of this cpu. If not it has been
 739          * scheduled out before the smp call arrived.
 740          * Or possibly this is the right context but it isn't
 741          * on this cpu because it had no events.
 742          */
 743         if (ctx->task && cpuctx->task_ctx != ctx) {
 744                 if (cpuctx->task_ctx || ctx->task != current)
 745                         return;
 746                 cpuctx->task_ctx = ctx;
 747         }
 748
 749         raw_spin_lock(&ctx->lock);
 750         ctx->is_active = 1;
 751         update_context_time(ctx);
 752
 753         /*
 754          * Protect the list operation against NMI by disabling the
 755          * events on a global level. NOP for non NMI based events.
 756          */
 757         perf_disable();
 758
 759         add_event_to_ctx(event, ctx);
 760
 761         if (event->cpu != -1 && event->cpu != smp_processor_id())
 762                 goto unlock;
 763
 764         /*
 765          * Don't put the event on if it is disabled or if
 766          * it is in a group and the group isn't on.
 767          */
 768         if (event->state != PERF_EVENT_STATE_INACTIVE ||
 769             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
 770                 goto unlock;
 771
 772         /*
 773          * An exclusive event can't go on if there are already active
 774          * hardware events, and no hardware event can go on if there
 775          * is already an exclusive event on.
 776          */
 777         if (!group_can_go_on(event, cpuctx, 1))
 778                 err = -EEXIST;
 779         else
 780                 err = event_sched_in(event, cpuctx, ctx);
 781
 782         if (err) {
 783                 /*
 784                  * This event couldn't go on.  If it is in a group
 785                  * then we have to pull the whole group off.
 786                  * If the event group is pinned then put it in error state.
 787                  */
 788                 if (leader != event)
 789                         group_sched_out(leader, cpuctx, ctx);
 790                 if (leader->attr.pinned) {
 791                         update_group_times(leader);
 792                         leader->state = PERF_EVENT_STATE_ERROR;
 793                 }
 794         }
 795
 796         if (!err && !ctx->task && cpuctx->max_pertask)
 797                 cpuctx->max_pertask--;
 798
 799  unlock:
 800         perf_enable();
 801
 802         raw_spin_unlock(&ctx->lock);
 803 }
 804
 805 /*
 806  * Attach a performance event to a context
 807  *
 808  * First we add the event to the list with the hardware enable bit
 809  * in event->hw_config cleared.
 810  *
 811  * If the event is attached to a task which is on a CPU we use a smp
 812  * call to enable it in the task context. The task might have been
 813  * scheduled away, but we check this in the smp call again.
 814  *
 815  * Must be called with ctx->mutex held.
 816  */
 817 static void
 818 perf_install_in_context(struct perf_event_context *ctx,
 819                         struct perf_event *event,
 820                         int cpu)
 821 {
 822         struct task_struct *task = ctx->task;
 823
 824         if (!task) {
 825                 /*
 826                  * Per cpu events are installed via an smp call and
 827                  * the install is always successful.
 828                  */
 829                 smp_call_function_single(cpu, __perf_install_in_context,
 830                                          event, 1);
 831                 return;
 832         }
 833
 834 retry:
 835         task_oncpu_function_call(task, __perf_install_in_context,
 836                                  event);
 837
 838         raw_spin_lock_irq(&ctx->lock);
 839         /*
 840          * we need to retry the smp call.
 841          */
 842         if (ctx->is_active && list_empty(&event->group_entry)) {
 843                 raw_spin_unlock_irq(&ctx->lock);
 844                 goto retry;
 845         }
 846
 847         /*
 848          * The lock prevents that this context is scheduled in so we
 849          * can add the event safely, if it the call above did not
 850          * succeed.
 851          */
 852         if (list_empty(&event->group_entry))
 853                 add_event_to_ctx(event, ctx);
 854         raw_spin_unlock_irq(&ctx->lock);
 855 }
 856
 857 /*
 858  * Put a event into inactive state and update time fields.
 859  * Enabling the leader of a group effectively enables all
 860  * the group members that aren't explicitly disabled, so we
 861  * have to update their ->tstamp_enabled also.
 862  * Note: this works for group members as well as group leaders
 863  * since the non-leader members' sibling_lists will be empty.
 864  */
 865 static void __perf_event_mark_enabled(struct perf_event *event,
 866                                         struct perf_event_context *ctx)
 867 {
 868         struct perf_event *sub;
 869
 870         event->state = PERF_EVENT_STATE_INACTIVE;
 871         event->tstamp_enabled = ctx->time - event->total_time_enabled;
 872         list_for_each_entry(sub, &event->sibling_list, group_entry)
 873                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 874                         sub->tstamp_enabled =
 875                                 ctx->time - sub->total_time_enabled;
 876 }
 877
 878 /*
 879  * Cross CPU call to enable a performance event
 880  */
 881 static void __perf_event_enable(void *info)
 882 {
 883         struct perf_event *event = info;
 884         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 885         struct perf_event_context *ctx = event->ctx;
 886         struct perf_event *leader = event->group_leader;
 887         int err;
 888
 889         /*
 890          * If this is a per-task event, need to check whether this
 891          * event's task is the current task on this cpu.
 892          */
 893         if (ctx->task && cpuctx->task_ctx != ctx) {
 894                 if (cpuctx->task_ctx || ctx->task != current)
 895                         return;
 896                 cpuctx->task_ctx = ctx;
 897         }
 898
 899         raw_spin_lock(&ctx->lock);
 900         ctx->is_active = 1;
 901         update_context_time(ctx);
 902
 903         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 904                 goto unlock;
 905         __perf_event_mark_enabled(event, ctx);
 906
 907         if (event->cpu != -1 && event->cpu != smp_processor_id())
 908                 goto unlock;
 909
 910         /*
 911          * If the event is in a group and isn't the group leader,
 912          * then don't put it on unless the group is on.
 913          */
 914         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 915                 goto unlock;
 916
 917         if (!group_can_go_on(event, cpuctx, 1)) {
 918                 err = -EEXIST;
 919         } else {
 920                 perf_disable();
 921                 if (event == leader)
 922                         err = group_sched_in(event, cpuctx, ctx);
 923                 else
 924                         err = event_sched_in(event, cpuctx, ctx);
 925                 perf_enable();
 926         }
 927
 928         if (err) {
 929                 /*
 930                  * If this event can't go on and it's part of a
 931                  * group, then the whole group has to come off.
 932                  */
 933                 if (leader != event)
 934                         group_sched_out(leader, cpuctx, ctx);
 935                 if (leader->attr.pinned) {
 936                         update_group_times(leader);
 937                         leader->state = PERF_EVENT_STATE_ERROR;
 938                 }
 939         }
 940
 941  unlock:
 942         raw_spin_unlock(&ctx->lock);
 943 }
 944
 945 /*
 946  * Enable a event.
 947  *
 948  * If event->ctx is a cloned context, callers must make sure that
 949  * every task struct that event->ctx->task could possibly point to
 950  * remains valid.  This condition is satisfied when called through
 951  * perf_event_for_each_child or perf_event_for_each as described
 952  * for perf_event_disable.
 953  */
 954 void perf_event_enable(struct perf_event *event)
 955 {
 956         struct perf_event_context *ctx = event->ctx;
 957         struct task_struct *task = ctx->task;
 958
 959         if (!task) {
 960                 /*
 961                  * Enable the event on the cpu that it's on
 962                  */
 963                 smp_call_function_single(event->cpu, __perf_event_enable,
 964                                          event, 1);
 965                 return;
 966         }
 967
 968         raw_spin_lock_irq(&ctx->lock);
 969         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 970                 goto out;
 971
 972         /*
 973          * If the event is in error state, clear that first.
 974          * That way, if we see the event in error state below, we
 975          * know that it has gone back into error state, as distinct
 976          * from the task having been scheduled away before the
 977          * cross-call arrived.
 978          */
 979         if (event->state == PERF_EVENT_STATE_ERROR)
 980                 event->state = PERF_EVENT_STATE_OFF;
 981
 982  retry:
 983         raw_spin_unlock_irq(&ctx->lock);
 984         task_oncpu_function_call(task, __perf_event_enable, event);
 985
 986         raw_spin_lock_irq(&ctx->lock);
 987
 988         /*
 989          * If the context is active and the event is still off,
 990          * we need to retry the cross-call.
 991          */
 992         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
 993                 goto retry;
 994
 995         /*
 996          * Since we have the lock this context can't be scheduled
 997          * in, so we can change the state safely.
 998          */
 999         if (event->state == PERF_EVENT_STATE_OFF)
1000                 __perf_event_mark_enabled(event, ctx);
1001
1002  out:
1003         raw_spin_unlock_irq(&ctx->lock);
1004 }
1005
1006 static int perf_event_refresh(struct perf_event *event, int refresh)
1007 {
1008         /*
1009          * not supported on inherited events
1010          */
1011         if (event->attr.inherit)
1012                 return -EINVAL;
1013
1014         atomic_add(refresh, &event->event_limit);
1015         perf_event_enable(event);
1016
1017         return 0;
1018 }
1019
1020 enum event_type_t {
1021         EVENT_FLEXIBLE = 0x1,
1022         EVENT_PINNED = 0x2,
1023         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1024 };
1025
1026 static void ctx_sched_out(struct perf_event_context *ctx,
1027                           struct perf_cpu_context *cpuctx,
1028                           enum event_type_t event_type)
1029 {
1030         struct perf_event *event;
1031
1032         raw_spin_lock(&ctx->lock);
1033         ctx->is_active = 0;
1034         if (likely(!ctx->nr_events))
1035                 goto out;
1036         update_context_time(ctx);
1037
1038         perf_disable();
1039         if (!ctx->nr_active)
1040                 goto out_enable;
1041
1042         if (event_type & EVENT_PINNED)
1043                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1044                         group_sched_out(event, cpuctx, ctx);
1045
1046         if (event_type & EVENT_FLEXIBLE)
1047                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1048                         group_sched_out(event, cpuctx, ctx);
1049
1050  out_enable:
1051         perf_enable();
1052  out:
1053         raw_spin_unlock(&ctx->lock);
1054 }
1055
1056 /*
1057  * Test whether two contexts are equivalent, i.e. whether they
1058  * have both been cloned from the same version of the same context
1059  * and they both have the same number of enabled events.
1060  * If the number of enabled events is the same, then the set
1061  * of enabled events should be the same, because these are both
1062  * inherited contexts, therefore we can't access individual events
1063  * in them directly with an fd; we can only enable/disable all
1064  * events via prctl, or enable/disable all events in a family
1065  * via ioctl, which will have the same effect on both contexts.
1066  */
1067 static int context_equiv(struct perf_event_context *ctx1,
1068                          struct perf_event_context *ctx2)
1069 {
1070         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1071                 && ctx1->parent_gen == ctx2->parent_gen
1072                 && !ctx1->pin_count && !ctx2->pin_count;
1073 }
1074
1075 static void __perf_event_sync_stat(struct perf_event *event,
1076                                      struct perf_event *next_event)
1077 {
1078         u64 value;
1079
1080         if (!event->attr.inherit_stat)
1081                 return;
1082
1083         /*
1084          * Update the event value, we cannot use perf_event_read()
1085          * because we're in the middle of a context switch and have IRQs
1086          * disabled, which upsets smp_call_function_single(), however
1087          * we know the event must be on the current CPU, therefore we
1088          * don't need to use it.
1089          */
1090         switch (event->state) {
1091         case PERF_EVENT_STATE_ACTIVE:
1092                 event->pmu->read(event);
1093                 /* fall-through */
1094
1095         case PERF_EVENT_STATE_INACTIVE:
1096                 update_event_times(event);
1097                 break;
1098
1099         default:
1100                 break;
1101         }
1102
1103         /*
1104          * In order to keep per-task stats reliable we need to flip the event
1105          * values when we flip the contexts.
1106          */
1107         value = atomic64_read(&next_event->count);
1108         value = atomic64_xchg(&event->count, value);
1109         atomic64_set(&next_event->count, value);
1110
1111         swap(event->total_time_enabled, next_event->total_time_enabled);
1112         swap(event->total_time_running, next_event->total_time_running);
1113
1114         /*
1115          * Since we swizzled the values, update the user visible data too.
1116          */
1117         perf_event_update_userpage(event);
1118         perf_event_update_userpage(next_event);
1119 }
1120
1121 #define list_next_entry(pos, member) \
1122         list_entry(pos->member.next, typeof(*pos), member)
1123
1124 static void perf_event_sync_stat(struct perf_event_context *ctx,
1125                                    struct perf_event_context *next_ctx)
1126 {
1127         struct perf_event *event, *next_event;
1128
1129         if (!ctx->nr_stat)
1130                 return;
1131
1132         update_context_time(ctx);
1133
1134         event = list_first_entry(&ctx->event_list,
1135                                    struct perf_event, event_entry);
1136
1137         next_event = list_first_entry(&next_ctx->event_list,
1138                                         struct perf_event, event_entry);
1139
1140         while (&event->event_entry != &ctx->event_list &&
1141                &next_event->event_entry != &next_ctx->event_list) {
1142
1143                 __perf_event_sync_stat(event, next_event);
1144
1145                 event = list_next_entry(event, event_entry);
1146                 next_event = list_next_entry(next_event, event_entry);
1147         }
1148 }
1149
1150 /*
1151  * Called from scheduler to remove the events of the current task,
1152  * with interrupts disabled.
1153  *
1154  * We stop each event and update the event value in event->count.
1155  *
1156  * This does not protect us against NMI, but disable()
1157  * sets the disabled bit in the control field of event _before_
1158  * accessing the event control register. If a NMI hits, then it will
1159  * not restart the event.
1160  */
1161 void perf_event_task_sched_out(struct task_struct *task,
1162                                  struct task_struct *next)
1163 {
1164         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1165         struct perf_event_context *ctx = task->perf_event_ctxp;
1166         struct perf_event_context *next_ctx;
1167         struct perf_event_context *parent;
1168         struct pt_regs *regs;
1169         int do_switch = 1;
1170
1171         regs = task_pt_regs(task);
1172         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1173
1174         if (likely(!ctx || !cpuctx->task_ctx))
1175                 return;
1176
1177         rcu_read_lock();
1178         parent = rcu_dereference(ctx->parent_ctx);
1179         next_ctx = next->perf_event_ctxp;
1180         if (parent && next_ctx &&
1181             rcu_dereference(next_ctx->parent_ctx) == parent) {
1182                 /*
1183                  * Looks like the two contexts are clones, so we might be
1184                  * able to optimize the context switch.  We lock both
1185                  * contexts and check that they are clones under the
1186                  * lock (including re-checking that neither has been
1187                  * uncloned in the meantime).  It doesn't matter which
1188                  * order we take the locks because no other cpu could
1189                  * be trying to lock both of these tasks.
1190                  */
1191                 raw_spin_lock(&ctx->lock);
1192                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1193                 if (context_equiv(ctx, next_ctx)) {
1194                         /*
1195                          * XXX do we need a memory barrier of sorts
1196                          * wrt to rcu_dereference() of perf_event_ctxp
1197                          */
1198                         task->perf_event_ctxp = next_ctx;
1199                         next->perf_event_ctxp = ctx;
1200                         ctx->task = next;
1201                         next_ctx->task = task;
1202                         do_switch = 0;
1203
1204                         perf_event_sync_stat(ctx, next_ctx);
1205                 }
1206                 raw_spin_unlock(&next_ctx->lock);
1207                 raw_spin_unlock(&ctx->lock);
1208         }
1209         rcu_read_unlock();
1210
1211         if (do_switch) {
1212                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1213                 cpuctx->task_ctx = NULL;
1214         }
1215 }
1216
1217 static void task_ctx_sched_out(struct perf_event_context *ctx,
1218                                enum event_type_t event_type)
1219 {
1220         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1221
1222         if (!cpuctx->task_ctx)
1223                 return;
1224
1225         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1226                 return;
1227
1228         ctx_sched_out(ctx, cpuctx, event_type);
1229         cpuctx->task_ctx = NULL;
1230 }
1231
1232 /*
1233  * Called with IRQs disabled
1234  */
1235 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1236 {
1237         task_ctx_sched_out(ctx, EVENT_ALL);
1238 }
1239
1240 /*
1241  * Called with IRQs disabled
1242  */
1243 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1244                               enum event_type_t event_type)
1245 {
1246         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1247 }
1248
1249 static void
1250 ctx_pinned_sched_in(struct perf_event_context *ctx,
1251                     struct perf_cpu_context *cpuctx)
1252 {
1253         struct perf_event *event;
1254
1255         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1256                 if (event->state <= PERF_EVENT_STATE_OFF)
1257                         continue;
1258                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1259                         continue;
1260
1261                 if (group_can_go_on(event, cpuctx, 1))
1262                         group_sched_in(event, cpuctx, ctx);
1263
1264                 /*
1265                  * If this pinned group hasn't been scheduled,
1266                  * put it in error state.
1267                  */
1268                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1269                         update_group_times(event);
1270                         event->state = PERF_EVENT_STATE_ERROR;
1271                 }
1272         }
1273 }
1274
1275 static void
1276 ctx_flexible_sched_in(struct perf_event_context *ctx,
1277                       struct perf_cpu_context *cpuctx)
1278 {
1279         struct perf_event *event;
1280         int can_add_hw = 1;
1281
1282         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1283                 /* Ignore events in OFF or ERROR state */
1284                 if (event->state <= PERF_EVENT_STATE_OFF)
1285                         continue;
1286                 /*
1287                  * Listen to the 'cpu' scheduling filter constraint
1288                  * of events:
1289                  */
1290                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1291                         continue;
1292
1293                 if (group_can_go_on(event, cpuctx, can_add_hw))
1294                         if (group_sched_in(event, cpuctx, ctx))
1295                                 can_add_hw = 0;
1296         }
1297 }
1298
1299 static void
1300 ctx_sched_in(struct perf_event_context *ctx,
1301              struct perf_cpu_context *cpuctx,
1302              enum event_type_t event_type)
1303 {
1304         raw_spin_lock(&ctx->lock);
1305         ctx->is_active = 1;
1306         if (likely(!ctx->nr_events))
1307                 goto out;
1308
1309         ctx->timestamp = perf_clock();
1310
1311         perf_disable();
1312
1313         /*
1314          * First go through the list and put on any pinned groups
1315          * in order to give them the best chance of going on.
1316          */
1317         if (event_type & EVENT_PINNED)
1318                 ctx_pinned_sched_in(ctx, cpuctx);
1319
1320         /* Then walk through the lower prio flexible groups */
1321         if (event_type & EVENT_FLEXIBLE)
1322                 ctx_flexible_sched_in(ctx, cpuctx);
1323
1324         perf_enable();
1325  out:
1326         raw_spin_unlock(&ctx->lock);
1327 }
1328
1329 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1330                              enum event_type_t event_type)
1331 {
1332         struct perf_event_context *ctx = &cpuctx->ctx;
1333
1334         ctx_sched_in(ctx, cpuctx, event_type);
1335 }
1336
1337 static void task_ctx_sched_in(struct task_struct *task,
1338                               enum event_type_t event_type)
1339 {
1340         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1341         struct perf_event_context *ctx = task->perf_event_ctxp;
1342
1343         if (likely(!ctx))
1344                 return;
1345         if (cpuctx->task_ctx == ctx)
1346                 return;
1347         ctx_sched_in(ctx, cpuctx, event_type);
1348         cpuctx->task_ctx = ctx;
1349 }
1350 /*
1351  * Called from scheduler to add the events of the current task
1352  * with interrupts disabled.
1353  *
1354  * We restore the event value and then enable it.
1355  *
1356  * This does not protect us against NMI, but enable()
1357  * sets the enabled bit in the control field of event _before_
1358  * accessing the event control register. If a NMI hits, then it will
1359  * keep the event running.
1360  */
1361 void perf_event_task_sched_in(struct task_struct *task)
1362 {
1363         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1364         struct perf_event_context *ctx = task->perf_event_ctxp;
1365
1366         if (likely(!ctx))
1367                 return;
1368
1369         if (cpuctx->task_ctx == ctx)
1370                 return;
1371
1372         /*
1373          * We want to keep the following priority order:
1374          * cpu pinned (that don't need to move), task pinned,
1375          * cpu flexible, task flexible.
1376          */
1377         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1378
1379         ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1380         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1381         ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1382
1383         cpuctx->task_ctx = ctx;
1384 }
1385
1386 #define MAX_INTERRUPTS (~0ULL)
1387
1388 static void perf_log_throttle(struct perf_event *event, int enable);
1389
1390 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1391 {
1392         u64 frequency = event->attr.sample_freq;
1393         u64 sec = NSEC_PER_SEC;
1394         u64 divisor, dividend;
1395
1396         int count_fls, nsec_fls, frequency_fls, sec_fls;
1397
1398         count_fls = fls64(count);
1399         nsec_fls = fls64(nsec);
1400         frequency_fls = fls64(frequency);
1401         sec_fls = 30;
1402
1403         /*
1404          * We got @count in @nsec, with a target of sample_freq HZ
1405          * the target period becomes:
1406          *
1407          *             @count * 10^9
1408          * period = -------------------
1409          *          @nsec * sample_freq
1410          *
1411          */
1412
1413         /*
1414          * Reduce accuracy by one bit such that @a and @b converge
1415          * to a similar magnitude.
1416          */
1417 #define REDUCE_FLS(a, b)                \
1418 do {                                    \
1419         if (a##_fls > b##_fls) {        \
1420                 a >>= 1;                \
1421                 a##_fls--;              \
1422         } else {                        \
1423                 b >>= 1;                \
1424                 b##_fls--;              \
1425         }                               \
1426 } while (0)
1427
1428         /*
1429          * Reduce accuracy until either term fits in a u64, then proceed with
1430          * the other, so that finally we can do a u64/u64 division.
1431          */
1432         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1433                 REDUCE_FLS(nsec, frequency);
1434                 REDUCE_FLS(sec, count);
1435         }
1436
1437         if (count_fls + sec_fls > 64) {
1438                 divisor = nsec * frequency;
1439
1440                 while (count_fls + sec_fls > 64) {
1441                         REDUCE_FLS(count, sec);
1442                         divisor >>= 1;
1443                 }
1444
1445                 dividend = count * sec;
1446         } else {
1447                 dividend = count * sec;
1448
1449                 while (nsec_fls + frequency_fls > 64) {
1450                         REDUCE_FLS(nsec, frequency);
1451                         dividend >>= 1;
1452                 }
1453
1454                 divisor = nsec * frequency;
1455         }
1456
1457         return div64_u64(dividend, divisor);
1458 }
1459
1460 static void perf_event_stop(struct perf_event *event)
1461 {
1462         if (!event->pmu->stop)
1463                 return event->pmu->disable(event);
1464
1465         return event->pmu->stop(event);
1466 }
1467
1468 static int perf_event_start(struct perf_event *event)
1469 {
1470         if (!event->pmu->start)
1471                 return event->pmu->enable(event);
1472
1473         return event->pmu->start(event);
1474 }
1475
1476 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1477 {
1478         struct hw_perf_event *hwc = &event->hw;
1479         u64 period, sample_period;
1480         s64 delta;
1481
1482         period = perf_calculate_period(event, nsec, count);
1483
1484         delta = (s64)(period - hwc->sample_period);
1485         delta = (delta + 7) / 8; /* low pass filter */
1486
1487         sample_period = hwc->sample_period + delta;
1488
1489         if (!sample_period)
1490                 sample_period = 1;
1491
1492         hwc->sample_period = sample_period;
1493
1494         if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1495                 perf_disable();
1496                 perf_event_stop(event);
1497                 atomic64_set(&hwc->period_left, 0);
1498                 perf_event_start(event);
1499                 perf_enable();
1500         }
1501 }
1502
1503 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1504 {
1505         struct perf_event *event;
1506         struct hw_perf_event *hwc;
1507         u64 interrupts, now;
1508         s64 delta;
1509
1510         raw_spin_lock(&ctx->lock);
1511         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1512                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1513                         continue;
1514
1515                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1516                         continue;
1517
1518                 hwc = &event->hw;
1519
1520                 interrupts = hwc->interrupts;
1521                 hwc->interrupts = 0;
1522
1523                 /*
1524                  * unthrottle events on the tick
1525                  */
1526                 if (interrupts == MAX_INTERRUPTS) {
1527                         perf_log_throttle(event, 1);
1528                         perf_disable();
1529                         event->pmu->unthrottle(event);
1530                         perf_enable();
1531                 }
1532
1533                 if (!event->attr.freq || !event->attr.sample_freq)
1534                         continue;
1535
1536                 perf_disable();
1537                 event->pmu->read(event);
1538                 now = atomic64_read(&event->count);
1539                 delta = now - hwc->freq_count_stamp;
1540                 hwc->freq_count_stamp = now;
1541
1542                 if (delta > 0)
1543                         perf_adjust_period(event, TICK_NSEC, delta);
1544                 perf_enable();
1545         }
1546         raw_spin_unlock(&ctx->lock);
1547 }
1548
1549 /*
1550  * Round-robin a context's events:
1551  */
1552 static void rotate_ctx(struct perf_event_context *ctx)
1553 {
1554         raw_spin_lock(&ctx->lock);
1555
1556         /* Rotate the first entry last of non-pinned groups */
1557         list_rotate_left(&ctx->flexible_groups);
1558
1559         raw_spin_unlock(&ctx->lock);
1560 }
1561
1562 void perf_event_task_tick(struct task_struct *curr)
1563 {
1564         struct perf_cpu_context *cpuctx;
1565         struct perf_event_context *ctx;
1566         int rotate = 0;
1567
1568         if (!atomic_read(&nr_events))
1569                 return;
1570
1571         cpuctx = &__get_cpu_var(perf_cpu_context);
1572         if (cpuctx->ctx.nr_events &&
1573             cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1574                 rotate = 1;
1575
1576         ctx = curr->perf_event_ctxp;
1577         if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1578                 rotate = 1;
1579
1580         perf_ctx_adjust_freq(&cpuctx->ctx);
1581         if (ctx)
1582                 perf_ctx_adjust_freq(ctx);
1583
1584         if (!rotate)
1585                 return;
1586
1587         perf_disable();
1588         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1589         if (ctx)
1590                 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1591
1592         rotate_ctx(&cpuctx->ctx);
1593         if (ctx)
1594                 rotate_ctx(ctx);
1595
1596         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1597         if (ctx)
1598                 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1599         perf_enable();
1600 }
1601
1602 static int event_enable_on_exec(struct perf_event *event,
1603                                 struct perf_event_context *ctx)
1604 {
1605         if (!event->attr.enable_on_exec)
1606                 return 0;
1607
1608         event->attr.enable_on_exec = 0;
1609         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1610                 return 0;
1611
1612         __perf_event_mark_enabled(event, ctx);
1613
1614         return 1;
1615 }
1616
1617 /*
1618  * Enable all of a task's events that have been marked enable-on-exec.
1619  * This expects task == current.
1620  */
1621 static void perf_event_enable_on_exec(struct task_struct *task)
1622 {
1623         struct perf_event_context *ctx;
1624         struct perf_event *event;
1625         unsigned long flags;
1626         int enabled = 0;
1627         int ret;
1628
1629         local_irq_save(flags);
1630         ctx = task->perf_event_ctxp;
1631         if (!ctx || !ctx->nr_events)
1632                 goto out;
1633
1634         __perf_event_task_sched_out(ctx);
1635
1636         raw_spin_lock(&ctx->lock);
1637
1638         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1639                 ret = event_enable_on_exec(event, ctx);
1640                 if (ret)
1641                         enabled = 1;
1642         }
1643
1644         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1645                 ret = event_enable_on_exec(event, ctx);
1646                 if (ret)
1647                         enabled = 1;
1648         }
1649
1650         /*
1651          * Unclone this context if we enabled any event.
1652          */
1653         if (enabled)
1654                 unclone_ctx(ctx);
1655
1656         raw_spin_unlock(&ctx->lock);
1657
1658         perf_event_task_sched_in(task);
1659  out:
1660         local_irq_restore(flags);
1661 }
1662
1663 /*
1664  * Cross CPU call to read the hardware event
1665  */
1666 static void __perf_event_read(void *info)
1667 {
1668         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1669         struct perf_event *event = info;
1670         struct perf_event_context *ctx = event->ctx;
1671
1672         /*
1673          * If this is a task context, we need to check whether it is
1674          * the current task context of this cpu.  If not it has been
1675          * scheduled out before the smp call arrived.  In that case
1676          * event->count would have been updated to a recent sample
1677          * when the event was scheduled out.
1678          */
1679         if (ctx->task && cpuctx->task_ctx != ctx)
1680                 return;
1681
1682         raw_spin_lock(&ctx->lock);
1683         update_context_time(ctx);
1684         update_event_times(event);
1685         raw_spin_unlock(&ctx->lock);
1686
1687         event->pmu->read(event);
1688 }
1689
1690 static u64 perf_event_read(struct perf_event *event)
1691 {
1692         /*
1693          * If event is enabled and currently active on a CPU, update the
1694          * value in the event structure:
1695          */
1696         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1697                 smp_call_function_single(event->oncpu,
1698                                          __perf_event_read, event, 1);
1699         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1700                 struct perf_event_context *ctx = event->ctx;
1701                 unsigned long flags;
1702
1703                 raw_spin_lock_irqsave(&ctx->lock, flags);
1704                 update_context_time(ctx);
1705                 update_event_times(event);
1706                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1707         }
1708
1709         return atomic64_read(&event->count);
1710 }
1711
1712 /*
1713  * Initialize the perf_event context in a task_struct:
1714  */
1715 static void
1716 __perf_event_init_context(struct perf_event_context *ctx,
1717                             struct task_struct *task)
1718 {
1719         raw_spin_lock_init(&ctx->lock);
1720         mutex_init(&ctx->mutex);
1721         INIT_LIST_HEAD(&ctx->pinned_groups);
1722         INIT_LIST_HEAD(&ctx->flexible_groups);
1723         INIT_LIST_HEAD(&ctx->event_list);
1724         atomic_set(&ctx->refcount, 1);
1725         ctx->task = task;
1726 }
1727
1728 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1729 {
1730         struct perf_event_context *ctx;
1731         struct perf_cpu_context *cpuctx;
1732         struct task_struct *task;
1733         unsigned long flags;
1734         int err;
1735
1736         if (pid == -1 && cpu != -1) {
1737                 /* Must be root to operate on a CPU event: */
1738                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1739                         return ERR_PTR(-EACCES);
1740
1741                 if (cpu < 0 || cpu >= nr_cpumask_bits)
1742                         return ERR_PTR(-EINVAL);
1743
1744                 /*
1745                  * We could be clever and allow to attach a event to an
1746                  * offline CPU and activate it when the CPU comes up, but
1747                  * that's for later.
1748                  */
1749                 if (!cpu_online(cpu))
1750                         return ERR_PTR(-ENODEV);
1751
1752                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1753                 ctx = &cpuctx->ctx;
1754                 get_ctx(ctx);
1755
1756                 return ctx;
1757         }
1758
1759         rcu_read_lock();
1760         if (!pid)
1761                 task = current;
1762         else
1763                 task = find_task_by_vpid(pid);
1764         if (task)
1765                 get_task_struct(task);
1766         rcu_read_unlock();
1767
1768         if (!task)
1769                 return ERR_PTR(-ESRCH);
1770
1771         /*
1772          * Can't attach events to a dying task.
1773          */
1774         err = -ESRCH;
1775         if (task->flags & PF_EXITING)
1776                 goto errout;
1777
1778         /* Reuse ptrace permission checks for now. */
1779         err = -EACCES;
1780         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1781                 goto errout;
1782
1783  retry:
1784         ctx = perf_lock_task_context(task, &flags);
1785         if (ctx) {
1786                 unclone_ctx(ctx);
1787                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1788         }
1789
1790         if (!ctx) {
1791                 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1792                 err = -ENOMEM;
1793                 if (!ctx)
1794                         goto errout;
1795                 __perf_event_init_context(ctx, task);
1796                 get_ctx(ctx);
1797                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1798                         /*
1799                          * We raced with some other task; use
1800                          * the context they set.
1801                          */
1802                         kfree(ctx);
1803                         goto retry;
1804                 }
1805                 get_task_struct(task);
1806         }
1807
1808         put_task_struct(task);
1809         return ctx;
1810
1811  errout:
1812         put_task_struct(task);
1813         return ERR_PTR(err);
1814 }
1815
1816 static void perf_event_free_filter(struct perf_event *event);
1817
1818 static void free_event_rcu(struct rcu_head *head)
1819 {
1820         struct perf_event *event;
1821
1822         event = container_of(head, struct perf_event, rcu_head);
1823         if (event->ns)
1824                 put_pid_ns(event->ns);
1825         perf_event_free_filter(event);
1826         kfree(event);
1827 }
1828
1829 static void perf_pending_sync(struct perf_event *event);
1830
1831 static void free_event(struct perf_event *event)
1832 {
1833         perf_pending_sync(event);
1834
1835         if (!event->parent) {
1836                 atomic_dec(&nr_events);
1837                 if (event->attr.mmap)
1838                         atomic_dec(&nr_mmap_events);
1839                 if (event->attr.comm)
1840                         atomic_dec(&nr_comm_events);
1841                 if (event->attr.task)
1842                         atomic_dec(&nr_task_events);
1843         }
1844
1845         if (event->output) {
1846                 fput(event->output->filp);
1847                 event->output = NULL;
1848         }
1849
1850         if (event->destroy)
1851                 event->destroy(event);
1852
1853         put_ctx(event->ctx);
1854         call_rcu(&event->rcu_head, free_event_rcu);
1855 }
1856
1857 int perf_event_release_kernel(struct perf_event *event)
1858 {
1859         struct perf_event_context *ctx = event->ctx;
1860
1861         WARN_ON_ONCE(ctx->parent_ctx);
1862         mutex_lock(&ctx->mutex);
1863         perf_event_remove_from_context(event);
1864         mutex_unlock(&ctx->mutex);
1865
1866         mutex_lock(&event->owner->perf_event_mutex);
1867         list_del_init(&event->owner_entry);
1868         mutex_unlock(&event->owner->perf_event_mutex);
1869         put_task_struct(event->owner);
1870
1871         free_event(event);
1872
1873         return 0;
1874 }
1875 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1876
1877 /*
1878  * Called when the last reference to the file is gone.
1879  */
1880 static int perf_release(struct inode *inode, struct file *file)
1881 {
1882         struct perf_event *event = file->private_data;
1883
1884         file->private_data = NULL;
1885
1886         return perf_event_release_kernel(event);
1887 }
1888
1889 static int perf_event_read_size(struct perf_event *event)
1890 {
1891         int entry = sizeof(u64); /* value */
1892         int size = 0;
1893         int nr = 1;
1894
1895         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1896                 size += sizeof(u64);
1897
1898         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1899                 size += sizeof(u64);
1900
1901         if (event->attr.read_format & PERF_FORMAT_ID)
1902                 entry += sizeof(u64);
1903
1904         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1905                 nr += event->group_leader->nr_siblings;
1906                 size += sizeof(u64);
1907         }
1908
1909         size += entry * nr;
1910
1911         return size;
1912 }
1913
1914 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1915 {
1916         struct perf_event *child;
1917         u64 total = 0;
1918
1919         *enabled = 0;
1920         *running = 0;
1921
1922         mutex_lock(&event->child_mutex);
1923         total += perf_event_read(event);
1924         *enabled += event->total_time_enabled +
1925                         atomic64_read(&event->child_total_time_enabled);
1926         *running += event->total_time_running +
1927                         atomic64_read(&event->child_total_time_running);
1928
1929         list_for_each_entry(child, &event->child_list, child_list) {
1930                 total += perf_event_read(child);
1931                 *enabled += child->total_time_enabled;
1932                 *running += child->total_time_running;
1933         }
1934         mutex_unlock(&event->child_mutex);
1935
1936         return total;
1937 }
1938 EXPORT_SYMBOL_GPL(perf_event_read_value);
1939
1940 static int perf_event_read_group(struct perf_event *event,
1941                                    u64 read_format, char __user *buf)
1942 {
1943         struct perf_event *leader = event->group_leader, *sub;
1944         int n = 0, size = 0, ret = -EFAULT;
1945         struct perf_event_context *ctx = leader->ctx;
1946         u64 values[5];
1947         u64 count, enabled, running;
1948
1949         mutex_lock(&ctx->mutex);
1950         count = perf_event_read_value(leader, &enabled, &running);
1951
1952         values[n++] = 1 + leader->nr_siblings;
1953         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1954                 values[n++] = enabled;
1955         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1956                 values[n++] = running;
1957         values[n++] = count;
1958         if (read_format & PERF_FORMAT_ID)
1959                 values[n++] = primary_event_id(leader);
1960
1961         size = n * sizeof(u64);
1962
1963         if (copy_to_user(buf, values, size))
1964                 goto unlock;
1965
1966         ret = size;
1967
1968         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1969                 n = 0;
1970
1971                 values[n++] = perf_event_read_value(sub, &enabled, &running);
1972                 if (read_format & PERF_FORMAT_ID)
1973                         values[n++] = primary_event_id(sub);
1974
1975                 size = n * sizeof(u64);
1976
1977                 if (copy_to_user(buf + ret, values, size)) {
1978                         ret = -EFAULT;
1979                         goto unlock;
1980                 }
1981
1982                 ret += size;
1983         }
1984 unlock:
1985         mutex_unlock(&ctx->mutex);
1986
1987         return ret;
1988 }
1989
1990 static int perf_event_read_one(struct perf_event *event,
1991                                  u64 read_format, char __user *buf)
1992 {
1993         u64 enabled, running;
1994         u64 values[4];
1995         int n = 0;
1996
1997         values[n++] = perf_event_read_value(event, &enabled, &running);
1998         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1999                 values[n++] = enabled;
2000         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2001                 values[n++] = running;
2002         if (read_format & PERF_FORMAT_ID)
2003                 values[n++] = primary_event_id(event);
2004
2005         if (copy_to_user(buf, values, n * sizeof(u64)))
2006                 return -EFAULT;
2007
2008         return n * sizeof(u64);
2009 }
2010
2011 /*
2012  * Read the performance event - simple non blocking version for now
2013  */
2014 static ssize_t
2015 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2016 {
2017         u64 read_format = event->attr.read_format;
2018         int ret;
2019
2020         /*
2021          * Return end-of-file for a read on a event that is in
2022          * error state (i.e. because it was pinned but it couldn't be
2023          * scheduled on to the CPU at some point).
2024          */
2025         if (event->state == PERF_EVENT_STATE_ERROR)
2026                 return 0;
2027
2028         if (count < perf_event_read_size(event))
2029                 return -ENOSPC;
2030
2031         WARN_ON_ONCE(event->ctx->parent_ctx);
2032         if (read_format & PERF_FORMAT_GROUP)
2033                 ret = perf_event_read_group(event, read_format, buf);
2034         else
2035                 ret = perf_event_read_one(event, read_format, buf);
2036
2037         return ret;
2038 }
2039
2040 static ssize_t
2041 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2042 {
2043         struct perf_event *event = file->private_data;
2044
2045         return perf_read_hw(event, buf, count);
2046 }
2047
2048 static unsigned int perf_poll(struct file *file, poll_table *wait)
2049 {
2050         struct perf_event *event = file->private_data;
2051         struct perf_mmap_data *data;
2052         unsigned int events = POLL_HUP;
2053
2054         rcu_read_lock();
2055         data = rcu_dereference(event->data);
2056         if (data)
2057                 events = atomic_xchg(&data->poll, 0);
2058         rcu_read_unlock();
2059
2060         poll_wait(file, &event->waitq, wait);
2061
2062         return events;
2063 }
2064
2065 static void perf_event_reset(struct perf_event *event)
2066 {
2067         (void)perf_event_read(event);
2068         atomic64_set(&event->count, 0);
2069         perf_event_update_userpage(event);
2070 }
2071
2072 /*
2073  * Holding the top-level event's child_mutex means that any
2074  * descendant process that has inherited this event will block
2075  * in sync_child_event if it goes to exit, thus satisfying the
2076  * task existence requirements of perf_event_enable/disable.
2077  */
2078 static void perf_event_for_each_child(struct perf_event *event,
2079                                         void (*func)(struct perf_event *))
2080 {
2081         struct perf_event *child;
2082
2083         WARN_ON_ONCE(event->ctx->parent_ctx);
2084         mutex_lock(&event->child_mutex);
2085         func(event);
2086         list_for_each_entry(child, &event->child_list, child_list)
2087                 func(child);
2088         mutex_unlock(&event->child_mutex);
2089 }
2090
2091 static void perf_event_for_each(struct perf_event *event,
2092                                   void (*func)(struct perf_event *))
2093 {
2094         struct perf_event_context *ctx = event->ctx;
2095         struct perf_event *sibling;
2096
2097         WARN_ON_ONCE(ctx->parent_ctx);
2098         mutex_lock(&ctx->mutex);
2099         event = event->group_leader;
2100
2101         perf_event_for_each_child(event, func);
2102         func(event);
2103         list_for_each_entry(sibling, &event->sibling_list, group_entry)
2104                 perf_event_for_each_child(event, func);
2105         mutex_unlock(&ctx->mutex);
2106 }
2107
2108 static int perf_event_period(struct perf_event *event, u64 __user *arg)
2109 {
2110         struct perf_event_context *ctx = event->ctx;
2111         unsigned long size;
2112         int ret = 0;
2113         u64 value;
2114
2115         if (!event->attr.sample_period)
2116                 return -EINVAL;
2117
2118         size = copy_from_user(&value, arg, sizeof(value));
2119         if (size != sizeof(value))
2120                 return -EFAULT;
2121
2122         if (!value)
2123                 return -EINVAL;
2124
2125         raw_spin_lock_irq(&ctx->lock);
2126         if (event->attr.freq) {
2127                 if (value > sysctl_perf_event_sample_rate) {
2128                         ret = -EINVAL;
2129                         goto unlock;
2130                 }
2131
2132                 event->attr.sample_freq = value;
2133         } else {
2134                 event->attr.sample_period = value;
2135                 event->hw.sample_period = value;
2136         }
2137 unlock:
2138         raw_spin_unlock_irq(&ctx->lock);
2139
2140         return ret;
2141 }
2142
2143 static int perf_event_set_output(struct perf_event *event, int output_fd);
2144 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2145
2146 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2147 {
2148         struct perf_event *event = file->private_data;
2149         void (*func)(struct perf_event *);
2150         u32 flags = arg;
2151
2152         switch (cmd) {
2153         case PERF_EVENT_IOC_ENABLE:
2154                 func = perf_event_enable;
2155                 break;
2156         case PERF_EVENT_IOC_DISABLE:
2157                 func = perf_event_disable;
2158                 break;
2159         case PERF_EVENT_IOC_RESET:
2160                 func = perf_event_reset;
2161                 break;
2162
2163         case PERF_EVENT_IOC_REFRESH:
2164                 return perf_event_refresh(event, arg);
2165
2166         case PERF_EVENT_IOC_PERIOD:
2167                 return perf_event_period(event, (u64 __user *)arg);
2168
2169         case PERF_EVENT_IOC_SET_OUTPUT:
2170                 return perf_event_set_output(event, arg);
2171
2172         case PERF_EVENT_IOC_SET_FILTER:
2173                 return perf_event_set_filter(event, (void __user *)arg);
2174
2175         default:
2176                 return -ENOTTY;
2177         }
2178
2179         if (flags & PERF_IOC_FLAG_GROUP)
2180                 perf_event_for_each(event, func);
2181         else
2182                 perf_event_for_each_child(event, func);
2183
2184         return 0;
2185 }
2186
2187 int perf_event_task_enable(void)
2188 {
2189         struct perf_event *event;
2190
2191         mutex_lock(&current->perf_event_mutex);
2192         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2193                 perf_event_for_each_child(event, perf_event_enable);
2194         mutex_unlock(&current->perf_event_mutex);
2195
2196         return 0;
2197 }
2198
2199 int perf_event_task_disable(void)
2200 {
2201         struct perf_event *event;
2202
2203         mutex_lock(&current->perf_event_mutex);
2204         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2205                 perf_event_for_each_child(event, perf_event_disable);
2206         mutex_unlock(&current->perf_event_mutex);
2207
2208         return 0;
2209 }
2210
2211 #ifndef PERF_EVENT_INDEX_OFFSET
2212 # define PERF_EVENT_INDEX_OFFSET 0
2213 #endif
2214
2215 static int perf_event_index(struct perf_event *event)
2216 {
2217         if (event->state != PERF_EVENT_STATE_ACTIVE)
2218                 return 0;
2219
2220         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2221 }
2222
2223 /*
2224  * Callers need to ensure there can be no nesting of this function, otherwise
2225  * the seqlock logic goes bad. We can not serialize this because the arch
2226  * code calls this from NMI context.
2227  */
2228 void perf_event_update_userpage(struct perf_event *event)
2229 {
2230         struct perf_event_mmap_page *userpg;
2231         struct perf_mmap_data *data;
2232
2233         rcu_read_lock();
2234         data = rcu_dereference(event->data);
2235         if (!data)
2236                 goto unlock;
2237
2238         userpg = data->user_page;
2239
2240         /*
2241          * Disable preemption so as to not let the corresponding user-space
2242          * spin too long if we get preempted.
2243          */
2244         preempt_disable();
2245         ++userpg->lock;
2246         barrier();
2247         userpg->index = perf_event_index(event);
2248         userpg->offset = atomic64_read(&event->count);
2249         if (event->state == PERF_EVENT_STATE_ACTIVE)
2250                 userpg->offset -= atomic64_read(&event->hw.prev_count);
2251
2252         userpg->time_enabled = event->total_time_enabled +
2253                         atomic64_read(&event->child_total_time_enabled);
2254
2255         userpg->time_running = event->total_time_running +
2256                         atomic64_read(&event->child_total_time_running);
2257
2258         barrier();
2259         ++userpg->lock;
2260         preempt_enable();
2261 unlock:
2262         rcu_read_unlock();
2263 }
2264
2265 static unsigned long perf_data_size(struct perf_mmap_data *data)
2266 {
2267         return data->nr_pages << (PAGE_SHIFT + data->data_order);
2268 }
2269
2270 #ifndef CONFIG_PERF_USE_VMALLOC
2271
2272 /*
2273  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2274  */
2275
2276 static struct page *
2277 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2278 {
2279         if (pgoff > data->nr_pages)
2280                 return NULL;
2281
2282         if (pgoff == 0)
2283                 return virt_to_page(data->user_page);
2284
2285         return virt_to_page(data->data_pages[pgoff - 1]);
2286 }
2287
2288 static struct perf_mmap_data *
2289 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2290 {
2291         struct perf_mmap_data *data;
2292         unsigned long size;
2293         int i;
2294
2295         WARN_ON(atomic_read(&event->mmap_count));
2296
2297         size = sizeof(struct perf_mmap_data);
2298         size += nr_pages * sizeof(void *);
2299
2300         data = kzalloc(size, GFP_KERNEL);
2301         if (!data)
2302                 goto fail;
2303
2304         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2305         if (!data->user_page)
2306                 goto fail_user_page;
2307
2308         for (i = 0; i < nr_pages; i++) {
2309                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2310                 if (!data->data_pages[i])
2311                         goto fail_data_pages;
2312         }
2313
2314         data->data_order = 0;
2315         data->nr_pages = nr_pages;
2316
2317         return data;
2318
2319 fail_data_pages:
2320         for (i--; i >= 0; i--)
2321                 free_page((unsigned long)data->data_pages[i]);
2322
2323         free_page((unsigned long)data->user_page);
2324
2325 fail_user_page:
2326         kfree(data);
2327
2328 fail:
2329         return NULL;
2330 }
2331
2332 static void perf_mmap_free_page(unsigned long addr)
2333 {
2334         struct page *page = virt_to_page((void *)addr);
2335
2336         page->mapping = NULL;
2337         __free_page(page);
2338 }
2339
2340 static void perf_mmap_data_free(struct perf_mmap_data *data)
2341 {
2342         int i;
2343
2344         perf_mmap_free_page((unsigned long)data->user_page);
2345         for (i = 0; i < data->nr_pages; i++)
2346                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2347         kfree(data);
2348 }
2349
2350 #else
2351
2352 /*
2353  * Back perf_mmap() with vmalloc memory.
2354  *
2355  * Required for architectures that have d-cache aliasing issues.
2356  */
2357
2358 static struct page *
2359 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2360 {
2361         if (pgoff > (1UL << data->data_order))
2362                 return NULL;
2363
2364         return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2365 }
2366
2367 static void perf_mmap_unmark_page(void *addr)
2368 {
2369         struct page *page = vmalloc_to_page(addr);
2370
2371         page->mapping = NULL;
2372 }
2373
2374 static void perf_mmap_data_free_work(struct work_struct *work)
2375 {
2376         struct perf_mmap_data *data;
2377         void *base;
2378         int i, nr;
2379
2380         data = container_of(work, struct perf_mmap_data, work);
2381         nr = 1 << data->data_order;
2382
2383         base = data->user_page;
2384         for (i = 0; i < nr + 1; i++)
2385                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2386
2387         vfree(base);
2388         kfree(data);
2389 }
2390
2391 static void perf_mmap_data_free(struct perf_mmap_data *data)
2392 {
2393         schedule_work(&data->work);
2394 }
2395
2396 static struct perf_mmap_data *
2397 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2398 {
2399         struct perf_mmap_data *data;
2400         unsigned long size;
2401         void *all_buf;
2402
2403         WARN_ON(atomic_read(&event->mmap_count));
2404
2405         size = sizeof(struct perf_mmap_data);
2406         size += sizeof(void *);
2407
2408         data = kzalloc(size, GFP_KERNEL);
2409         if (!data)
2410                 goto fail;
2411
2412         INIT_WORK(&data->work, perf_mmap_data_free_work);
2413
2414         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2415         if (!all_buf)
2416                 goto fail_all_buf;
2417
2418         data->user_page = all_buf;
2419         data->data_pages[0] = all_buf + PAGE_SIZE;
2420         data->data_order = ilog2(nr_pages);
2421         data->nr_pages = 1;
2422
2423         return data;
2424
2425 fail_all_buf:
2426         kfree(data);
2427
2428 fail:
2429         return NULL;
2430 }
2431
2432 #endif
2433
2434 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2435 {
2436         struct perf_event *event = vma->vm_file->private_data;
2437         struct perf_mmap_data *data;
2438         int ret = VM_FAULT_SIGBUS;
2439
2440         if (vmf->flags & FAULT_FLAG_MKWRITE) {
2441                 if (vmf->pgoff == 0)
2442                         ret = 0;
2443                 return ret;
2444         }
2445
2446         rcu_read_lock();
2447         data = rcu_dereference(event->data);
2448         if (!data)
2449                 goto unlock;
2450
2451         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2452                 goto unlock;
2453
2454         vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2455         if (!vmf->page)
2456                 goto unlock;
2457
2458         get_page(vmf->page);
2459         vmf->page->mapping = vma->vm_file->f_mapping;
2460         vmf->page->index   = vmf->pgoff;
2461
2462         ret = 0;
2463 unlock:
2464         rcu_read_unlock();
2465
2466         return ret;
2467 }
2468
2469 static void
2470 perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2471 {
2472         long max_size = perf_data_size(data);
2473
2474         atomic_set(&data->lock, -1);
2475
2476         if (event->attr.watermark) {
2477                 data->watermark = min_t(long, max_size,
2478                                         event->attr.wakeup_watermark);
2479         }
2480
2481         if (!data->watermark)
2482                 data->watermark = max_size / 2;
2483
2484
2485         rcu_assign_pointer(event->data, data);
2486 }
2487
2488 static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2489 {
2490         struct perf_mmap_data *data;
2491
2492         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2493         perf_mmap_data_free(data);
2494 }
2495
2496 static void perf_mmap_data_release(struct perf_event *event)
2497 {
2498         struct perf_mmap_data *data = event->data;
2499
2500         WARN_ON(atomic_read(&event->mmap_count));
2501
2502         rcu_assign_pointer(event->data, NULL);
2503         call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2504 }
2505
2506 static void perf_mmap_open(struct vm_area_struct *vma)
2507 {
2508         struct perf_event *event = vma->vm_file->private_data;
2509
2510         atomic_inc(&event->mmap_count);
2511 }
2512
2513 static void perf_mmap_close(struct vm_area_struct *vma)
2514 {
2515         struct perf_event *event = vma->vm_file->private_data;
2516
2517         WARN_ON_ONCE(event->ctx->parent_ctx);
2518         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2519                 unsigned long size = perf_data_size(event->data);
2520                 struct user_struct *user = current_user();
2521
2522                 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2523                 vma->vm_mm->locked_vm -= event->data->nr_locked;
2524                 perf_mmap_data_release(event);
2525                 mutex_unlock(&event->mmap_mutex);
2526         }
2527 }
2528
2529 static const struct vm_operations_struct perf_mmap_vmops = {
2530         .open           = perf_mmap_open,
2531         .close          = perf_mmap_close,
2532         .fault          = perf_mmap_fault,
2533         .page_mkwrite   = perf_mmap_fault,
2534 };
2535
2536 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2537 {
2538         struct perf_event *event = file->private_data;
2539         unsigned long user_locked, user_lock_limit;
2540         struct user_struct *user = current_user();
2541         unsigned long locked, lock_limit;
2542         struct perf_mmap_data *data;
2543         unsigned long vma_size;
2544         unsigned long nr_pages;
2545         long user_extra, extra;
2546         int ret = 0;
2547
2548         if (!(vma->vm_flags & VM_SHARED))
2549                 return -EINVAL;
2550
2551         vma_size = vma->vm_end - vma->vm_start;
2552         nr_pages = (vma_size / PAGE_SIZE) - 1;
2553
2554         /*
2555          * If we have data pages ensure they're a power-of-two number, so we
2556          * can do bitmasks instead of modulo.
2557          */
2558         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2559                 return -EINVAL;
2560
2561         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2562                 return -EINVAL;
2563
2564         if (vma->vm_pgoff != 0)
2565                 return -EINVAL;
2566
2567         WARN_ON_ONCE(event->ctx->parent_ctx);
2568         mutex_lock(&event->mmap_mutex);
2569         if (event->output) {
2570                 ret = -EINVAL;
2571                 goto unlock;
2572         }
2573
2574         if (atomic_inc_not_zero(&event->mmap_count)) {
2575                 if (nr_pages != event->data->nr_pages)
2576                         ret = -EINVAL;
2577                 goto unlock;
2578         }
2579
2580         user_extra = nr_pages + 1;
2581         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2582
2583         /*
2584          * Increase the limit linearly with more CPUs:
2585          */
2586         user_lock_limit *= num_online_cpus();
2587
2588         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2589
2590         extra = 0;
2591         if (user_locked > user_lock_limit)
2592                 extra = user_locked - user_lock_limit;
2593
2594         lock_limit = rlimit(RLIMIT_MEMLOCK);
2595         lock_limit >>= PAGE_SHIFT;
2596         locked = vma->vm_mm->locked_vm + extra;
2597
2598         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2599                 !capable(CAP_IPC_LOCK)) {
2600                 ret = -EPERM;
2601                 goto unlock;
2602         }
2603
2604         WARN_ON(event->data);
2605
2606         data = perf_mmap_data_alloc(event, nr_pages);
2607         ret = -ENOMEM;
2608         if (!data)
2609                 goto unlock;
2610
2611         ret = 0;
2612         perf_mmap_data_init(event, data);
2613
2614         atomic_set(&event->mmap_count, 1);
2615         atomic_long_add(user_extra, &user->locked_vm);
2616         vma->vm_mm->locked_vm += extra;
2617         event->data->nr_locked = extra;
2618         if (vma->vm_flags & VM_WRITE)
2619                 event->data->writable = 1;
2620
2621 unlock:
2622         mutex_unlock(&event->mmap_mutex);
2623
2624         vma->vm_flags |= VM_RESERVED;
2625         vma->vm_ops = &perf_mmap_vmops;
2626
2627         return ret;
2628 }
2629
2630 static int perf_fasync(int fd, struct file *filp, int on)
2631 {
2632         struct inode *inode = filp->f_path.dentry->d_inode;
2633         struct perf_event *event = filp->private_data;
2634         int retval;
2635
2636         mutex_lock(&inode->i_mutex);
2637         retval = fasync_helper(fd, filp, on, &event->fasync);
2638         mutex_unlock(&inode->i_mutex);
2639
2640         if (retval < 0)
2641                 return retval;
2642
2643         return 0;
2644 }
2645
2646 static const struct file_operations perf_fops = {
2647         .release                = perf_release,
2648         .read                   = perf_read,
2649         .poll                   = perf_poll,
2650         .unlocked_ioctl         = perf_ioctl,
2651         .compat_ioctl           = perf_ioctl,
2652         .mmap                   = perf_mmap,
2653         .fasync                 = perf_fasync,
2654 };
2655
2656 /*
2657  * Perf event wakeup
2658  *
2659  * If there's data, ensure we set the poll() state and publish everything
2660  * to user-space before waking everybody up.
2661  */
2662
2663 void perf_event_wakeup(struct perf_event *event)
2664 {
2665         wake_up_all(&event->waitq);
2666
2667         if (event->pending_kill) {
2668                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2669                 event->pending_kill = 0;
2670         }
2671 }
2672
2673 /*
2674  * Pending wakeups
2675  *
2676  * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2677  *
2678  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2679  * single linked list and use cmpxchg() to add entries lockless.
2680  */
2681
2682 static void perf_pending_event(struct perf_pending_entry *entry)
2683 {
2684         struct perf_event *event = container_of(entry,
2685                         struct perf_event, pending);
2686
2687         if (event->pending_disable) {
2688                 event->pending_disable = 0;
2689                 __perf_event_disable(event);
2690         }
2691
2692         if (event->pending_wakeup) {
2693                 event->pending_wakeup = 0;
2694                 perf_event_wakeup(event);
2695         }
2696 }
2697
2698 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2699
2700 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2701         PENDING_TAIL,
2702 };
2703
2704 static void perf_pending_queue(struct perf_pending_entry *entry,
2705                                void (*func)(struct perf_pending_entry *))
2706 {
2707         struct perf_pending_entry **head;
2708
2709         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2710                 return;
2711
2712         entry->func = func;
2713
2714         head = &get_cpu_var(perf_pending_head);
2715
2716         do {
2717                 entry->next = *head;
2718         } while (cmpxchg(head, entry->next, entry) != entry->next);
2719
2720         set_perf_event_pending();
2721
2722         put_cpu_var(perf_pending_head);
2723 }
2724
2725 static int __perf_pending_run(void)
2726 {
2727         struct perf_pending_entry *list;
2728         int nr = 0;
2729
2730         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2731         while (list != PENDING_TAIL) {
2732                 void (*func)(struct perf_pending_entry *);
2733                 struct perf_pending_entry *entry = list;
2734
2735                 list = list->next;
2736
2737                 func = entry->func;
2738                 entry->next = NULL;
2739                 /*
2740                  * Ensure we observe the unqueue before we issue the wakeup,
2741                  * so that we won't be waiting forever.
2742                  * -- see perf_not_pending().
2743                  */
2744                 smp_wmb();
2745
2746                 func(entry);
2747                 nr++;
2748         }
2749
2750         return nr;
2751 }
2752
2753 static inline int perf_not_pending(struct perf_event *event)
2754 {
2755         /*
2756          * If we flush on whatever cpu we run, there is a chance we don't
2757          * need to wait.
2758          */
2759         get_cpu();
2760         __perf_pending_run();
2761         put_cpu();
2762
2763         /*
2764          * Ensure we see the proper queue state before going to sleep
2765          * so that we do not miss the wakeup. -- see perf_pending_handle()
2766          */
2767         smp_rmb();
2768         return event->pending.next == NULL;
2769 }
2770
2771 static void perf_pending_sync(struct perf_event *event)
2772 {
2773         wait_event(event->waitq, perf_not_pending(event));
2774 }
2775
2776 void perf_event_do_pending(void)
2777 {
2778         __perf_pending_run();
2779 }
2780
2781 /*
2782  * Callchain support -- arch specific
2783  */
2784
2785 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2786 {
2787         return NULL;
2788 }
2789
2790 #ifdef CONFIG_EVENT_TRACING
2791 __weak
2792 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2793 {
2794 }
2795 #endif
2796
2797 /*
2798  * Output
2799  */
2800 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2801                               unsigned long offset, unsigned long head)
2802 {
2803         unsigned long mask;
2804
2805         if (!data->writable)
2806                 return true;
2807
2808         mask = perf_data_size(data) - 1;
2809
2810         offset = (offset - tail) & mask;
2811         head   = (head   - tail) & mask;
2812
2813         if ((int)(head - offset) < 0)
2814                 return false;
2815
2816         return true;
2817 }
2818
2819 static void perf_output_wakeup(struct perf_output_handle *handle)
2820 {
2821         atomic_set(&handle->data->poll, POLL_IN);
2822
2823         if (handle->nmi) {
2824                 handle->event->pending_wakeup = 1;
2825                 perf_pending_queue(&handle->event->pending,
2826                                    perf_pending_event);
2827         } else
2828                 perf_event_wakeup(handle->event);
2829 }
2830
2831 /*
2832  * Curious locking construct.
2833  *
2834  * We need to ensure a later event_id doesn't publish a head when a former
2835  * event_id isn't done writing. However since we need to deal with NMIs we
2836  * cannot fully serialize things.
2837  *
2838  * What we do is serialize between CPUs so we only have to deal with NMI
2839  * nesting on a single CPU.
2840  *
2841  * We only publish the head (and generate a wakeup) when the outer-most
2842  * event_id completes.
2843  */
2844 static void perf_output_lock(struct perf_output_handle *handle)
2845 {
2846         struct perf_mmap_data *data = handle->data;
2847         int cur, cpu = get_cpu();
2848
2849         handle->locked = 0;
2850
2851         for (;;) {
2852                 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2853                 if (cur == -1) {
2854                         handle->locked = 1;
2855                         break;
2856                 }
2857                 if (cur == cpu)
2858                         break;
2859
2860                 cpu_relax();
2861         }
2862 }
2863
2864 static void perf_output_unlock(struct perf_output_handle *handle)
2865 {
2866         struct perf_mmap_data *data = handle->data;
2867         unsigned long head;
2868         int cpu;
2869
2870         data->done_head = data->head;
2871
2872         if (!handle->locked)
2873                 goto out;
2874
2875 again:
2876         /*
2877          * The xchg implies a full barrier that ensures all writes are done
2878          * before we publish the new head, matched by a rmb() in userspace when
2879          * reading this position.
2880          */
2881         while ((head = atomic_long_xchg(&data->done_head, 0)))
2882                 data->user_page->data_head = head;
2883
2884         /*
2885          * NMI can happen here, which means we can miss a done_head update.
2886          */
2887
2888         cpu = atomic_xchg(&data->lock, -1);
2889         WARN_ON_ONCE(cpu != smp_processor_id());
2890
2891         /*
2892          * Therefore we have to validate we did not indeed do so.
2893          */
2894         if (unlikely(atomic_long_read(&data->done_head))) {
2895                 /*
2896                  * Since we had it locked, we can lock it again.
2897                  */
2898                 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2899                         cpu_relax();
2900
2901                 goto again;
2902         }
2903
2904         if (atomic_xchg(&data->wakeup, 0))
2905                 perf_output_wakeup(handle);
2906 out:
2907         put_cpu();
2908 }
2909
2910 void perf_output_copy(struct perf_output_handle *handle,
2911                       const void *buf, unsigned int len)
2912 {
2913         unsigned int pages_mask;
2914         unsigned long offset;
2915         unsigned int size;
2916         void **pages;
2917
2918         offset          = handle->offset;
2919         pages_mask      = handle->data->nr_pages - 1;
2920         pages           = handle->data->data_pages;
2921
2922         do {
2923                 unsigned long page_offset;
2924                 unsigned long page_size;
2925                 int nr;
2926
2927                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
2928                 page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
2929                 page_offset = offset & (page_size - 1);
2930                 size        = min_t(unsigned int, page_size - page_offset, len);
2931
2932                 memcpy(pages[nr] + page_offset, buf, size);
2933
2934                 len         -= size;
2935                 buf         += size;
2936                 offset      += size;
2937         } while (len);
2938
2939         handle->offset = offset;
2940
2941         /*
2942          * Check we didn't copy past our reservation window, taking the
2943          * possible unsigned int wrap into account.
2944          */
2945         WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2946 }
2947
2948 int perf_output_begin(struct perf_output_handle *handle,
2949                       struct perf_event *event, unsigned int size,
2950                       int nmi, int sample)
2951 {
2952         struct perf_event *output_event;
2953         struct perf_mmap_data *data;
2954         unsigned long tail, offset, head;
2955         int have_lost;
2956         struct {
2957                 struct perf_event_header header;
2958                 u64                      id;
2959                 u64                      lost;
2960         } lost_event;
2961
2962         rcu_read_lock();
2963         /*
2964          * For inherited events we send all the output towards the parent.
2965          */
2966         if (event->parent)
2967                 event = event->parent;
2968
2969         output_event = rcu_dereference(event->output);
2970         if (output_event)
2971                 event = output_event;
2972
2973         data = rcu_dereference(event->data);
2974         if (!data)
2975                 goto out;
2976
2977         handle->data    = data;
2978         handle->event   = event;
2979         handle->nmi     = nmi;
2980         handle->sample  = sample;
2981
2982         if (!data->nr_pages)
2983                 goto fail;
2984
2985         have_lost = atomic_read(&data->lost);
2986         if (have_lost)
2987                 size += sizeof(lost_event);
2988
2989         perf_output_lock(handle);
2990
2991         do {
2992                 /*
2993                  * Userspace could choose to issue a mb() before updating the
2994                  * tail pointer. So that all reads will be completed before the
2995                  * write is issued.
2996                  */
2997                 tail = ACCESS_ONCE(data->user_page->data_tail);
2998                 smp_rmb();
2999                 offset = head = atomic_long_read(&data->head);
3000                 head += size;
3001                 if (unlikely(!perf_output_space(data, tail, offset, head)))
3002                         goto fail;
3003         } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
3004
3005         handle->offset  = offset;
3006         handle->head    = head;
3007
3008         if (head - tail > data->watermark)
3009                 atomic_set(&data->wakeup, 1);
3010
3011         if (have_lost) {
3012                 lost_event.header.type = PERF_RECORD_LOST;
3013                 lost_event.header.misc = 0;
3014                 lost_event.header.size = sizeof(lost_event);
3015                 lost_event.id          = event->id;
3016                 lost_event.lost        = atomic_xchg(&data->lost, 0);
3017
3018                 perf_output_put(handle, lost_event);
3019         }
3020
3021         return 0;
3022
3023 fail:
3024         atomic_inc(&data->lost);
3025         perf_output_unlock(handle);
3026 out:
3027         rcu_read_unlock();
3028
3029         return -ENOSPC;
3030 }
3031
3032 void perf_output_end(struct perf_output_handle *handle)
3033 {
3034         struct perf_event *event = handle->event;
3035         struct perf_mmap_data *data = handle->data;
3036
3037         int wakeup_events = event->attr.wakeup_events;
3038
3039         if (handle->sample && wakeup_events) {
3040                 int events = atomic_inc_return(&data->events);
3041                 if (events >= wakeup_events) {
3042                         atomic_sub(wakeup_events, &data->events);
3043                         atomic_set(&data->wakeup, 1);
3044                 }
3045         }
3046
3047         perf_output_unlock(handle);
3048         rcu_read_unlock();
3049 }
3050
3051 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3052 {
3053         /*
3054          * only top level events have the pid namespace they were created in
3055          */
3056         if (event->parent)
3057                 event = event->parent;
3058
3059         return task_tgid_nr_ns(p, event->ns);
3060 }
3061
3062 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3063 {
3064         /*
3065          * only top level events have the pid namespace they were created in
3066          */
3067         if (event->parent)
3068                 event = event->parent;
3069
3070         return task_pid_nr_ns(p, event->ns);
3071 }
3072
3073 static void perf_output_read_one(struct perf_output_handle *handle,
3074                                  struct perf_event *event)
3075 {
3076         u64 read_format = event->attr.read_format;
3077         u64 values[4];
3078         int n = 0;
3079
3080         values[n++] = atomic64_read(&event->count);
3081         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3082                 values[n++] = event->total_time_enabled +
3083                         atomic64_read(&event->child_total_time_enabled);
3084         }
3085         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3086                 values[n++] = event->total_time_running +
3087                         atomic64_read(&event->child_total_time_running);
3088         }
3089         if (read_format & PERF_FORMAT_ID)
3090                 values[n++] = primary_event_id(event);
3091
3092         perf_output_copy(handle, values, n * sizeof(u64));
3093 }
3094
3095 /*
3096  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3097  */
3098 static void perf_output_read_group(struct perf_output_handle *handle,
3099                             struct perf_event *event)
3100 {
3101         struct perf_event *leader = event->group_leader, *sub;
3102         u64 read_format = event->attr.read_format;
3103         u64 values[5];
3104         int n = 0;
3105
3106         values[n++] = 1 + leader->nr_siblings;
3107
3108         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3109                 values[n++] = leader->total_time_enabled;
3110
3111         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3112                 values[n++] = leader->total_time_running;
3113
3114         if (leader != event)
3115                 leader->pmu->read(leader);
3116
3117         values[n++] = atomic64_read(&leader->count);
3118         if (read_format & PERF_FORMAT_ID)
3119                 values[n++] = primary_event_id(leader);
3120
3121         perf_output_copy(handle, values, n * sizeof(u64));
3122
3123         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3124                 n = 0;
3125
3126                 if (sub != event)
3127                         sub->pmu->read(sub);
3128
3129                 values[n++] = atomic64_read(&sub->count);
3130                 if (read_format & PERF_FORMAT_ID)
3131                         values[n++] = primary_event_id(sub);
3132
3133                 perf_output_copy(handle, values, n * sizeof(u64));
3134         }
3135 }
3136
3137 static void perf_output_read(struct perf_output_handle *handle,
3138                              struct perf_event *event)
3139 {
3140         if (event->attr.read_format & PERF_FORMAT_GROUP)
3141                 perf_output_read_group(handle, event);
3142         else
3143                 perf_output_read_one(handle, event);
3144 }
3145
3146 void perf_output_sample(struct perf_output_handle *handle,
3147                         struct perf_event_header *header,
3148                         struct perf_sample_data *data,
3149                         struct perf_event *event)
3150 {
3151         u64 sample_type = data->type;
3152
3153         perf_output_put(handle, *header);
3154
3155         if (sample_type & PERF_SAMPLE_IP)
3156                 perf_output_put(handle, data->ip);
3157
3158         if (sample_type & PERF_SAMPLE_TID)
3159                 perf_output_put(handle, data->tid_entry);
3160
3161         if (sample_type & PERF_SAMPLE_TIME)
3162                 perf_output_put(handle, data->time);
3163
3164         if (sample_type & PERF_SAMPLE_ADDR)
3165                 perf_output_put(handle, data->addr);
3166
3167         if (sample_type & PERF_SAMPLE_ID)
3168                 perf_output_put(handle, data->id);
3169
3170         if (sample_type & PERF_SAMPLE_STREAM_ID)
3171                 perf_output_put(handle, data->stream_id);
3172
3173         if (sample_type & PERF_SAMPLE_CPU)
3174                 perf_output_put(handle, data->cpu_entry);
3175
3176         if (sample_type & PERF_SAMPLE_PERIOD)
3177                 perf_output_put(handle, data->period);
3178
3179         if (sample_type & PERF_SAMPLE_READ)
3180                 perf_output_read(handle, event);
3181
3182         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3183                 if (data->callchain) {
3184                         int size = 1;
3185
3186                         if (data->callchain)
3187                                 size += data->callchain->nr;
3188
3189                         size *= sizeof(u64);
3190
3191                         perf_output_copy(handle, data->callchain, size);
3192                 } else {
3193                         u64 nr = 0;
3194                         perf_output_put(handle, nr);
3195                 }
3196         }
3197
3198         if (sample_type & PERF_SAMPLE_RAW) {
3199                 if (data->raw) {
3200                         perf_output_put(handle, data->raw->size);
3201                         perf_output_copy(handle, data->raw->data,
3202                                          data->raw->size);
3203                 } else {
3204                         struct {
3205                                 u32     size;
3206                                 u32     data;
3207                         } raw = {
3208                                 .size = sizeof(u32),
3209                                 .data = 0,
3210                         };
3211                         perf_output_put(handle, raw);
3212                 }
3213         }
3214 }
3215
3216 void perf_prepare_sample(struct perf_event_header *header,
3217                          struct perf_sample_data *data,
3218                          struct perf_event *event,
3219                          struct pt_regs *regs)
3220 {
3221         u64 sample_type = event->attr.sample_type;
3222
3223         data->type = sample_type;
3224
3225         header->type = PERF_RECORD_SAMPLE;
3226         header->size = sizeof(*header);
3227
3228         header->misc = 0;
3229         header->misc |= perf_misc_flags(regs);
3230
3231         if (sample_type & PERF_SAMPLE_IP) {
3232                 data->ip = perf_instruction_pointer(regs);
3233
3234                 header->size += sizeof(data->ip);
3235         }
3236
3237         if (sample_type & PERF_SAMPLE_TID) {
3238                 /* namespace issues */
3239                 data->tid_entry.pid = perf_event_pid(event, current);
3240                 data->tid_entry.tid = perf_event_tid(event, current);
3241
3242                 header->size += sizeof(data->tid_entry);
3243         }
3244
3245         if (sample_type & PERF_SAMPLE_TIME) {
3246                 data->time = perf_clock();
3247
3248                 header->size += sizeof(data->time);
3249         }
3250
3251         if (sample_type & PERF_SAMPLE_ADDR)
3252                 header->size += sizeof(data->addr);
3253
3254         if (sample_type & PERF_SAMPLE_ID) {
3255                 data->id = primary_event_id(event);
3256
3257                 header->size += sizeof(data->id);
3258         }
3259
3260         if (sample_type & PERF_SAMPLE_STREAM_ID) {
3261                 data->stream_id = event->id;
3262
3263                 header->size += sizeof(data->stream_id);
3264         }
3265
3266         if (sample_type & PERF_SAMPLE_CPU) {
3267                 data->cpu_entry.cpu             = raw_smp_processor_id();
3268                 data->cpu_entry.reserved        = 0;
3269
3270                 header->size += sizeof(data->cpu_entry);
3271         }
3272
3273         if (sample_type & PERF_SAMPLE_PERIOD)
3274                 header->size += sizeof(data->period);
3275
3276         if (sample_type & PERF_SAMPLE_READ)
3277                 header->size += perf_event_read_size(event);
3278
3279         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3280                 int size = 1;
3281
3282                 data->callchain = perf_callchain(regs);
3283
3284                 if (data->callchain)
3285                         size += data->callchain->nr;
3286
3287                 header->size += size * sizeof(u64);
3288         }
3289
3290         if (sample_type & PERF_SAMPLE_RAW) {
3291                 int size = sizeof(u32);
3292
3293                 if (data->raw)
3294                         size += data->raw->size;
3295                 else
3296                         size += sizeof(u32);
3297
3298                 WARN_ON_ONCE(size & (sizeof(u64)-1));
3299                 header->size += size;
3300         }
3301 }
3302
3303 static void perf_event_output(struct perf_event *event, int nmi,
3304                                 struct perf_sample_data *data,
3305                                 struct pt_regs *regs)
3306 {
3307         struct perf_output_handle handle;
3308         struct perf_event_header header;
3309
3310         perf_prepare_sample(&header, data, event, regs);
3311
3312         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3313                 return;
3314
3315         perf_output_sample(&handle, &header, data, event);
3316
3317         perf_output_end(&handle);
3318 }
3319
3320 /*
3321  * read event_id
3322  */
3323
3324 struct perf_read_event {
3325         struct perf_event_header        header;
3326
3327         u32                             pid;
3328         u32                             tid;
3329 };
3330
3331 static void
3332 perf_event_read_event(struct perf_event *event,
3333                         struct task_struct *task)
3334 {
3335         struct perf_output_handle handle;
3336         struct perf_read_event read_event = {
3337                 .header = {
3338                         .type = PERF_RECORD_READ,
3339                         .misc = 0,
3340                         .size = sizeof(read_event) + perf_event_read_size(event),
3341                 },
3342                 .pid = perf_event_pid(event, task),
3343                 .tid = perf_event_tid(event, task),
3344         };
3345         int ret;
3346
3347         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3348         if (ret)
3349                 return;
3350
3351         perf_output_put(&handle, read_event);
3352         perf_output_read(&handle, event);
3353
3354         perf_output_end(&handle);
3355 }
3356
3357 /*
3358  * task tracking -- fork/exit
3359  *
3360  * enabled by: attr.comm | attr.mmap | attr.task
3361  */
3362
3363 struct perf_task_event {
3364         struct task_struct              *task;
3365         struct perf_event_context       *task_ctx;
3366
3367         struct {
3368                 struct perf_event_header        header;
3369
3370                 u32                             pid;
3371                 u32                             ppid;
3372                 u32                             tid;
3373                 u32                             ptid;
3374                 u64                             time;
3375         } event_id;
3376 };
3377
3378 static void perf_event_task_output(struct perf_event *event,
3379                                      struct perf_task_event *task_event)
3380 {
3381         struct perf_output_handle handle;
3382         int size;
3383         struct task_struct *task = task_event->task;
3384         int ret;
3385
3386         size  = task_event->event_id.header.size;
3387         ret = perf_output_begin(&handle, event, size, 0, 0);
3388
3389         if (ret)
3390                 return;
3391
3392         task_event->event_id.pid = perf_event_pid(event, task);
3393         task_event->event_id.ppid = perf_event_pid(event, current);
3394
3395         task_event->event_id.tid = perf_event_tid(event, task);
3396         task_event->event_id.ptid = perf_event_tid(event, current);
3397
3398         perf_output_put(&handle, task_event->event_id);
3399
3400         perf_output_end(&handle);
3401 }
3402
3403 static int perf_event_task_match(struct perf_event *event)
3404 {
3405         if (event->state < PERF_EVENT_STATE_INACTIVE)
3406                 return 0;
3407
3408         if (event->cpu != -1 && event->cpu != smp_processor_id())
3409                 return 0;
3410
3411         if (event->attr.comm || event->attr.mmap || event->attr.task)
3412                 return 1;
3413
3414         return 0;
3415 }
3416
3417 static void perf_event_task_ctx(struct perf_event_context *ctx,
3418                                   struct perf_task_event *task_event)
3419 {
3420         struct perf_event *event;
3421
3422         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3423                 if (perf_event_task_match(event))
3424                         perf_event_task_output(event, task_event);
3425         }
3426 }
3427
3428 static void perf_event_task_event(struct perf_task_event *task_event)
3429 {
3430         struct perf_cpu_context *cpuctx;
3431         struct perf_event_context *ctx = task_event->task_ctx;
3432
3433         rcu_read_lock();
3434         cpuctx = &get_cpu_var(perf_cpu_context);
3435         perf_event_task_ctx(&cpuctx->ctx, task_event);
3436         if (!ctx)
3437                 ctx = rcu_dereference(current->perf_event_ctxp);
3438         if (ctx)
3439                 perf_event_task_ctx(ctx, task_event);
3440         put_cpu_var(perf_cpu_context);
3441         rcu_read_unlock();
3442 }
3443
3444 static void perf_event_task(struct task_struct *task,
3445                               struct perf_event_context *task_ctx,
3446                               int new)
3447 {
3448         struct perf_task_event task_event;
3449
3450         if (!atomic_read(&nr_comm_events) &&
3451             !atomic_read(&nr_mmap_events) &&
3452             !atomic_read(&nr_task_events))
3453                 return;
3454
3455         task_event = (struct perf_task_event){
3456                 .task     = task,
3457                 .task_ctx = task_ctx,
3458                 .event_id    = {
3459                         .header = {
3460                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3461                                 .misc = 0,
3462                                 .size = sizeof(task_event.event_id),
3463                         },
3464                         /* .pid  */
3465                         /* .ppid */
3466                         /* .tid  */
3467                         /* .ptid */
3468                         .time = perf_clock(),
3469                 },
3470         };
3471
3472         perf_event_task_event(&task_event);
3473 }
3474
3475 void perf_event_fork(struct task_struct *task)
3476 {
3477         perf_event_task(task, NULL, 1);
3478 }
3479
3480 /*
3481  * comm tracking
3482  */
3483
3484 struct perf_comm_event {
3485         struct task_struct      *task;
3486         char                    *comm;
3487         int                     comm_size;
3488
3489         struct {
3490                 struct perf_event_header        header;
3491
3492                 u32                             pid;
3493                 u32                             tid;
3494         } event_id;
3495 };
3496
3497 static void perf_event_comm_output(struct perf_event *event,
3498                                      struct perf_comm_event *comm_event)
3499 {
3500         struct perf_output_handle handle;
3501         int size = comm_event->event_id.header.size;
3502         int ret = perf_output_begin(&handle, event, size, 0, 0);
3503
3504         if (ret)
3505                 return;
3506
3507         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3508         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3509
3510         perf_output_put(&handle, comm_event->event_id);
3511         perf_output_copy(&handle, comm_event->comm,
3512                                    comm_event->comm_size);
3513         perf_output_end(&handle);
3514 }
3515
3516 static int perf_event_comm_match(struct perf_event *event)
3517 {
3518         if (event->state < PERF_EVENT_STATE_INACTIVE)
3519                 return 0;
3520
3521         if (event->cpu != -1 && event->cpu != smp_processor_id())
3522                 return 0;
3523
3524         if (event->attr.comm)
3525                 return 1;
3526
3527         return 0;
3528 }
3529
3530 static void perf_event_comm_ctx(struct perf_event_context *ctx,
3531                                   struct perf_comm_event *comm_event)
3532 {
3533         struct perf_event *event;
3534
3535         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3536                 if (perf_event_comm_match(event))
3537                         perf_event_comm_output(event, comm_event);
3538         }
3539 }
3540
3541 static void perf_event_comm_event(struct perf_comm_event *comm_event)
3542 {
3543         struct perf_cpu_context *cpuctx;
3544         struct perf_event_context *ctx;
3545         unsigned int size;
3546         char comm[TASK_COMM_LEN];
3547
3548         memset(comm, 0, sizeof(comm));
3549         strlcpy(comm, comm_event->task->comm, sizeof(comm));
3550         size = ALIGN(strlen(comm)+1, sizeof(u64));
3551
3552         comm_event->comm = comm;
3553         comm_event->comm_size = size;
3554
3555         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3556
3557         rcu_read_lock();
3558         cpuctx = &get_cpu_var(perf_cpu_context);
3559         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3560         ctx = rcu_dereference(current->perf_event_ctxp);
3561         if (ctx)
3562                 perf_event_comm_ctx(ctx, comm_event);
3563         put_cpu_var(perf_cpu_context);
3564         rcu_read_unlock();
3565 }
3566
3567 void perf_event_comm(struct task_struct *task)
3568 {
3569         struct perf_comm_event comm_event;
3570
3571         if (task->perf_event_ctxp)
3572                 perf_event_enable_on_exec(task);
3573
3574         if (!atomic_read(&nr_comm_events))
3575                 return;
3576
3577         comm_event = (struct perf_comm_event){
3578                 .task   = task,
3579                 /* .comm      */
3580                 /* .comm_size */
3581                 .event_id  = {
3582                         .header = {
3583                                 .type = PERF_RECORD_COMM,
3584                                 .misc = 0,
3585                                 /* .size */
3586                         },
3587                         /* .pid */
3588                         /* .tid */
3589                 },
3590         };
3591
3592         perf_event_comm_event(&comm_event);
3593 }
3594
3595 /*
3596  * mmap tracking
3597  */
3598
3599 struct perf_mmap_event {
3600         struct vm_area_struct   *vma;
3601
3602         const char              *file_name;
3603         int                     file_size;
3604
3605         struct {
3606                 struct perf_event_header        header;
3607
3608                 u32                             pid;
3609                 u32                             tid;
3610                 u64                             start;
3611                 u64                             len;
3612                 u64                             pgoff;
3613         } event_id;
3614 };
3615
3616 static void perf_event_mmap_output(struct perf_event *event,
3617                                      struct perf_mmap_event *mmap_event)
3618 {
3619         struct perf_output_handle handle;
3620         int size = mmap_event->event_id.header.size;
3621         int ret = perf_output_begin(&handle, event, size, 0, 0);
3622
3623         if (ret)
3624                 return;
3625
3626         mmap_event->event_id.pid = perf_event_pid(event, current);
3627         mmap_event->event_id.tid = perf_event_tid(event, current);
3628
3629         perf_output_put(&handle, mmap_event->event_id);
3630         perf_output_copy(&handle, mmap_event->file_name,
3631                                    mmap_event->file_size);
3632         perf_output_end(&handle);
3633 }
3634
3635 static int perf_event_mmap_match(struct perf_event *event,
3636                                    struct perf_mmap_event *mmap_event)
3637 {
3638         if (event->state < PERF_EVENT_STATE_INACTIVE)
3639                 return 0;
3640
3641         if (event->cpu != -1 && event->cpu != smp_processor_id())
3642                 return 0;
3643
3644         if (event->attr.mmap)
3645                 return 1;
3646
3647         return 0;
3648 }
3649
3650 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3651                                   struct perf_mmap_event *mmap_event)
3652 {
3653         struct perf_event *event;
3654
3655         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3656                 if (perf_event_mmap_match(event, mmap_event))
3657                         perf_event_mmap_output(event, mmap_event);
3658         }
3659 }
3660
3661 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3662 {
3663         struct perf_cpu_context *cpuctx;
3664         struct perf_event_context *ctx;
3665         struct vm_area_struct *vma = mmap_event->vma;
3666         struct file *file = vma->vm_file;
3667         unsigned int size;
3668         char tmp[16];
3669         char *buf = NULL;
3670         const char *name;
3671
3672         memset(tmp, 0, sizeof(tmp));
3673
3674         if (file) {
3675                 /*
3676                  * d_path works from the end of the buffer backwards, so we
3677                  * need to add enough zero bytes after the string to handle
3678                  * the 64bit alignment we do later.
3679                  */
3680                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3681                 if (!buf) {
3682                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3683                         goto got_name;
3684                 }
3685                 name = d_path(&file->f_path, buf, PATH_MAX);
3686                 if (IS_ERR(name)) {
3687                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3688                         goto got_name;
3689                 }
3690         } else {
3691                 if (arch_vma_name(mmap_event->vma)) {
3692                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3693                                        sizeof(tmp));
3694                         goto got_name;
3695                 }
3696
3697                 if (!vma->vm_mm) {
3698                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3699                         goto got_name;
3700                 }
3701
3702                 name = strncpy(tmp, "//anon", sizeof(tmp));
3703                 goto got_name;
3704         }
3705
3706 got_name:
3707         size = ALIGN(strlen(name)+1, sizeof(u64));
3708
3709         mmap_event->file_name = name;
3710         mmap_event->file_size = size;
3711
3712         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3713
3714         rcu_read_lock();
3715         cpuctx = &get_cpu_var(perf_cpu_context);
3716         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3717         ctx = rcu_dereference(current->perf_event_ctxp);
3718         if (ctx)
3719                 perf_event_mmap_ctx(ctx, mmap_event);
3720         put_cpu_var(perf_cpu_context);
3721         rcu_read_unlock();
3722
3723         kfree(buf);
3724 }
3725
3726 void __perf_event_mmap(struct vm_area_struct *vma)
3727 {
3728         struct perf_mmap_event mmap_event;
3729
3730         if (!atomic_read(&nr_mmap_events))
3731                 return;
3732
3733         mmap_event = (struct perf_mmap_event){
3734                 .vma    = vma,
3735                 /* .file_name */
3736                 /* .file_size */
3737                 .event_id  = {
3738                         .header = {
3739                                 .type = PERF_RECORD_MMAP,
3740                                 .misc = 0,
3741                                 /* .size */
3742                         },
3743                         /* .pid */
3744                         /* .tid */
3745                         .start  = vma->vm_start,
3746                         .len    = vma->vm_end - vma->vm_start,
3747                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
3748                 },
3749         };
3750
3751         perf_event_mmap_event(&mmap_event);
3752 }
3753
3754 /*
3755  * IRQ throttle logging
3756  */
3757
3758 static void perf_log_throttle(struct perf_event *event, int enable)
3759 {
3760         struct perf_output_handle handle;
3761         int ret;
3762
3763         struct {
3764                 struct perf_event_header        header;
3765                 u64                             time;
3766                 u64                             id;
3767                 u64                             stream_id;
3768         } throttle_event = {
3769                 .header = {
3770                         .type = PERF_RECORD_THROTTLE,
3771                         .misc = 0,
3772                         .size = sizeof(throttle_event),
3773                 },
3774                 .time           = perf_clock(),
3775                 .id             = primary_event_id(event),
3776                 .stream_id      = event->id,
3777         };
3778
3779         if (enable)
3780                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3781
3782         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3783         if (ret)
3784                 return;
3785
3786         perf_output_put(&handle, throttle_event);
3787         perf_output_end(&handle);
3788 }
3789
3790 /*
3791  * Generic event overflow handling, sampling.
3792  */
3793
3794 static int __perf_event_overflow(struct perf_event *event, int nmi,
3795                                    int throttle, struct perf_sample_data *data,
3796                                    struct pt_regs *regs)
3797 {
3798         int events = atomic_read(&event->event_limit);
3799         struct hw_perf_event *hwc = &event->hw;
3800         int ret = 0;
3801
3802         throttle = (throttle && event->pmu->unthrottle != NULL);
3803
3804         if (!throttle) {
3805                 hwc->interrupts++;
3806         } else {
3807                 if (hwc->interrupts != MAX_INTERRUPTS) {
3808                         hwc->interrupts++;
3809                         if (HZ * hwc->interrupts >
3810                                         (u64)sysctl_perf_event_sample_rate) {
3811                                 hwc->interrupts = MAX_INTERRUPTS;
3812                                 perf_log_throttle(event, 0);
3813                                 ret = 1;
3814                         }
3815                 } else {
3816                         /*
3817                          * Keep re-disabling events even though on the previous
3818                          * pass we disabled it - just in case we raced with a
3819                          * sched-in and the event got enabled again:
3820                          */
3821                         ret = 1;
3822                 }
3823         }
3824
3825         if (event->attr.freq) {
3826                 u64 now = perf_clock();
3827                 s64 delta = now - hwc->freq_time_stamp;
3828
3829                 hwc->freq_time_stamp = now;
3830
3831                 if (delta > 0 && delta < 2*TICK_NSEC)
3832                         perf_adjust_period(event, delta, hwc->last_period);
3833         }
3834
3835         /*
3836          * XXX event_limit might not quite work as expected on inherited
3837          * events
3838          */
3839
3840         event->pending_kill = POLL_IN;
3841         if (events && atomic_dec_and_test(&event->event_limit)) {
3842                 ret = 1;
3843                 event->pending_kill = POLL_HUP;
3844                 if (nmi) {
3845                         event->pending_disable = 1;
3846                         perf_pending_queue(&event->pending,
3847                                            perf_pending_event);
3848                 } else
3849                         perf_event_disable(event);
3850         }
3851
3852         if (event->overflow_handler)
3853                 event->overflow_handler(event, nmi, data, regs);
3854         else
3855                 perf_event_output(event, nmi, data, regs);
3856
3857         return ret;
3858 }
3859
3860 int perf_event_overflow(struct perf_event *event, int nmi,
3861                           struct perf_sample_data *data,
3862                           struct pt_regs *regs)
3863 {
3864         return __perf_event_overflow(event, nmi, 1, data, regs);
3865 }
3866
3867 /*
3868  * Generic software event infrastructure
3869  */
3870
3871 /*
3872  * We directly increment event->count and keep a second value in
3873  * event->hw.period_left to count intervals. This period event
3874  * is kept in the range [-sample_period, 0] so that we can use the
3875  * sign as trigger.
3876  */
3877
3878 static u64 perf_swevent_set_period(struct perf_event *event)
3879 {
3880         struct hw_perf_event *hwc = &event->hw;
3881         u64 period = hwc->last_period;
3882         u64 nr, offset;
3883         s64 old, val;
3884
3885         hwc->last_period = hwc->sample_period;
3886
3887 again:
3888         old = val = atomic64_read(&hwc->period_left);
3889         if (val < 0)
3890                 return 0;
3891
3892         nr = div64_u64(period + val, period);
3893         offset = nr * period;
3894         val -= offset;
3895         if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3896                 goto again;
3897
3898         return nr;
3899 }
3900
3901 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3902                                     int nmi, struct perf_sample_data *data,
3903                                     struct pt_regs *regs)
3904 {
3905         struct hw_perf_event *hwc = &event->hw;
3906         int throttle = 0;
3907
3908         data->period = event->hw.last_period;
3909         if (!overflow)
3910                 overflow = perf_swevent_set_period(event);
3911
3912         if (hwc->interrupts == MAX_INTERRUPTS)
3913                 return;
3914
3915         for (; overflow; overflow--) {
3916                 if (__perf_event_overflow(event, nmi, throttle,
3917                                             data, regs)) {
3918                         /*
3919                          * We inhibit the overflow from happening when
3920                          * hwc->interrupts == MAX_INTERRUPTS.
3921                          */
3922                         break;
3923                 }
3924                 throttle = 1;
3925         }
3926 }
3927
3928 static void perf_swevent_unthrottle(struct perf_event *event)
3929 {
3930         /*
3931          * Nothing to do, we already reset hwc->interrupts.
3932          */
3933 }
3934
3935 static void perf_swevent_add(struct perf_event *event, u64 nr,
3936                                int nmi, struct perf_sample_data *data,
3937                                struct pt_regs *regs)
3938 {
3939         struct hw_perf_event *hwc = &event->hw;
3940
3941         atomic64_add(nr, &event->count);
3942
3943         if (!regs)
3944                 return;
3945
3946         if (!hwc->sample_period)
3947                 return;
3948
3949         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3950                 return perf_swevent_overflow(event, 1, nmi, data, regs);
3951
3952         if (atomic64_add_negative(nr, &hwc->period_left))
3953                 return;
3954
3955         perf_swevent_overflow(event, 0, nmi, data, regs);
3956 }
3957
3958 static int perf_swevent_is_counting(struct perf_event *event)
3959 {
3960         /*
3961          * The event is active, we're good!
3962          */
3963         if (event->state == PERF_EVENT_STATE_ACTIVE)
3964                 return 1;
3965
3966         /*
3967          * The event is off/error, not counting.
3968          */
3969         if (event->state != PERF_EVENT_STATE_INACTIVE)
3970                 return 0;
3971
3972         /*
3973          * The event is inactive, if the context is active
3974          * we're part of a group that didn't make it on the 'pmu',
3975          * not counting.
3976          */
3977         if (event->ctx->is_active)
3978                 return 0;
3979
3980         /*
3981          * We're inactive and the context is too, this means the
3982          * task is scheduled out, we're counting events that happen
3983          * to us, like migration events.
3984          */
3985         return 1;
3986 }
3987
3988 static int perf_tp_event_match(struct perf_event *event,
3989                                 struct perf_sample_data *data);
3990
3991 static int perf_exclude_event(struct perf_event *event,
3992                               struct pt_regs *regs)
3993 {
3994         if (regs) {
3995                 if (event->attr.exclude_user && user_mode(regs))
3996                         return 1;
3997
3998                 if (event->attr.exclude_kernel && !user_mode(regs))
3999                         return 1;
4000         }
4001
4002         return 0;
4003 }
4004
4005 static int perf_swevent_match(struct perf_event *event,
4006                                 enum perf_type_id type,
4007                                 u32 event_id,
4008                                 struct perf_sample_data *data,
4009                                 struct pt_regs *regs)
4010 {
4011         if (event->cpu != -1 && event->cpu != smp_processor_id())
4012                 return 0;
4013
4014         if (!perf_swevent_is_counting(event))
4015                 return 0;
4016
4017         if (event->attr.type != type)
4018                 return 0;
4019
4020         if (event->attr.config != event_id)
4021                 return 0;
4022
4023         if (perf_exclude_event(event, regs))
4024                 return 0;
4025
4026         if (event->attr.type == PERF_TYPE_TRACEPOINT &&
4027             !perf_tp_event_match(event, data))
4028                 return 0;
4029
4030         return 1;
4031 }
4032
4033 static void perf_swevent_ctx_event(struct perf_event_context *ctx,
4034                                      enum perf_type_id type,
4035                                      u32 event_id, u64 nr, int nmi,
4036                                      struct perf_sample_data *data,
4037                                      struct pt_regs *regs)
4038 {
4039         struct perf_event *event;
4040
4041         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4042                 if (perf_swevent_match(event, type, event_id, data, regs))
4043                         perf_swevent_add(event, nr, nmi, data, regs);
4044         }
4045 }
4046
4047 int perf_swevent_get_recursion_context(void)
4048 {
4049         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
4050         int rctx;
4051
4052         if (in_nmi())
4053                 rctx = 3;
4054         else if (in_irq())
4055                 rctx = 2;
4056         else if (in_softirq())
4057                 rctx = 1;
4058         else
4059                 rctx = 0;
4060
4061         if (cpuctx->recursion[rctx]) {
4062                 put_cpu_var(perf_cpu_context);
4063                 return -1;
4064         }
4065
4066         cpuctx->recursion[rctx]++;
4067         barrier();
4068
4069         return rctx;
4070 }
4071 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4072
4073 void perf_swevent_put_recursion_context(int rctx)
4074 {
4075         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4076         barrier();
4077         cpuctx->recursion[rctx]--;
4078         put_cpu_var(perf_cpu_context);
4079 }
4080 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4081
4082 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4083                                     u64 nr, int nmi,
4084                                     struct perf_sample_data *data,
4085                                     struct pt_regs *regs)
4086 {
4087         struct perf_cpu_context *cpuctx;
4088         struct perf_event_context *ctx;
4089
4090         cpuctx = &__get_cpu_var(perf_cpu_context);
4091         rcu_read_lock();
4092         perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4093                                  nr, nmi, data, regs);
4094         /*
4095          * doesn't really matter which of the child contexts the
4096          * events ends up in.
4097          */
4098         ctx = rcu_dereference(current->perf_event_ctxp);
4099         if (ctx)
4100                 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4101         rcu_read_unlock();
4102 }
4103
4104 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4105                             struct pt_regs *regs, u64 addr)
4106 {
4107         struct perf_sample_data data;
4108         int rctx;
4109
4110         rctx = perf_swevent_get_recursion_context();
4111         if (rctx < 0)
4112                 return;
4113
4114         perf_sample_data_init(&data, addr);
4115
4116         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4117
4118         perf_swevent_put_recursion_context(rctx);
4119 }
4120
4121 static void perf_swevent_read(struct perf_event *event)
4122 {
4123 }
4124
4125 static int perf_swevent_enable(struct perf_event *event)
4126 {
4127         struct hw_perf_event *hwc = &event->hw;
4128
4129         if (hwc->sample_period) {
4130                 hwc->last_period = hwc->sample_period;
4131                 perf_swevent_set_period(event);
4132         }
4133         return 0;
4134 }
4135
4136 static void perf_swevent_disable(struct perf_event *event)
4137 {
4138 }
4139
4140 static const struct pmu perf_ops_generic = {
4141         .enable         = perf_swevent_enable,
4142         .disable        = perf_swevent_disable,
4143         .read           = perf_swevent_read,
4144         .unthrottle     = perf_swevent_unthrottle,
4145 };
4146
4147 /*
4148  * hrtimer based swevent callback
4149  */
4150
4151 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4152 {
4153         enum hrtimer_restart ret = HRTIMER_RESTART;
4154         struct perf_sample_data data;
4155         struct pt_regs *regs;
4156         struct perf_event *event;
4157         u64 period;
4158
4159         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4160         event->pmu->read(event);
4161
4162         perf_sample_data_init(&data, 0);
4163         data.period = event->hw.last_period;
4164         regs = get_irq_regs();
4165         /*
4166          * In case we exclude kernel IPs or are somehow not in interrupt
4167          * context, provide the next best thing, the user IP.
4168          */
4169         if ((event->attr.exclude_kernel || !regs) &&
4170                         !event->attr.exclude_user)
4171                 regs = task_pt_regs(current);
4172
4173         if (regs) {
4174                 if (!(event->attr.exclude_idle && current->pid == 0))
4175                         if (perf_event_overflow(event, 0, &data, regs))
4176                                 ret = HRTIMER_NORESTART;
4177         }
4178
4179         period = max_t(u64, 10000, event->hw.sample_period);
4180         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4181
4182         return ret;
4183 }
4184
4185 static void perf_swevent_start_hrtimer(struct perf_event *event)
4186 {
4187         struct hw_perf_event *hwc = &event->hw;
4188
4189         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4190         hwc->hrtimer.function = perf_swevent_hrtimer;
4191         if (hwc->sample_period) {
4192                 u64 period;
4193
4194                 if (hwc->remaining) {
4195                         if (hwc->remaining < 0)
4196                                 period = 10000;
4197                         else
4198                                 period = hwc->remaining;
4199                         hwc->remaining = 0;
4200                 } else {
4201                         period = max_t(u64, 10000, hwc->sample_period);
4202                 }
4203                 __hrtimer_start_range_ns(&hwc->hrtimer,
4204                                 ns_to_ktime(period), 0,
4205                                 HRTIMER_MODE_REL, 0);
4206         }
4207 }
4208
4209 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4210 {
4211         struct hw_perf_event *hwc = &event->hw;
4212
4213         if (hwc->sample_period) {
4214                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4215                 hwc->remaining = ktime_to_ns(remaining);
4216
4217                 hrtimer_cancel(&hwc->hrtimer);
4218         }
4219 }
4220
4221 /*
4222  * Software event: cpu wall time clock
4223  */
4224
4225 static void cpu_clock_perf_event_update(struct perf_event *event)
4226 {
4227         int cpu = raw_smp_processor_id();
4228         s64 prev;
4229         u64 now;
4230
4231         now = cpu_clock(cpu);
4232         prev = atomic64_xchg(&event->hw.prev_count, now);
4233         atomic64_add(now - prev, &event->count);
4234 }
4235
4236 static int cpu_clock_perf_event_enable(struct perf_event *event)
4237 {
4238         struct hw_perf_event *hwc = &event->hw;
4239         int cpu = raw_smp_processor_id();
4240
4241         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4242         perf_swevent_start_hrtimer(event);
4243
4244         return 0;
4245 }
4246
4247 static void cpu_clock_perf_event_disable(struct perf_event *event)
4248 {
4249         perf_swevent_cancel_hrtimer(event);
4250         cpu_clock_perf_event_update(event);
4251 }
4252
4253 static void cpu_clock_perf_event_read(struct perf_event *event)
4254 {
4255         cpu_clock_perf_event_update(event);
4256 }
4257
4258 static const struct pmu perf_ops_cpu_clock = {
4259         .enable         = cpu_clock_perf_event_enable,
4260         .disable        = cpu_clock_perf_event_disable,
4261         .read           = cpu_clock_perf_event_read,
4262 };
4263
4264 /*
4265  * Software event: task time clock
4266  */
4267
4268 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4269 {
4270         u64 prev;
4271         s64 delta;
4272
4273         prev = atomic64_xchg(&event->hw.prev_count, now);
4274         delta = now - prev;
4275         atomic64_add(delta, &event->count);
4276 }
4277
4278 static int task_clock_perf_event_enable(struct perf_event *event)
4279 {
4280         struct hw_perf_event *hwc = &event->hw;
4281         u64 now;
4282
4283         now = event->ctx->time;
4284
4285         atomic64_set(&hwc->prev_count, now);
4286
4287         perf_swevent_start_hrtimer(event);
4288
4289         return 0;
4290 }
4291
4292 static void task_clock_perf_event_disable(struct perf_event *event)
4293 {
4294         perf_swevent_cancel_hrtimer(event);
4295         task_clock_perf_event_update(event, event->ctx->time);
4296
4297 }
4298
4299 static void task_clock_perf_event_read(struct perf_event *event)
4300 {
4301         u64 time;
4302
4303         if (!in_nmi()) {
4304                 update_context_time(event->ctx);
4305                 time = event->ctx->time;
4306         } else {
4307                 u64 now = perf_clock();
4308                 u64 delta = now - event->ctx->timestamp;
4309                 time = event->ctx->time + delta;
4310         }
4311
4312         task_clock_perf_event_update(event, time);
4313 }
4314
4315 static const struct pmu perf_ops_task_clock = {
4316         .enable         = task_clock_perf_event_enable,
4317         .disable        = task_clock_perf_event_disable,
4318         .read           = task_clock_perf_event_read,
4319 };
4320
4321 #ifdef CONFIG_EVENT_TRACING
4322
4323 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4324                    int entry_size, struct pt_regs *regs)
4325 {
4326         struct perf_sample_data data;
4327         struct perf_raw_record raw = {
4328                 .size = entry_size,
4329                 .data = record,
4330         };
4331
4332         perf_sample_data_init(&data, addr);
4333         data.raw = &raw;
4334
4335         /* Trace events already protected against recursion */
4336         do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4337                          &data, regs);
4338 }
4339 EXPORT_SYMBOL_GPL(perf_tp_event);
4340
4341 static int perf_tp_event_match(struct perf_event *event,
4342                                 struct perf_sample_data *data)
4343 {
4344         void *record = data->raw->data;
4345
4346         if (likely(!event->filter) || filter_match_preds(event->filter, record))
4347                 return 1;
4348         return 0;
4349 }
4350
4351 static void tp_perf_event_destroy(struct perf_event *event)
4352 {
4353         perf_trace_disable(event->attr.config);
4354 }
4355
4356 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4357 {
4358         /*
4359          * Raw tracepoint data is a severe data leak, only allow root to
4360          * have these.
4361          */
4362         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4363                         perf_paranoid_tracepoint_raw() &&
4364                         !capable(CAP_SYS_ADMIN))
4365                 return ERR_PTR(-EPERM);
4366
4367         if (perf_trace_enable(event->attr.config))
4368                 return NULL;
4369
4370         event->destroy = tp_perf_event_destroy;
4371
4372         return &perf_ops_generic;
4373 }
4374
4375 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4376 {
4377         char *filter_str;
4378         int ret;
4379
4380         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4381                 return -EINVAL;
4382
4383         filter_str = strndup_user(arg, PAGE_SIZE);
4384         if (IS_ERR(filter_str))
4385                 return PTR_ERR(filter_str);
4386
4387         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4388
4389         kfree(filter_str);
4390         return ret;
4391 }
4392
4393 static void perf_event_free_filter(struct perf_event *event)
4394 {
4395         ftrace_profile_free_filter(event);
4396 }
4397
4398 #else
4399
4400 static int perf_tp_event_match(struct perf_event *event,
4401                                 struct perf_sample_data *data)
4402 {
4403         return 1;
4404 }
4405
4406 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4407 {
4408         return NULL;
4409 }
4410
4411 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4412 {
4413         return -ENOENT;
4414 }
4415
4416 static void perf_event_free_filter(struct perf_event *event)
4417 {
4418 }
4419
4420 #endif /* CONFIG_EVENT_TRACING */
4421
4422 #ifdef CONFIG_HAVE_HW_BREAKPOINT
4423 static void bp_perf_event_destroy(struct perf_event *event)
4424 {
4425         release_bp_slot(event);
4426 }
4427
4428 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4429 {
4430         int err;
4431
4432         err = register_perf_hw_breakpoint(bp);
4433         if (err)
4434                 return ERR_PTR(err);
4435
4436         bp->destroy = bp_perf_event_destroy;
4437
4438         return &perf_ops_bp;
4439 }
4440
4441 void perf_bp_event(struct perf_event *bp, void *data)
4442 {
4443         struct perf_sample_data sample;
4444         struct pt_regs *regs = data;
4445
4446         perf_sample_data_init(&sample, bp->attr.bp_addr);
4447
4448         if (!perf_exclude_event(bp, regs))
4449                 perf_swevent_add(bp, 1, 1, &sample, regs);
4450 }
4451 #else
4452 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4453 {
4454         return NULL;
4455 }
4456
4457 void perf_bp_event(struct perf_event *bp, void *regs)
4458 {
4459 }
4460 #endif
4461
4462 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4463
4464 static void sw_perf_event_destroy(struct perf_event *event)
4465 {
4466         u64 event_id = event->attr.config;
4467
4468         WARN_ON(event->parent);
4469
4470         atomic_dec(&perf_swevent_enabled[event_id]);
4471 }
4472
4473 static const struct pmu *sw_perf_event_init(struct perf_event *event)
4474 {
4475         const struct pmu *pmu = NULL;
4476         u64 event_id = event->attr.config;
4477
4478         /*
4479          * Software events (currently) can't in general distinguish
4480          * between user, kernel and hypervisor events.
4481          * However, context switches and cpu migrations are considered
4482          * to be kernel events, and page faults are never hypervisor
4483          * events.
4484          */
4485         switch (event_id) {
4486         case PERF_COUNT_SW_CPU_CLOCK:
4487                 pmu = &perf_ops_cpu_clock;
4488
4489                 break;
4490         case PERF_COUNT_SW_TASK_CLOCK:
4491                 /*
4492                  * If the user instantiates this as a per-cpu event,
4493                  * use the cpu_clock event instead.
4494                  */
4495                 if (event->ctx->task)
4496                         pmu = &perf_ops_task_clock;
4497                 else
4498                         pmu = &perf_ops_cpu_clock;
4499
4500                 break;
4501         case PERF_COUNT_SW_PAGE_FAULTS:
4502         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4503         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4504         case PERF_COUNT_SW_CONTEXT_SWITCHES:
4505         case PERF_COUNT_SW_CPU_MIGRATIONS:
4506         case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4507         case PERF_COUNT_SW_EMULATION_FAULTS:
4508                 if (!event->parent) {
4509                         atomic_inc(&perf_swevent_enabled[event_id]);
4510                         event->destroy = sw_perf_event_destroy;
4511                 }
4512                 pmu = &perf_ops_generic;
4513                 break;
4514         }
4515
4516         return pmu;
4517 }
4518
4519 /*
4520  * Allocate and initialize a event structure
4521  */
4522 static struct perf_event *
4523 perf_event_alloc(struct perf_event_attr *attr,
4524                    int cpu,
4525                    struct perf_event_context *ctx,
4526                    struct perf_event *group_leader,
4527                    struct perf_event *parent_event,
4528                    perf_overflow_handler_t overflow_handler,
4529                    gfp_t gfpflags)
4530 {
4531         const struct pmu *pmu;
4532         struct perf_event *event;
4533         struct hw_perf_event *hwc;
4534         long err;
4535
4536         event = kzalloc(sizeof(*event), gfpflags);
4537         if (!event)
4538                 return ERR_PTR(-ENOMEM);
4539
4540         /*
4541          * Single events are their own group leaders, with an
4542          * empty sibling list:
4543          */
4544         if (!group_leader)
4545                 group_leader = event;
4546
4547         mutex_init(&event->child_mutex);
4548         INIT_LIST_HEAD(&event->child_list);
4549
4550         INIT_LIST_HEAD(&event->group_entry);
4551         INIT_LIST_HEAD(&event->event_entry);
4552         INIT_LIST_HEAD(&event->sibling_list);
4553         init_waitqueue_head(&event->waitq);
4554
4555         mutex_init(&event->mmap_mutex);
4556
4557         event->cpu              = cpu;
4558         event->attr             = *attr;
4559         event->group_leader     = group_leader;
4560         event->pmu              = NULL;
4561         event->ctx              = ctx;
4562         event->oncpu            = -1;
4563
4564         event->parent           = parent_event;
4565
4566         event->ns               = get_pid_ns(current->nsproxy->pid_ns);
4567         event->id               = atomic64_inc_return(&perf_event_id);
4568
4569         event->state            = PERF_EVENT_STATE_INACTIVE;
4570
4571         if (!overflow_handler && parent_event)
4572                 overflow_handler = parent_event->overflow_handler;
4573
4574         event->overflow_handler = overflow_handler;
4575
4576         if (attr->disabled)
4577                 event->state = PERF_EVENT_STATE_OFF;
4578
4579         pmu = NULL;
4580
4581         hwc = &event->hw;
4582         hwc->sample_period = attr->sample_period;
4583         if (attr->freq && attr->sample_freq)
4584                 hwc->sample_period = 1;
4585         hwc->last_period = hwc->sample_period;
4586
4587         atomic64_set(&hwc->period_left, hwc->sample_period);
4588
4589         /*
4590          * we currently do not support PERF_FORMAT_GROUP on inherited events
4591          */
4592         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4593                 goto done;
4594
4595         switch (attr->type) {
4596         case PERF_TYPE_RAW:
4597         case PERF_TYPE_HARDWARE:
4598         case PERF_TYPE_HW_CACHE:
4599                 pmu = hw_perf_event_init(event);
4600                 break;
4601
4602         case PERF_TYPE_SOFTWARE:
4603                 pmu = sw_perf_event_init(event);
4604                 break;
4605
4606         case PERF_TYPE_TRACEPOINT:
4607                 pmu = tp_perf_event_init(event);
4608                 break;
4609
4610         case PERF_TYPE_BREAKPOINT:
4611                 pmu = bp_perf_event_init(event);
4612                 break;
4613
4614
4615         default:
4616                 break;
4617         }
4618 done:
4619         err = 0;
4620         if (!pmu)
4621                 err = -EINVAL;
4622         else if (IS_ERR(pmu))
4623                 err = PTR_ERR(pmu);
4624
4625         if (err) {
4626                 if (event->ns)
4627                         put_pid_ns(event->ns);
4628                 kfree(event);
4629                 return ERR_PTR(err);
4630         }
4631
4632         event->pmu = pmu;
4633
4634         if (!event->parent) {
4635                 atomic_inc(&nr_events);
4636                 if (event->attr.mmap)
4637                         atomic_inc(&nr_mmap_events);
4638                 if (event->attr.comm)
4639                         atomic_inc(&nr_comm_events);
4640                 if (event->attr.task)
4641                         atomic_inc(&nr_task_events);
4642         }
4643
4644         return event;
4645 }
4646
4647 static int perf_copy_attr(struct perf_event_attr __user *uattr,
4648                           struct perf_event_attr *attr)
4649 {
4650         u32 size;
4651         int ret;
4652
4653         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4654                 return -EFAULT;
4655
4656         /*
4657          * zero the full structure, so that a short copy will be nice.
4658          */
4659         memset(attr, 0, sizeof(*attr));
4660
4661         ret = get_user(size, &uattr->size);
4662         if (ret)
4663                 return ret;
4664
4665         if (size > PAGE_SIZE)   /* silly large */
4666                 goto err_size;
4667
4668         if (!size)              /* abi compat */
4669                 size = PERF_ATTR_SIZE_VER0;
4670
4671         if (size < PERF_ATTR_SIZE_VER0)
4672                 goto err_size;
4673
4674         /*
4675          * If we're handed a bigger struct than we know of,
4676          * ensure all the unknown bits are 0 - i.e. new
4677          * user-space does not rely on any kernel feature
4678          * extensions we dont know about yet.
4679          */
4680         if (size > sizeof(*attr)) {
4681                 unsigned char __user *addr;
4682                 unsigned char __user *end;
4683                 unsigned char val;
4684
4685                 addr = (void __user *)uattr + sizeof(*attr);
4686                 end  = (void __user *)uattr + size;
4687
4688                 for (; addr < end; addr++) {
4689                         ret = get_user(val, addr);
4690                         if (ret)
4691                                 return ret;
4692                         if (val)
4693                                 goto err_size;
4694                 }
4695                 size = sizeof(*attr);
4696         }
4697
4698         ret = copy_from_user(attr, uattr, size);
4699         if (ret)
4700                 return -EFAULT;
4701
4702         /*
4703          * If the type exists, the corresponding creation will verify
4704          * the attr->config.
4705          */
4706         if (attr->type >= PERF_TYPE_MAX)
4707                 return -EINVAL;
4708
4709         if (attr->__reserved_1)
4710                 return -EINVAL;
4711
4712         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4713                 return -EINVAL;
4714
4715         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4716                 return -EINVAL;
4717
4718 out:
4719         return ret;
4720
4721 err_size:
4722         put_user(sizeof(*attr), &uattr->size);
4723         ret = -E2BIG;
4724         goto out;
4725 }
4726
4727 static int perf_event_set_output(struct perf_event *event, int output_fd)
4728 {
4729         struct perf_event *output_event = NULL;
4730         struct file *output_file = NULL;
4731         struct perf_event *old_output;
4732         int fput_needed = 0;
4733         int ret = -EINVAL;
4734
4735         if (!output_fd)
4736                 goto set;
4737
4738         output_file = fget_light(output_fd, &fput_needed);
4739         if (!output_file)
4740                 return -EBADF;
4741
4742         if (output_file->f_op != &perf_fops)
4743                 goto out;
4744
4745         output_event = output_file->private_data;
4746
4747         /* Don't chain output fds */
4748         if (output_event->output)
4749                 goto out;
4750
4751         /* Don't set an output fd when we already have an output channel */
4752         if (event->data)
4753                 goto out;
4754
4755         atomic_long_inc(&output_file->f_count);
4756
4757 set:
4758         mutex_lock(&event->mmap_mutex);
4759         old_output = event->output;
4760         rcu_assign_pointer(event->output, output_event);
4761         mutex_unlock(&event->mmap_mutex);
4762
4763         if (old_output) {
4764                 /*
4765                  * we need to make sure no existing perf_output_*()
4766                  * is still referencing this event.
4767                  */
4768                 synchronize_rcu();
4769                 fput(old_output->filp);
4770         }
4771
4772         ret = 0;
4773 out:
4774         fput_light(output_file, fput_needed);
4775         return ret;
4776 }
4777
4778 /**
4779  * sys_perf_event_open - open a performance event, associate it to a task/cpu
4780  *
4781  * @attr_uptr:  event_id type attributes for monitoring/sampling
4782  * @pid:                target pid
4783  * @cpu:                target cpu
4784  * @group_fd:           group leader event fd
4785  */
4786 SYSCALL_DEFINE5(perf_event_open,
4787                 struct perf_event_attr __user *, attr_uptr,
4788                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4789 {
4790         struct perf_event *event, *group_leader;
4791         struct perf_event_attr attr;
4792         struct perf_event_context *ctx;
4793         struct file *event_file = NULL;
4794         struct file *group_file = NULL;
4795         int fput_needed = 0;
4796         int fput_needed2 = 0;
4797         int err;
4798
4799         /* for future expandability... */
4800         if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4801                 return -EINVAL;
4802
4803         err = perf_copy_attr(attr_uptr, &attr);
4804         if (err)
4805                 return err;
4806
4807         if (!attr.exclude_kernel) {
4808                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4809                         return -EACCES;
4810         }
4811
4812         if (attr.freq) {
4813                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4814                         return -EINVAL;
4815         }
4816
4817         /*
4818          * Get the target context (task or percpu):
4819          */
4820         ctx = find_get_context(pid, cpu);
4821         if (IS_ERR(ctx))
4822                 return PTR_ERR(ctx);
4823
4824         /*
4825          * Look up the group leader (we will attach this event to it):
4826          */
4827         group_leader = NULL;
4828         if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4829                 err = -EINVAL;
4830                 group_file = fget_light(group_fd, &fput_needed);
4831                 if (!group_file)
4832                         goto err_put_context;
4833                 if (group_file->f_op != &perf_fops)
4834                         goto err_put_context;
4835
4836                 group_leader = group_file->private_data;
4837                 /*
4838                  * Do not allow a recursive hierarchy (this new sibling
4839                  * becoming part of another group-sibling):
4840                  */
4841                 if (group_leader->group_leader != group_leader)
4842                         goto err_put_context;
4843                 /*
4844                  * Do not allow to attach to a group in a different
4845                  * task or CPU context:
4846                  */
4847                 if (group_leader->ctx != ctx)
4848                         goto err_put_context;
4849                 /*
4850                  * Only a group leader can be exclusive or pinned
4851                  */
4852                 if (attr.exclusive || attr.pinned)
4853                         goto err_put_context;
4854         }
4855
4856         event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4857                                      NULL, NULL, GFP_KERNEL);
4858         err = PTR_ERR(event);
4859         if (IS_ERR(event))
4860                 goto err_put_context;
4861
4862         err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4863         if (err < 0)
4864                 goto err_free_put_context;
4865
4866         event_file = fget_light(err, &fput_needed2);
4867         if (!event_file)
4868                 goto err_free_put_context;
4869
4870         if (flags & PERF_FLAG_FD_OUTPUT) {
4871                 err = perf_event_set_output(event, group_fd);
4872                 if (err)
4873                         goto err_fput_free_put_context;
4874         }
4875
4876         event->filp = event_file;
4877         WARN_ON_ONCE(ctx->parent_ctx);
4878         mutex_lock(&ctx->mutex);
4879         perf_install_in_context(ctx, event, cpu);
4880         ++ctx->generation;
4881         mutex_unlock(&ctx->mutex);
4882
4883         event->owner = current;
4884         get_task_struct(current);
4885         mutex_lock(&current->perf_event_mutex);
4886         list_add_tail(&event->owner_entry, &current->perf_event_list);
4887         mutex_unlock(&current->perf_event_mutex);
4888
4889 err_fput_free_put_context:
4890         fput_light(event_file, fput_needed2);
4891
4892 err_free_put_context:
4893         if (err < 0)
4894                 kfree(event);
4895
4896 err_put_context:
4897         if (err < 0)
4898                 put_ctx(ctx);
4899
4900         fput_light(group_file, fput_needed);
4901
4902         return err;
4903 }
4904
4905 /**
4906  * perf_event_create_kernel_counter
4907  *
4908  * @attr: attributes of the counter to create
4909  * @cpu: cpu in which the counter is bound
4910  * @pid: task to profile
4911  */
4912 struct perf_event *
4913 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4914                                  pid_t pid,
4915                                  perf_overflow_handler_t overflow_handler)
4916 {
4917         struct perf_event *event;
4918         struct perf_event_context *ctx;
4919         int err;
4920
4921         /*
4922          * Get the target context (task or percpu):
4923          */
4924
4925         ctx = find_get_context(pid, cpu);
4926         if (IS_ERR(ctx)) {
4927                 err = PTR_ERR(ctx);
4928                 goto err_exit;
4929         }
4930
4931         event = perf_event_alloc(attr, cpu, ctx, NULL,
4932                                  NULL, overflow_handler, GFP_KERNEL);
4933         if (IS_ERR(event)) {
4934                 err = PTR_ERR(event);
4935                 goto err_put_context;
4936         }
4937
4938         event->filp = NULL;
4939         WARN_ON_ONCE(ctx->parent_ctx);
4940         mutex_lock(&ctx->mutex);
4941         perf_install_in_context(ctx, event, cpu);
4942         ++ctx->generation;
4943         mutex_unlock(&ctx->mutex);
4944
4945         event->owner = current;
4946         get_task_struct(current);
4947         mutex_lock(&current->perf_event_mutex);
4948         list_add_tail(&event->owner_entry, &current->perf_event_list);
4949         mutex_unlock(&current->perf_event_mutex);
4950
4951         return event;
4952
4953  err_put_context:
4954         put_ctx(ctx);
4955  err_exit:
4956         return ERR_PTR(err);
4957 }
4958 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4959
4960 /*
4961  * inherit a event from parent task to child task:
4962  */
4963 static struct perf_event *
4964 inherit_event(struct perf_event *parent_event,
4965               struct task_struct *parent,
4966               struct perf_event_context *parent_ctx,
4967               struct task_struct *child,
4968               struct perf_event *group_leader,
4969               struct perf_event_context *child_ctx)
4970 {
4971         struct perf_event *child_event;
4972
4973         /*
4974          * Instead of creating recursive hierarchies of events,
4975          * we link inherited events back to the original parent,
4976          * which has a filp for sure, which we use as the reference
4977          * count:
4978          */
4979         if (parent_event->parent)
4980                 parent_event = parent_event->parent;
4981
4982         child_event = perf_event_alloc(&parent_event->attr,
4983                                            parent_event->cpu, child_ctx,
4984                                            group_leader, parent_event,
4985                                            NULL, GFP_KERNEL);
4986         if (IS_ERR(child_event))
4987                 return child_event;
4988         get_ctx(child_ctx);
4989
4990         /*
4991          * Make the child state follow the state of the parent event,
4992          * not its attr.disabled bit.  We hold the parent's mutex,
4993          * so we won't race with perf_event_{en, dis}able_family.
4994          */
4995         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4996                 child_event->state = PERF_EVENT_STATE_INACTIVE;
4997         else
4998                 child_event->state = PERF_EVENT_STATE_OFF;
4999
5000         if (parent_event->attr.freq) {
5001                 u64 sample_period = parent_event->hw.sample_period;
5002                 struct hw_perf_event *hwc = &child_event->hw;
5003
5004                 hwc->sample_period = sample_period;
5005                 hwc->last_period   = sample_period;
5006
5007                 atomic64_set(&hwc->period_left, sample_period);
5008         }
5009
5010         child_event->overflow_handler = parent_event->overflow_handler;
5011
5012         /*
5013          * Link it up in the child's context:
5014          */
5015         add_event_to_ctx(child_event, child_ctx);
5016
5017         /*
5018          * Get a reference to the parent filp - we will fput it
5019          * when the child event exits. This is safe to do because
5020          * we are in the parent and we know that the filp still
5021          * exists and has a nonzero count:
5022          */
5023         atomic_long_inc(&parent_event->filp->f_count);
5024
5025         /*
5026          * Link this into the parent event's child list
5027          */
5028         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5029         mutex_lock(&parent_event->child_mutex);
5030         list_add_tail(&child_event->child_list, &parent_event->child_list);
5031         mutex_unlock(&parent_event->child_mutex);
5032
5033         return child_event;
5034 }
5035
5036 static int inherit_group(struct perf_event *parent_event,
5037               struct task_struct *parent,
5038               struct perf_event_context *parent_ctx,
5039               struct task_struct *child,
5040               struct perf_event_context *child_ctx)
5041 {
5042         struct perf_event *leader;
5043         struct perf_event *sub;
5044         struct perf_event *child_ctr;
5045
5046         leader = inherit_event(parent_event, parent, parent_ctx,
5047                                  child, NULL, child_ctx);
5048         if (IS_ERR(leader))
5049                 return PTR_ERR(leader);
5050         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5051                 child_ctr = inherit_event(sub, parent, parent_ctx,
5052                                             child, leader, child_ctx);
5053                 if (IS_ERR(child_ctr))
5054                         return PTR_ERR(child_ctr);
5055         }
5056         return 0;
5057 }
5058
5059 static void sync_child_event(struct perf_event *child_event,
5060                                struct task_struct *child)
5061 {
5062         struct perf_event *parent_event = child_event->parent;
5063         u64 child_val;
5064
5065         if (child_event->attr.inherit_stat)
5066                 perf_event_read_event(child_event, child);
5067
5068         child_val = atomic64_read(&child_event->count);
5069
5070         /*
5071          * Add back the child's count to the parent's count:
5072          */
5073         atomic64_add(child_val, &parent_event->count);
5074         atomic64_add(child_event->total_time_enabled,
5075                      &parent_event->child_total_time_enabled);
5076         atomic64_add(child_event->total_time_running,
5077                      &parent_event->child_total_time_running);
5078
5079         /*
5080          * Remove this event from the parent's list
5081          */
5082         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5083         mutex_lock(&parent_event->child_mutex);
5084         list_del_init(&child_event->child_list);
5085         mutex_unlock(&parent_event->child_mutex);
5086
5087         /*
5088          * Release the parent event, if this was the last
5089          * reference to it.
5090          */
5091         fput(parent_event->filp);
5092 }
5093
5094 static void
5095 __perf_event_exit_task(struct perf_event *child_event,
5096                          struct perf_event_context *child_ctx,
5097                          struct task_struct *child)
5098 {
5099         struct perf_event *parent_event;
5100
5101         perf_event_remove_from_context(child_event);
5102
5103         parent_event = child_event->parent;
5104         /*
5105          * It can happen that parent exits first, and has events
5106          * that are still around due to the child reference. These
5107          * events need to be zapped - but otherwise linger.
5108          */
5109         if (parent_event) {
5110                 sync_child_event(child_event, child);
5111                 free_event(child_event);
5112         }
5113 }
5114
5115 /*
5116  * When a child task exits, feed back event values to parent events.
5117  */
5118 void perf_event_exit_task(struct task_struct *child)
5119 {
5120         struct perf_event *child_event, *tmp;
5121         struct perf_event_context *child_ctx;
5122         unsigned long flags;
5123
5124         if (likely(!child->perf_event_ctxp)) {
5125                 perf_event_task(child, NULL, 0);
5126                 return;
5127         }
5128
5129         local_irq_save(flags);
5130         /*
5131          * We can't reschedule here because interrupts are disabled,
5132          * and either child is current or it is a task that can't be
5133          * scheduled, so we are now safe from rescheduling changing
5134          * our context.
5135          */
5136         child_ctx = child->perf_event_ctxp;
5137         __perf_event_task_sched_out(child_ctx);
5138
5139         /*
5140          * Take the context lock here so that if find_get_context is
5141          * reading child->perf_event_ctxp, we wait until it has
5142          * incremented the context's refcount before we do put_ctx below.
5143          */
5144         raw_spin_lock(&child_ctx->lock);
5145         child->perf_event_ctxp = NULL;
5146         /*
5147          * If this context is a clone; unclone it so it can't get
5148          * swapped to another process while we're removing all
5149          * the events from it.
5150          */
5151         unclone_ctx(child_ctx);
5152         update_context_time(child_ctx);
5153         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
5154
5155         /*
5156          * Report the task dead after unscheduling the events so that we
5157          * won't get any samples after PERF_RECORD_EXIT. We can however still
5158          * get a few PERF_RECORD_READ events.
5159          */
5160         perf_event_task(child, child_ctx, 0);
5161
5162         /*
5163          * We can recurse on the same lock type through:
5164          *
5165          *   __perf_event_exit_task()
5166          *     sync_child_event()
5167          *       fput(parent_event->filp)
5168          *         perf_release()
5169          *           mutex_lock(&ctx->mutex)
5170          *
5171          * But since its the parent context it won't be the same instance.
5172          */
5173         mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5174
5175 again:
5176         list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5177                                  group_entry)
5178                 __perf_event_exit_task(child_event, child_ctx, child);
5179
5180         list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5181                                  group_entry)
5182                 __perf_event_exit_task(child_event, child_ctx, child);
5183
5184         /*
5185          * If the last event was a group event, it will have appended all
5186          * its siblings to the list, but we obtained 'tmp' before that which
5187          * will still point to the list head terminating the iteration.
5188          */
5189         if (!list_empty(&child_ctx->pinned_groups) ||
5190             !list_empty(&child_ctx->flexible_groups))
5191                 goto again;
5192
5193         mutex_unlock(&child_ctx->mutex);
5194
5195         put_ctx(child_ctx);
5196 }
5197
5198 static void perf_free_event(struct perf_event *event,
5199                             struct perf_event_context *ctx)
5200 {
5201         struct perf_event *parent = event->parent;
5202
5203         if (WARN_ON_ONCE(!parent))
5204                 return;
5205
5206         mutex_lock(&parent->child_mutex);
5207         list_del_init(&event->child_list);
5208         mutex_unlock(&parent->child_mutex);
5209
5210         fput(parent->filp);
5211
5212         list_del_event(event, ctx);
5213         free_event(event);
5214 }
5215
5216 /*
5217  * free an unexposed, unused context as created by inheritance by
5218  * init_task below, used by fork() in case of fail.
5219  */
5220 void perf_event_free_task(struct task_struct *task)
5221 {
5222         struct perf_event_context *ctx = task->perf_event_ctxp;
5223         struct perf_event *event, *tmp;
5224
5225         if (!ctx)
5226                 return;
5227
5228         mutex_lock(&ctx->mutex);
5229 again:
5230         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5231                 perf_free_event(event, ctx);
5232
5233         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5234                                  group_entry)
5235                 perf_free_event(event, ctx);
5236
5237         if (!list_empty(&ctx->pinned_groups) ||
5238             !list_empty(&ctx->flexible_groups))
5239                 goto again;
5240
5241         mutex_unlock(&ctx->mutex);
5242
5243         put_ctx(ctx);
5244 }
5245
5246 static int
5247 inherit_task_group(struct perf_event *event, struct task_struct *parent,
5248                    struct perf_event_context *parent_ctx,
5249                    struct task_struct *child,
5250                    int *inherited_all)
5251 {
5252         int ret;
5253         struct perf_event_context *child_ctx = child->perf_event_ctxp;
5254
5255         if (!event->attr.inherit) {
5256                 *inherited_all = 0;
5257                 return 0;
5258         }
5259
5260         if (!child_ctx) {
5261                 /*
5262                  * This is executed from the parent task context, so
5263                  * inherit events that have been marked for cloning.
5264                  * First allocate and initialize a context for the
5265                  * child.
5266                  */
5267
5268                 child_ctx = kzalloc(sizeof(struct perf_event_context),
5269                                     GFP_KERNEL);
5270                 if (!child_ctx)
5271                         return -ENOMEM;
5272
5273                 __perf_event_init_context(child_ctx, child);
5274                 child->perf_event_ctxp = child_ctx;
5275                 get_task_struct(child);
5276         }
5277
5278         ret = inherit_group(event, parent, parent_ctx,
5279                             child, child_ctx);
5280
5281         if (ret)
5282                 *inherited_all = 0;
5283
5284         return ret;
5285 }
5286
5287
5288 /*
5289  * Initialize the perf_event context in task_struct
5290  */
5291 int perf_event_init_task(struct task_struct *child)
5292 {
5293         struct perf_event_context *child_ctx, *parent_ctx;
5294         struct perf_event_context *cloned_ctx;
5295         struct perf_event *event;
5296         struct task_struct *parent = current;
5297         int inherited_all = 1;
5298         int ret = 0;
5299
5300         child->perf_event_ctxp = NULL;
5301
5302         mutex_init(&child->perf_event_mutex);
5303         INIT_LIST_HEAD(&child->perf_event_list);
5304
5305         if (likely(!parent->perf_event_ctxp))
5306                 return 0;
5307
5308         /*
5309          * If the parent's context is a clone, pin it so it won't get
5310          * swapped under us.
5311          */
5312         parent_ctx = perf_pin_task_context(parent);
5313
5314         /*
5315          * No need to check if parent_ctx != NULL here; since we saw
5316          * it non-NULL earlier, the only reason for it to become NULL
5317          * is if we exit, and since we're currently in the middle of
5318          * a fork we can't be exiting at the same time.
5319          */
5320
5321         /*
5322          * Lock the parent list. No need to lock the child - not PID
5323          * hashed yet and not running, so nobody can access it.
5324          */
5325         mutex_lock(&parent_ctx->mutex);
5326
5327         /*
5328          * We dont have to disable NMIs - we are only looking at
5329          * the list, not manipulating it:
5330          */
5331         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5332                 ret = inherit_task_group(event, parent, parent_ctx, child,
5333                                          &inherited_all);
5334                 if (ret)
5335                         break;
5336         }
5337
5338         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5339                 ret = inherit_task_group(event, parent, parent_ctx, child,
5340                                          &inherited_all);
5341                 if (ret)
5342                         break;
5343         }
5344
5345         child_ctx = child->perf_event_ctxp;
5346
5347         if (child_ctx && inherited_all) {
5348                 /*
5349                  * Mark the child context as a clone of the parent
5350                  * context, or of whatever the parent is a clone of.
5351                  * Note that if the parent is a clone, it could get
5352                  * uncloned at any point, but that doesn't matter
5353                  * because the list of events and the generation
5354                  * count can't have changed since we took the mutex.
5355                  */
5356                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
5357                 if (cloned_ctx) {
5358                         child_ctx->parent_ctx = cloned_ctx;
5359                         child_ctx->parent_gen = parent_ctx->parent_gen;
5360                 } else {
5361                         child_ctx->parent_ctx = parent_ctx;
5362                         child_ctx->parent_gen = parent_ctx->generation;
5363                 }
5364                 get_ctx(child_ctx->parent_ctx);
5365         }
5366
5367         mutex_unlock(&parent_ctx->mutex);
5368
5369         perf_unpin_context(parent_ctx);
5370
5371         return ret;
5372 }
5373
5374 static void __init perf_event_init_all_cpus(void)
5375 {
5376         int cpu;
5377         struct perf_cpu_context *cpuctx;
5378
5379         for_each_possible_cpu(cpu) {
5380                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5381                 __perf_event_init_context(&cpuctx->ctx, NULL);
5382         }
5383 }
5384
5385 static void __cpuinit perf_event_init_cpu(int cpu)
5386 {
5387         struct perf_cpu_context *cpuctx;
5388
5389         cpuctx = &per_cpu(perf_cpu_context, cpu);
5390
5391         spin_lock(&perf_resource_lock);
5392         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5393         spin_unlock(&perf_resource_lock);
5394 }
5395
5396 #ifdef CONFIG_HOTPLUG_CPU
5397 static void __perf_event_exit_cpu(void *info)
5398 {
5399         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5400         struct perf_event_context *ctx = &cpuctx->ctx;
5401         struct perf_event *event, *tmp;
5402
5403         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5404                 __perf_event_remove_from_context(event);
5405         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5406                 __perf_event_remove_from_context(event);
5407 }
5408 static void perf_event_exit_cpu(int cpu)
5409 {
5410         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5411         struct perf_event_context *ctx = &cpuctx->ctx;
5412
5413         mutex_lock(&ctx->mutex);
5414         smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5415         mutex_unlock(&ctx->mutex);
5416 }
5417 #else
5418 static inline void perf_event_exit_cpu(int cpu) { }
5419 #endif
5420
5421 static int __cpuinit
5422 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5423 {
5424         unsigned int cpu = (long)hcpu;
5425
5426         switch (action) {
5427
5428         case CPU_UP_PREPARE:
5429         case CPU_UP_PREPARE_FROZEN:
5430                 perf_event_init_cpu(cpu);
5431                 break;
5432
5433         case CPU_DOWN_PREPARE:
5434         case CPU_DOWN_PREPARE_FROZEN:
5435                 perf_event_exit_cpu(cpu);
5436                 break;
5437
5438         default:
5439                 break;
5440         }
5441
5442         return NOTIFY_OK;
5443 }
5444
5445 /*
5446  * This has to have a higher priority than migration_notifier in sched.c.
5447  */
5448 static struct notifier_block __cpuinitdata perf_cpu_nb = {
5449         .notifier_call          = perf_cpu_notify,
5450         .priority               = 20,
5451 };
5452
5453 void __init perf_event_init(void)
5454 {
5455         perf_event_init_all_cpus();
5456         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5457                         (void *)(long)smp_processor_id());
5458         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5459                         (void *)(long)smp_processor_id());
5460         register_cpu_notifier(&perf_cpu_nb);
5461 }
5462
5463 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5464                                         struct sysdev_class_attribute *attr,
5465                                         char *buf)
5466 {
5467         return sprintf(buf, "%d\n", perf_reserved_percpu);
5468 }
5469
5470 static ssize_t
5471 perf_set_reserve_percpu(struct sysdev_class *class,
5472                         struct sysdev_class_attribute *attr,
5473                         const char *buf,
5474                         size_t count)
5475 {
5476         struct perf_cpu_context *cpuctx;
5477         unsigned long val;
5478         int err, cpu, mpt;
5479
5480         err = strict_strtoul(buf, 10, &val);
5481         if (err)
5482                 return err;
5483         if (val > perf_max_events)
5484                 return -EINVAL;
5485
5486         spin_lock(&perf_resource_lock);
5487         perf_reserved_percpu = val;
5488         for_each_online_cpu(cpu) {
5489                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5490                 raw_spin_lock_irq(&cpuctx->ctx.lock);
5491                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5492                           perf_max_events - perf_reserved_percpu);
5493                 cpuctx->max_pertask = mpt;
5494                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5495         }
5496         spin_unlock(&perf_resource_lock);
5497
5498         return count;
5499 }
5500
5501 static ssize_t perf_show_overcommit(struct sysdev_class *class,
5502                                     struct sysdev_class_attribute *attr,
5503                                     char *buf)
5504 {
5505         return sprintf(buf, "%d\n", perf_overcommit);
5506 }
5507
5508 static ssize_t
5509 perf_set_overcommit(struct sysdev_class *class,
5510                     struct sysdev_class_attribute *attr,
5511                     const char *buf, size_t count)
5512 {
5513         unsigned long val;
5514         int err;
5515
5516         err = strict_strtoul(buf, 10, &val);
5517         if (err)
5518                 return err;
5519         if (val > 1)
5520                 return -EINVAL;
5521
5522         spin_lock(&perf_resource_lock);
5523         perf_overcommit = val;
5524         spin_unlock(&perf_resource_lock);
5525
5526         return count;
5527 }
5528
5529 static SYSDEV_CLASS_ATTR(
5530                                 reserve_percpu,
5531                                 0644,
5532                                 perf_show_reserve_percpu,
5533                                 perf_set_reserve_percpu
5534                         );
5535
5536 static SYSDEV_CLASS_ATTR(
5537                                 overcommit,
5538                                 0644,
5539                                 perf_show_overcommit,
5540                                 perf_set_overcommit
5541                         );
5542
5543 static struct attribute *perfclass_attrs[] = {
5544         &attr_reserve_percpu.attr,
5545         &attr_overcommit.attr,
5546         NULL
5547 };
5548
5549 static struct attribute_group perfclass_attr_group = {
5550         .attrs                  = perfclass_attrs,
5551         .name                   = "perf_events",
5552 };
5553
5554 static int __init perf_event_sysfs_init(void)
5555 {
5556         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5557                                   &perfclass_attr_group);
5558 }
5559 device_initcall(perf_event_sysfs_init);