/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */
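
/*
 * Data-flow overview (a sketch, not a normative description): samples
 * are produced from interrupt/NMI context via the oprofile_add_*()
 * entry points below, which only ever advance head_pos.  The per-CPU
 * delayed work (wq_sync_buffer) periodically calls sync_buffer(),
 * which drains entries from the tail end.  That single-writer/
 * single-reader split is what allows logging without locks.
 */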

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

DEFINE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

void free_cpu_buffers(void)
{
        int i;

        for_each_online_cpu(i) {
                vfree(per_cpu(cpu_buffer, i).buffer);
                per_cpu(cpu_buffer, i).buffer = NULL;
        }
}

int alloc_cpu_buffers(void)
{
        int i;

        unsigned long buffer_size = fs_cpu_buffer_size;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);

                b->buffer = vmalloc_node(sizeof(struct op_sample) * buffer_size,
                        cpu_to_node(i));
                if (!b->buffer)
                        goto fail;

                b->last_task = NULL;
                b->last_is_kernel = -1;
                b->tracing = 0;
                b->buffer_size = buffer_size;
                b->tail_pos = 0;
                b->head_pos = 0;
                b->sample_received = 0;
                b->sample_lost_overflow = 0;
                b->backtrace_aborted = 0;
                b->sample_invalid_eip = 0;
                b->cpu = i;
                INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
        }
        return 0;

fail:
        free_cpu_buffers();
        return -ENOMEM;
}

void start_cpu_work(void)
{
        int i;

        work_enabled = 1;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);

                /*
                 * Spread the work by 1 jiffy per cpu so they don't all
                 * fire at once.
                 */
                schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
        }
}

void end_cpu_work(void)
{
        int i;

        work_enabled = 0;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);

                cancel_delayed_work(&b->work);
        }

        flush_scheduled_work();
}

/* Resets the cpu buffer to a sane state. */
void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf)
{
        /* reset these to invalid values; the next sample
         * collected will populate the buffer with proper
         * values to initialize the buffer
         */
        cpu_buf->last_is_kernel = -1;
        cpu_buf->last_task = NULL;
}

/* compute number of available slots in cpu_buffer queue */
static unsigned long nr_available_slots(struct oprofile_cpu_buffer const * b)
{
        unsigned long head = b->head_pos;
        unsigned long tail = b->tail_pos;

        if (tail > head)
                return (tail - head) - 1;

        return tail + (b->buffer_size - head) - 1;
}
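
/*
 * Illustrative counterpart to nr_available_slots(), not used by the
 * code in this file: a reader-side count of how many entries currently
 * sit between tail_pos and head_pos.  One slot is deliberately kept
 * free above so that head == tail always means "empty" rather than
 * "full".  A sketch only; the real consumer lives in buffer_sync.c.
 */
static inline unsigned long nr_filled_slots(struct oprofile_cpu_buffer const * b)
{
        unsigned long head = b->head_pos;
        unsigned long tail = b->tail_pos;

        if (head >= tail)
                return head - tail;

        return head + (b->buffer_size - tail);
}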

static void increment_head(struct oprofile_cpu_buffer * b)
{
        unsigned long new_head = b->head_pos + 1;

        /* Ensure anything written to the slot before we
         * increment is visible */
        wmb();

        if (new_head < b->buffer_size)
                b->head_pos = new_head;
        else
                b->head_pos = 0;
}

static inline void
add_sample(struct oprofile_cpu_buffer * cpu_buf,
           unsigned long pc, unsigned long event)
{
        struct op_sample * entry = &cpu_buf->buffer[cpu_buf->head_pos];
        entry->eip = pc;
        entry->event = event;
        increment_head(cpu_buf);
}

static inline void
add_code(struct oprofile_cpu_buffer * buffer, unsigned long value)
{
        add_sample(buffer, ESCAPE_CODE, value);
}

/* This must be safe from any context. It's safe writing here
 * because of the head/tail separation of the writer and reader
 * of the CPU buffer.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc,
                      int is_kernel, unsigned long event)
{
        struct task_struct * task;

        cpu_buf->sample_received++;

        if (pc == ESCAPE_CODE) {
                cpu_buf->sample_invalid_eip++;
                return 0;
        }

        if (nr_available_slots(cpu_buf) < 3) {
                cpu_buf->sample_lost_overflow++;
                return 0;
        }

        is_kernel = !!is_kernel;

        task = current;

        /* notice a switch from user->kernel or vice versa */
        if (cpu_buf->last_is_kernel != is_kernel) {
                cpu_buf->last_is_kernel = is_kernel;
                add_code(cpu_buf, is_kernel);
        }

        /* notice a task switch */
        if (cpu_buf->last_task != task) {
                cpu_buf->last_task = task;
                add_code(cpu_buf, (unsigned long)task);
        }

        add_sample(cpu_buf, pc, event);
        return 1;
}
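
/*
 * For illustration only (the exact decoding lives in buffer_sync.c):
 * after the code above, a stretch of the ring might contain records
 * such as
 *
 *      { ESCAPE_CODE, 1 }              kernel enter (is_kernel == 1)
 *      { ESCAPE_CODE, (task pointer) } task switch marker
 *      { pc, event }                   ordinary sample
 *      { pc, event }                   ordinary sample
 *
 * which is why log_sample() refuses a pc equal to ESCAPE_CODE and why
 * it requires at least three free slots before logging anything.
 */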

static int oprofile_begin_trace(struct oprofile_cpu_buffer * cpu_buf)
{
        if (nr_available_slots(cpu_buf) < 4) {
                cpu_buf->sample_lost_overflow++;
                return 0;
        }

        add_code(cpu_buf, CPU_TRACE_BEGIN);
        cpu_buf->tracing = 1;
        return 1;
}

static void oprofile_end_trace(struct oprofile_cpu_buffer * cpu_buf)
{
        cpu_buf->tracing = 0;
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                                unsigned long event, int is_kernel)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);

        if (!backtrace_depth) {
                log_sample(cpu_buf, pc, is_kernel, event);
                return;
        }

        if (!oprofile_begin_trace(cpu_buf))
                return;

        /* If log_sample() fails we can't backtrace, since we lost the
         * source of this event */
        if (log_sample(cpu_buf, pc, is_kernel, event))
                oprofile_ops.backtrace(regs, backtrace_depth);
        oprofile_end_trace(cpu_buf);
}
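
/*
 * Sketch of what a backtraced sample ends up looking like in the ring
 * (illustrative; the consumer's interpretation is in buffer_sync.c):
 *
 *      { ESCAPE_CODE, CPU_TRACE_BEGIN }  from oprofile_begin_trace()
 *      { pc, event }                     the sample itself, possibly
 *                                        preceded by kernel/task markers
 *      { caller pc, 0 }                  one entry per frame, added by
 *      { caller pc, 0 }                  oprofile_add_trace() below
 *
 * oprofile_end_trace() only clears the tracing flag; it does not add
 * a record of its own.
 */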

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
        int is_kernel = !user_mode(regs);
        unsigned long pc = profile_pc(regs);

        oprofile_add_ext_sample(pc, regs, event, is_kernel);
}

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
        log_sample(cpu_buf, pc, is_kernel, event);
}

void oprofile_add_trace(unsigned long pc)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);

        if (!cpu_buf->tracing)
                return;

        if (nr_available_slots(cpu_buf) < 1) {
                cpu_buf->tracing = 0;
                cpu_buf->sample_lost_overflow++;
                return;
        }

        /* A broken frame can give an eip with the same value as an
         * escape code; abort the trace if we get it */
        if (pc == ESCAPE_CODE) {
                cpu_buf->tracing = 0;
                cpu_buf->backtrace_aborted++;
                return;
        }

        add_sample(cpu_buf, pc, 0);
}

/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses.
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu.
 */
static void wq_sync_buffer(struct work_struct *work)
{
        struct oprofile_cpu_buffer * b =
                container_of(work, struct oprofile_cpu_buffer, work.work);
        if (b->cpu != smp_processor_id()) {
                printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n",
                       smp_processor_id(), b->cpu);
        }
        sync_buffer(b->cpu);

        /* don't re-add the work if we're shutting down */
        if (work_enabled)
                schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}