ipv6: fix race condition regarding dst->expires and dst->from.
[pandora-kernel.git] / tools / perf / builtin-record.c
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #define _FILE_OFFSET_BITS 64
9
10 #include "builtin.h"
11
12 #include "perf.h"
13
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
18
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29
30 #include <unistd.h>
31 #include <sched.h>
32 #include <sys/mman.h>
33
#ifndef HAVE_ON_EXIT
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
/*
 * Minimal fallback implementation of the GNU on_exit(3) API for C
 * libraries that lack it: registered handlers receive the process exit
 * status plus a caller-supplied argument.  Since atexit() handlers take
 * no arguments, the status is captured by wrapping exit() below.
 */
static int __on_exit_count = 0;
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];
static int __exitcode = 0;
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
/* Record the status passed to exit() so the handlers below can see it. */
#define exit(x) (exit)(__exitcode = (x))

/*
 * Register @function to run at process exit with @arg.
 * Returns 0 on success, -ENOMEM when the fixed-size table is full.
 */
static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;
	else if (__on_exit_count == 0)
		atexit(__handle_on_exit_funcs);	/* hook into atexit() once */
	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

/* atexit() trampoline: invoke registered handlers in registration order. */
static void __handle_on_exit_funcs(void)
{
	int i;
	for (i = 0; i < __on_exit_count; i++)
		__on_exit_funcs[i] (__exitcode, __on_exit_args[i]);
}
#endif
65
/* How an existing output file is treated. */
enum write_mode_t {
	WRITE_FORCE,	/* overwrite; previous data is renamed to <name>.old */
	WRITE_APPEND	/* append to an existing perf.data */
};

/* All state for one 'perf record' invocation. */
struct perf_record {
	struct perf_tool	tool;		/* event-processing callbacks */
	struct perf_record_opts opts;
	u64			bytes_written;	/* event payload bytes written so far */
	const char		*output_name;	/* output path; NULL until resolved */
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;	/* argv[0], for psignal() messages */
	int			output;		/* output file descriptor */
	unsigned int		page_size;
	int			realtime_prio;	/* SCHED_FIFO priority; 0 = off */
	enum write_mode_t	write_mode;
	bool			no_buildid;
	bool			no_buildid_cache;
	bool			force;
	bool			file_new;	/* writing a fresh file (not append) */
	bool			append_file;
	long			samples;	/* mmap reads that found data */
	off_t			post_processing_offset;	/* where event data starts */
};
91
92 static void advance_output(struct perf_record *rec, size_t size)
93 {
94         rec->bytes_written += size;
95 }
96
97 static int write_output(struct perf_record *rec, void *buf, size_t size)
98 {
99         while (size) {
100                 int ret = write(rec->output, buf, size);
101
102                 if (ret < 0) {
103                         pr_err("failed to write\n");
104                         return -1;
105                 }
106
107                 size -= ret;
108                 buf += ret;
109
110                 rec->bytes_written += ret;
111         }
112
113         return 0;
114 }
115
116 static int process_synthesized_event(struct perf_tool *tool,
117                                      union perf_event *event,
118                                      struct perf_sample *sample __maybe_unused,
119                                      struct machine *machine __maybe_unused)
120 {
121         struct perf_record *rec = container_of(tool, struct perf_record, tool);
122         if (write_output(rec, event, event->header.size) < 0)
123                 return -1;
124
125         return 0;
126 }
127
/*
 * Copy all new data from one mmap'ed ring buffer to the output file
 * and advance the tail pointer so the kernel can reuse the space.
 *
 * Returns 0 on success (including "nothing to read"), -1 if writing
 * to the output file failed.
 */
static int perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	/* event data follows the metadata page at the start of the mapping */
	unsigned char *data = md->base + rec->page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	if (old == head)
		return 0;	/* ring buffer is empty */

	rec->samples++;

	size = head - old;

	/*
	 * If the new data wraps around the end of the ring buffer,
	 * first write the chunk from 'old' up to the end of the buffer...
	 */
	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	/* ...then the (remaining) chunk starting at the buffer base. */
	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = old;
	/* publish the new tail so the kernel may overwrite consumed data */
	perf_mmap__write_tail(md, old);

out:
	return rc;
}
171
/* Written from signal context, polled by the main record loop. */
static volatile int done = 0;		/* shutdown requested */
static volatile int signr = -1;		/* which signal stopped us, -1 = none */
static volatile int child_finished = 0;	/* the workload child has exited */
175
176 static void sig_handler(int sig)
177 {
178         if (sig == SIGCHLD)
179                 child_finished = 1;
180
181         done = 1;
182         signr = sig;
183 }
184
/*
 * on_exit() handler: reap the workload child (terminating it first if
 * it is still running), then re-raise any fatal signal with default
 * disposition so the parent shell sees the true exit reason.
 */
static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	/* SIGUSR1 is treated as a normal, quiet stop - don't re-raise. */
	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}
205
206 static bool perf_evlist__equal(struct perf_evlist *evlist,
207                                struct perf_evlist *other)
208 {
209         struct perf_evsel *pos, *pair;
210
211         if (evlist->nr_entries != other->nr_entries)
212                 return false;
213
214         pair = perf_evlist__first(other);
215
216         list_for_each_entry(pos, &evlist->entries, node) {
217                 if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
218                         return false;
219                 pair = perf_evsel__next(pair);
220         }
221
222         return true;
223 }
224
/*
 * Configure, open and mmap all counters in the record's evlist.
 * On old kernels several attr features are retried without the
 * unsupported bits (guest/host exclusion, sample_id_all), and an
 * unsupported hardware cycles event falls back to the software
 * cpu-clock event.  Returns 0 on success, negative errno-style
 * value on failure.
 */
static int perf_record__open(struct perf_record *rec)
{
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * Set the evsel leader links before we configure attributes,
	 * since some might depend on this info.
	 */
	if (opts->group)
		perf_evlist__set_leader(evlist);

	perf_evlist__config_attrs(evlist, opts);

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

fallback_missing_features:
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
retry_sample_id:
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				ui__error_paranoid();
				rc = -err;
				goto out;
			} else if (err ==  ENODEV && opts->target.cpu_list) {
				pr_err("No such device - did you specify"
				       " an out-of-range profile CPU?\n");
				rc = -err;
				goto out;
			} else if (err == EINVAL) {
				/* EINVAL: retry with features old kernels lack */
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
				} else if (!opts->sample_id_all_missing) {
					/*
					 * Old kernel, no attr->sample_id_type_all field
					 */
					opts->sample_id_all_missing = true;
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
						attr->sample_type &= ~PERF_SAMPLE_TIME;

					goto retry_sample_id;
				}
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support.
			 *
			 * PPC returns ENXIO until 2.6.37 (behavior changed
			 * with commit b0a873e).
			 */
			if ((err == ENOENT || err == ENXIO)
					&& attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				/* drop the cached name so it is regenerated */
				if (pos->name) {
					free(pos->name);
					pos->name = NULL;
				}
				goto try_again;
			}

			/* no fallback applied - report and bail out */
			if (err == ENOENT) {
				ui__error("The %s event is not supported.\n",
					  perf_evsel__name(pos));
				rc = -err;
				goto out;
			} else if ((err == EOPNOTSUPP) && (attr->precise_ip)) {
				ui__error("\'precise\' request may not be supported. "
					  "Try removing 'p' modifier\n");
				rc = -err;
				goto out;
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d "
			      "(%s) for event %s. /bin/dmesg may provide "
			      "additional information.\n",
			      err, strerror(err), perf_evsel__name(pos));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE &&
			    err == EOPNOTSUPP) {
				pr_err("No hardware sampling interrupt available."
				       " No APIC? If so then you can boot the kernel"
				       " with the \"lapic\" boot parameter to"
				       " force-enable it.\n");
				rc = -err;
				goto out;
			}
#endif

			pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
			rc = -err;
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
		} else if (!is_power_of_2(opts->mmap_pages) &&
			   (opts->mmap_pages != UINT_MAX)) {
			pr_err("--mmap_pages/-m value must be a power of two.");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
	}

	/* For append mode the new events must match the file's events. */
	if (rec->file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			rc = -1;
			goto out;
		}
	}

	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
395
396 static int process_buildids(struct perf_record *rec)
397 {
398         u64 size = lseek(rec->output, 0, SEEK_CUR);
399
400         if (size == 0)
401                 return 0;
402
403         rec->session->fd = rec->output;
404         return __perf_session__process_events(rec->session, rec->post_processing_offset,
405                                               size - rec->post_processing_offset,
406                                               size, &build_id__mark_dso_hit_ops);
407 }
408
409 static void perf_record__exit(int status, void *arg)
410 {
411         struct perf_record *rec = arg;
412
413         if (status != 0)
414                 return;
415
416         if (!rec->opts.pipe_output) {
417                 rec->session->header.data_size += rec->bytes_written;
418
419                 if (!rec->no_buildid)
420                         process_buildids(rec);
421                 perf_session__write_header(rec->session, rec->evlist,
422                                            rec->output, true);
423                 perf_session__delete(rec->session);
424                 perf_evlist__delete(rec->evlist);
425                 symbol__exit();
426         }
427 }
428
/*
 * Per-machine callback: synthesize the module and kernel mmap events a
 * guest needs so its samples can later be resolved to symbols.  The
 * host machine is skipped (handled separately by the caller).
 */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	if (machine__is_host(machine))
		return;

	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
464
/*
 * Header-only synthetic event emitted after each full pass over the
 * mmap buffers (when tracing data is present); marks a round boundary
 * for the reader's sample reordering.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
469
470 static int perf_record__mmap_read_all(struct perf_record *rec)
471 {
472         int i;
473         int rc = 0;
474
475         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
476                 if (rec->evlist->mmap[i].base) {
477                         if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
478                                 rc = -1;
479                                 goto out;
480                         }
481                 }
482         }
483
484         if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
485                 rc = write_output(rec, &finished_round_event,
486                                   sizeof(finished_round_event));
487
488 out:
489         return rc;
490 }
491
/*
 * Top level of 'perf record': resolve the output destination, create
 * the session, open/mmap the counters, synthesize the initial metadata
 * events, start the workload (if one was given) and loop draining the
 * ring buffers until asked to stop.
 *
 * Returns 0 on success, negative on error.  Note that final file and
 * session teardown happens in the on_exit() handlers registered here,
 * not on the return path.
 */
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;	/* poll() wakeups, for the final stats */
	const bool forks = argc > 0;	/* a workload command was supplied */
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;
	bool disabled = false;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	/* Make sure the workload child is reaped and signals re-raised. */
	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	/* Work out where the data goes: pipe, "-", or a regular file. */
	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			if (rec->write_mode == WRITE_FORCE) {
				/* keep the previous data as <name>.old */
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			/* nothing to append to - fall back to overwrite */
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		return -1;
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	/* Enable every header feature, then clear the inapplicable ones. */
	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	/* Fork (but don't yet exec) the workload before opening counters. */
	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			goto out_delete_session;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	/* Everything written from here on is event data. */
	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		err = -1;
		goto out_delete_session;
	}

	/* A pipe reader gets no file header, so synthesize the metadata. */
	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			advance_output(rec, err);
		}
	}

	/* Kernel and module maps, for resolving kernel-space samples. */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session, tool,
					       perf_event__synthesize_guest_os);

	/* Pre-existing threads and mmaps of whatever we are tracing. */
	if (!opts->target.system_wide)
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);

	if (err != 0)
		goto out_delete_session;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!perf_target__none(&opts->target))
		perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

	/* Main capture loop: drain buffers, sleep until there is more data. */
	for (;;) {
		int hits = rec->samples;

		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		if (hits == rec->samples) {
			/* no new data: exit if asked, else wait for events */
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !perf_target__none(&opts->target)) {
			perf_evlist__disable(evsel_list);
			disabled = true;
		}
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}
765
/* Helpers to build the name -> PERF_SAMPLE_BRANCH_* mapping table. */
#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

struct branch_mode {
	const char *name;	/* user-visible filter keyword */
	int mode;		/* corresponding PERF_SAMPLE_BRANCH_* bit */
};

/* Filter keywords accepted by -b/--branch-filter (NULL-terminated). */
static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};
786
/*
 * Option callback for -b/--branch-filter: parse a comma-separated list
 * of filter names into the PERF_SAMPLE_BRANCH_* bit mask stored at
 * opt->value.  When only privilege-level bits (u/k/hv) were requested,
 * defaults the branch type to "any".  Returns 0 on success, -1 on a
 * duplicate option, allocation failure or unknown filter name.
 */
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			/* split off the next comma-separated token, in place */
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			/* look the token up in the filter table */
			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}
849
#ifdef LIBUNWIND_SUPPORT
/*
 * Parse the user supplied stack dump size for "-g dwarf,<size>".
 *
 * On success, stores the size rounded up to a multiple of sizeof(u64)
 * into *_size and returns 0.  Returns -1 with an error message when the
 * string is not a valid number, has trailing garbage, overflows, rounds
 * to zero, or exceeds round_down(USHRT_MAX, sizeof(u64)).
 */
static int get_stack_size(char *str, unsigned long *_size)
{
	char *endptr;
	unsigned long size;
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));

	errno = 0;
	size = strtoul(str, &endptr, 0);

	/* reject trailing garbage and out-of-range (ERANGE) input */
	if (*endptr == '\0' && errno == 0) {
		size = round_up(size, sizeof(u64));
		if (size && size <= max_size) {
			*_size = size;
			return 0;
		}
	}

	/* %lu: max_size is unsigned long (the original %ld was mismatched) */
	pr_err("callchain: Incorrect stack dump size (max %lu): %s\n",
	       max_size, str);
	return -1;
}
#endif /* LIBUNWIND_SUPPORT */
877
878 static int
879 parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg,
880                     int unset)
881 {
882         struct perf_record *rec = (struct perf_record *)opt->value;
883         char *tok, *name, *saveptr = NULL;
884         char *buf;
885         int ret = -1;
886
887         /* --no-call-graph */
888         if (unset)
889                 return 0;
890
891         /* We specified default option if none is provided. */
892         BUG_ON(!arg);
893
894         /* We need buffer that we know we can write to. */
895         buf = malloc(strlen(arg) + 1);
896         if (!buf)
897                 return -ENOMEM;
898
899         strcpy(buf, arg);
900
901         tok = strtok_r((char *)buf, ",", &saveptr);
902         name = tok ? : (char *)buf;
903
904         do {
905                 /* Framepointer style */
906                 if (!strncmp(name, "fp", sizeof("fp"))) {
907                         if (!strtok_r(NULL, ",", &saveptr)) {
908                                 rec->opts.call_graph = CALLCHAIN_FP;
909                                 ret = 0;
910                         } else
911                                 pr_err("callchain: No more arguments "
912                                        "needed for -g fp\n");
913                         break;
914
915 #ifdef LIBUNWIND_SUPPORT
916                 /* Dwarf style */
917                 } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
918                         const unsigned long default_stack_dump_size = 8192;
919
920                         ret = 0;
921                         rec->opts.call_graph = CALLCHAIN_DWARF;
922                         rec->opts.stack_dump_size = default_stack_dump_size;
923
924                         tok = strtok_r(NULL, ",", &saveptr);
925                         if (tok) {
926                                 unsigned long size = 0;
927
928                                 ret = get_stack_size(tok, &size);
929                                 rec->opts.stack_dump_size = size;
930                         }
931
932                         if (!ret)
933                                 pr_debug("callchain: stack dump size %d\n",
934                                          rec->opts.stack_dump_size);
935 #endif /* LIBUNWIND_SUPPORT */
936                 } else {
937                         pr_err("callchain: Unknown -g option "
938                                "value: %s\n", arg);
939                         break;
940                 }
941
942         } while (0);
943
944         free(buf);
945
946         if (!ret)
947                 pr_debug("callchain: type %d\n", rec->opts.call_graph);
948
949         return ret;
950 }
951
/* Usage strings printed by -h/--help and usage_with_options() */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
957
958 /*
959  * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
960  * because we need to have access to it in perf_record__exit, that is called
961  * after cmd_record() exits, but since record_options need to be accessible to
962  * builtin-script, leave it here.
963  *
964  * At least we don't ouch it in all the other functions here directly.
965  *
966  * Just say no to tons of global variables, sigh.
967  */
968 static struct perf_record record = {
969         .opts = {
970                 .mmap_pages          = UINT_MAX,
971                 .user_freq           = UINT_MAX,
972                 .user_interval       = ULLONG_MAX,
973                 .freq                = 4000,
974                 .target              = {
975                         .uses_mmap   = true,
976                 },
977         },
978         .write_mode = WRITE_FORCE,
979         .file_new   = true,
980 };
981
/* Help text for -g; the "dwarf" method is only offered with libunwind. */
#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
static const char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#else
static const char callchain_help[] = CALLCHAIN_HELP "[fp]";
#endif
989
990 /*
991  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
992  * with it and switch to use the library functions in perf_evlist that came
993  * from builtin-record.c, i.e. use perf_record_opts,
994  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
995  * using pipes, etc.
996  */
997 const struct option record_options[] = {
998         OPT_CALLBACK('e', "event", &record.evlist, "event",
999                      "event selector. use 'perf list' to list available events",
1000                      parse_events_option),
1001         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1002                      "event filter", parse_filter),
1003         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1004                     "record events on existing process id"),
1005         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1006                     "record events on existing thread id"),
1007         OPT_INTEGER('r', "realtime", &record.realtime_prio,
1008                     "collect data with this RT SCHED_FIFO priority"),
1009         OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
1010                     "collect data without buffering"),
1011         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1012                     "collect raw sample records from all opened counters"),
1013         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1014                             "system-wide collection from all CPUs"),
1015         OPT_BOOLEAN('A', "append", &record.append_file,
1016                             "append to the output file to do incremental profiling"),
1017         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1018                     "list of cpus to monitor"),
1019         OPT_BOOLEAN('f', "force", &record.force,
1020                         "overwrite existing data file (deprecated)"),
1021         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1022         OPT_STRING('o', "output", &record.output_name, "file",
1023                     "output file name"),
1024         OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
1025                     "child tasks do not inherit counters"),
1026         OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
1027         OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
1028                      "number of mmap data pages"),
1029         OPT_BOOLEAN(0, "group", &record.opts.group,
1030                     "put the counters into a counter group"),
1031         OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
1032                              callchain_help, &parse_callchain_opt,
1033                              "fp"),
1034         OPT_INCR('v', "verbose", &verbose,
1035                     "be more verbose (show counter open errors, etc)"),
1036         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1037         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1038                     "per thread counts"),
1039         OPT_BOOLEAN('d', "data", &record.opts.sample_address,
1040                     "Sample addresses"),
1041         OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
1042         OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
1043         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1044                     "don't sample"),
1045         OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
1046                     "do not update the buildid cache"),
1047         OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
1048                     "do not collect buildids in perf.data"),
1049         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1050                      "monitor event in cgroup name only",
1051                      parse_cgroups),
1052         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1053                    "user to profile"),
1054
1055         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1056                      "branch any", "sample any taken branches",
1057                      parse_branch_stack),
1058
1059         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1060                      "branch filter mask", "branch stack filter modes",
1061                      parse_branch_stack),
1062         OPT_END()
1063 };
1064
/*
 * Entry point for 'perf record'.
 *
 * Parses and validates the command line, builds the event selector list
 * and target cpu/thread maps, resolves frequency/period precedence, and
 * finally hands off to __cmd_record() to do the recording.  Returns 0 on
 * success or a negative error code; several invalid-usage paths exit
 * directly via usage_with_options().
 */
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	/* Need either a workload to launch or an existing target (-p/-t/-a/-u). */
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	/* -f (overwrite) and -A (append) are mutually exclusive. */
	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	/* -G (cgroup) only works together with -a (system wide). */
	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	/* No -e given: fall back to the default event. */
	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	/* A failed validation only warns; recording still proceeds. */
	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	/* Unlike validate above, a bad -u user is fatal. */
	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		/* save errno before the strerror call can clobber it */
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	/* UINT_MAX/ULLONG_MAX are the "not set by user" sentinels. */
	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}