0bd3c13150b1f30b3147ebf58dd75e33cc3ba44e
[pandora-kernel.git] / Documentation / perf_counter / kerneltop.c
1 /*
2  * kerneltop.c: show top kernel functions - performance counters showcase
3
4    Build with:
5
6      cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
7
8    Sample output:
9
10 ------------------------------------------------------------------------------
11  KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
12 ------------------------------------------------------------------------------
13
14              weight         RIP          kernel function
15              ______   ________________   _______________
16
17               35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18               33.00 - ffffffff804cb740 : sock_alloc_send_skb
19               31.26 - ffffffff804ce808 : skb_push
20               22.43 - ffffffff80510004 : tcp_established_options
21               19.00 - ffffffff8027d250 : find_get_page
22               15.76 - ffffffff804e4fc9 : eth_type_trans
23               15.20 - ffffffff804d8baa : dst_release
24               14.86 - ffffffff804cf5d8 : skb_release_head_state
25               14.00 - ffffffff802217d5 : read_hpet
26               12.00 - ffffffff804ffb7f : __ip_local_out
27               11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28                8.54 - ffffffff805001a3 : ip_queue_xmit
29  */
30
31 /*
32  * perfstat:  /usr/bin/time -alike performance counter statistics utility
33
34           It summarizes the counter events of all tasks (and child tasks),
35           covering all CPUs that the command (or workload) executes on.
36           It only counts the per-task events of the workload started,
37           independent of how many other tasks run on those CPUs.
38
39    Sample output:
40
41    $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43    Performance counter stats for 'ls':
44
45            163516953 instructions
46                 2295 cache-misses
47              2855182 branch-misses
48  */
49
50  /*
51   * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52   *
53   * Improvements and fixes by:
54   *
55   *   Arjan van de Ven <arjan@linux.intel.com>
56   *   Yanmin Zhang <yanmin.zhang@intel.com>
57   *   Wu Fengguang <fengguang.wu@intel.com>
58   *   Mike Galbraith <efault@gmx.de>
59   *
60   * Released under the GPL v2. (and only v2, not any later version)
61   */
62
63 #define _GNU_SOURCE
64 #include <sys/types.h>
65 #include <sys/stat.h>
66 #include <sys/time.h>
67 #include <unistd.h>
68 #include <stdint.h>
69 #include <stdlib.h>
70 #include <string.h>
71 #include <getopt.h>
72 #include <assert.h>
73 #include <fcntl.h>
74 #include <stdio.h>
75 #include <errno.h>
76 #include <ctype.h>
77 #include <time.h>
78
79 #include <glib.h>
80
81 #include <sys/syscall.h>
82 #include <sys/ioctl.h>
83 #include <sys/poll.h>
84 #include <sys/prctl.h>
85 #include <sys/wait.h>
86 #include <sys/uio.h>
87
88 #include <linux/unistd.h>
89
90 #include "perfcounters.h"
91
92
/* Upper bound on the number of events that can be tracked at once;
 * dimensions the per-counter arrays below. */
#define MAX_COUNTERS                    64
/* Upper bound on CPUs; first dimension of the fd[][] table. */
#define MAX_NR_CPUS                     256

/* Initial contents of event_id[]: negative ids index the software event
 * names, non-negative ids the hardware event names (see event_name()). */
#define DEF_PERFSTAT_EVENTS             { -2, -5, -4, -3, 0, 1, 2, 3}

/* Non-zero when running as perfstat (set when argv[0] contains "perfstat"). */
static int                      run_perfstat                    =  0;
/* Non-zero: perfstat opens one counter per CPU instead of one per task. */
static int                      system_wide                     =  0;

/* Number of valid entries in event_id[]/event_raw[]. */
static int                      nr_counters                     =  0;
/* Event id per counter, and whether that id is a raw PMU event code. */
static __s64                    event_id[MAX_COUNTERS]          = DEF_PERFSTAT_EVENTS;
static int                      event_raw[MAX_COUNTERS];
/* Printed in the KernelTop header for a single counter; presumably the
 * sampling period per counter -- TODO confirm against the option parser. */
static int                      event_count[MAX_COUNTERS];
/* Counter file descriptors, indexed [cpu][counter]. */
static int                      fd[MAX_NR_CPUS][MAX_COUNTERS];

/* Minimum event count a symbol/source line needs to be displayed. */
static __u64                    count_filter                   = 100;

/* Profiling target as shown in the header; -1 means "all". */
static int                      tid                             = -1;
static int                      profile_cpu                     = -1;
static int                      nr_cpus                         =  0;
/* Selects the "NMI" vs "IRQ" label in the header. */
static int                      nmi                             =  1;
/* NOTE(review): not referenced in the visible part of the file. */
static int                      group                           =  0;

/* NOTE(review): presumably the path given with -x/--vmlinux -- confirm. */
static char                     *vmlinux;

/* Symbol to annotate and its [start, end) address range, filled in by
 * read_symbol() while /proc/kallsyms is parsed. */
static char                     *sym_filter;
static unsigned long            filter_start;
static unsigned long            filter_end;

/* Display interval in seconds; used as the divisor for the event rates. */
static int                      delay_secs                      =  2;
/* Non-zero: reset counts after each display instead of decaying them. */
static int                      zero;
/* Non-zero: parse_symbols() dumps the symbol table to stderr. */
static int                      dump_symtab;

/* List of struct source_line built by parse_vmlinux(). */
static GList                    *lines;
126
/*
 * One line of objdump output for the annotated symbol.
 * EIP is parsed from the line's leading hex address (0 when the line has
 * none), count is the number of samples recorded at exactly that address,
 * and line is the text as returned by getline() with the newline stripped.
 */
struct source_line {
        uint64_t                EIP;
        unsigned long           count;
        char                    *line;
};
132
133
/*
 * Table of default counts.
 * NOTE(review): not referenced in the visible part of the file; presumably
 * the default sampling period per hardware event type -- confirm at the
 * point of use.
 */
const unsigned int default_count[] = {
          10000,
        1000000,
          10000,
          10000,
        1000000,
          10000,
};
142
/* Display names of hardware events; event_name() indexes this table
 * directly with a non-negative event id. */
static char *hw_event_names[] = {
        "CPU cycles",
        "instructions",
        "cache references",
        "cache misses",
        "branches",
        "branch misses",
        "bus cycles",
};
152
/* Display names of software events; event_name() maps a negative event
 * id to index (-id - 1) in this table. */
static char *sw_event_names[] = {
        "cpu clock ticks",
        "task clock ticks",
        "pagefaults",
        "context switches",
        "CPU migrations",
};
160
/* Maps one symbolic event name accepted on the command line to its
 * numeric event id. */
struct event_symbol {
        int event;
        char *symbol;
};
165
/*
 * Symbolic names for every event. Several names may map to the same event;
 * display_events_help() relies on such aliases being adjacent, and
 * match_event_symbols() returns the first entry whose name is a prefix of
 * the user's string, so order matters.
 */
static struct event_symbol event_symbols[] = {
        {PERF_COUNT_CPU_CYCLES,                 "cpu-cycles",           },
        {PERF_COUNT_CPU_CYCLES,                 "cycles",               },
        {PERF_COUNT_INSTRUCTIONS,               "instructions",         },
        {PERF_COUNT_CACHE_REFERENCES,           "cache-references",     },
        {PERF_COUNT_CACHE_MISSES,               "cache-misses",         },
        {PERF_COUNT_BRANCH_INSTRUCTIONS,        "branch-instructions",  },
        {PERF_COUNT_BRANCH_INSTRUCTIONS,        "branches",             },
        {PERF_COUNT_BRANCH_MISSES,              "branch-misses",        },
        {PERF_COUNT_BUS_CYCLES,                 "bus-cycles",           },
        {PERF_COUNT_CPU_CLOCK,                  "cpu-ticks",            },
        {PERF_COUNT_CPU_CLOCK,                  "ticks",                },
        {PERF_COUNT_TASK_CLOCK,                 "task-ticks",           },
        {PERF_COUNT_PAGE_FAULTS,                "page-faults",          },
        {PERF_COUNT_PAGE_FAULTS,                "faults",               },
        {PERF_COUNT_CONTEXT_SWITCHES,           "context-switches",     },
        {PERF_COUNT_CONTEXT_SWITCHES,           "cs",                   },
        {PERF_COUNT_CPU_MIGRATIONS,             "cpu-migrations",       },
        {PERF_COUNT_CPU_MIGRATIONS,             "migrations",           },
};
186
187 static void display_events_help(void)
188 {
189         unsigned int i;
190         int e;
191
192         printf(
193         " -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
194
195         for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
196                 if (e != event_symbols[i].event) {
197                         e = event_symbols[i].event;
198                         printf(
199         "\n                             %2d: %-20s", e, event_symbols[i].symbol);
200                 } else
201                         printf(" %s", event_symbols[i].symbol);
202         }
203
204         printf("\n"
205         "                           rNNN: raw PMU events (eventsel+umask)\n\n");
206 }
207
208 static void display_perfstat_help(void)
209 {
210         printf(
211         "Usage: perfstat [<events...>] <cmd...>\n\n"
212         "PerfStat Options (up to %d event types can be specified):\n\n",
213                  MAX_COUNTERS);
214
215         display_events_help();
216
217         printf(
218         " -a                           # system-wide collection\n");
219         exit(0);
220 }
221
222 static void display_help(void)
223 {
224         if (run_perfstat)
225                 return display_perfstat_help();
226
227         printf(
228         "Usage: kerneltop [<options>]\n"
229         "   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
230         "KernelTop Options (up to %d event types can be specified at once):\n\n",
231                  MAX_COUNTERS);
232
233         display_events_help();
234
235         printf(
236         " -S        --stat             # perfstat COMMAND\n"
237         " -a                           # system-wide collection (for perfstat)\n\n"
238         " -c CNT    --count=CNT        # event period to sample\n\n"
239         " -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
240         " -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
241         " -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
242         " -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
243         " -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
244         " -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
245         " -z        --zero             # zero counts after display\n"
246         " -D        --dump_symtab      # dump symbol table to stderr on startup\n"
247         );
248
249         exit(0);
250 }
251
252 static int type_valid(int type)
253 {
254         if (type >= PERF_HW_EVENTS_MAX)
255                 return 0;
256         if (type <= PERF_SW_EVENTS_MIN)
257                 return 0;
258
259         return 1;
260 }
261
262 static char *event_name(int ctr)
263 {
264         __s64 type = event_id[ctr];
265         static char buf[32];
266
267         if (event_raw[ctr]) {
268                 sprintf(buf, "raw 0x%llx", (long long)type);
269                 return buf;
270         }
271         if (!type_valid(type))
272                 return "unknown";
273
274         if (type >= 0)
275                 return hw_event_names[type];
276
277         return sw_event_names[-type-1];
278 }
279
280 /*
281  * Each event can have multiple symbolic names.
282  * Symbolic names are (almost) exactly matched.
283  */
284 static int match_event_symbols(char *str)
285 {
286         unsigned int i;
287
288         if (isdigit(str[0]) || str[0] == '-')
289                 return atoi(str);
290
291         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
292                 if (!strncmp(str, event_symbols[i].symbol,
293                              strlen(event_symbols[i].symbol)))
294                         return event_symbols[i].event;
295         }
296
297         return PERF_HW_EVENTS_MAX;
298 }
299
300 static int parse_events(char *str)
301 {
302         __s64 type;
303         int raw;
304
305 again:
306         if (nr_counters == MAX_COUNTERS)
307                 return -1;
308
309         raw = 0;
310         if (*str == 'r') {
311                 raw = 1;
312                 ++str;
313                 type = strtol(str, NULL, 16);
314         } else {
315                 type = match_event_symbols(str);
316                 if (!type_valid(type))
317                         return -1;
318         }
319
320         event_id[nr_counters] = type;
321         event_raw[nr_counters] = raw;
322         nr_counters++;
323
324         str = strstr(str, ",");
325         if (str) {
326                 str++;
327                 goto again;
328         }
329
330         return 0;
331 }
332
333
334 /*
335  * perfstat
336  */
337
/* NOTE(review): large global buffer that is not referenced in the visible
 * part of the file; judging by the name it may exist to provoke page
 * faults for testing -- confirm before removing. */
char fault_here[1000000];
339
340 static void create_perfstat_counter(int counter)
341 {
342         struct perf_counter_hw_event hw_event;
343
344         memset(&hw_event, 0, sizeof(hw_event));
345         hw_event.type           = event_id[counter];
346         hw_event.raw            = event_raw[counter];
347         hw_event.record_type    = PERF_RECORD_SIMPLE;
348         hw_event.nmi            = 0;
349
350         if (system_wide) {
351                 int cpu;
352                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
353                         fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
354                         if (fd[cpu][counter] < 0) {
355                                 printf("perfstat error: syscall returned with %d (%s)\n",
356                                                 fd[cpu][counter], strerror(errno));
357                                 exit(-1);
358                         }
359                 }
360         } else {
361                 hw_event.inherit        = 1;
362                 hw_event.disabled       = 1;
363
364                 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
365                 if (fd[0][counter] < 0) {
366                         printf("perfstat error: syscall returned with %d (%s)\n",
367                                         fd[0][counter], strerror(errno));
368                         exit(-1);
369                 }
370         }
371 }
372
373 int do_perfstat(int argc, char *argv[])
374 {
375         unsigned long long t0, t1;
376         int counter;
377         ssize_t res;
378         int status;
379         int pid;
380
381         if (!system_wide)
382                 nr_cpus = 1;
383
384         for (counter = 0; counter < nr_counters; counter++)
385                 create_perfstat_counter(counter);
386
387         argc -= optind;
388         argv += optind;
389
390         /*
391          * Enable counters and exec the command:
392          */
393         t0 = rdclock();
394         prctl(PR_TASK_PERF_COUNTERS_ENABLE);
395
396         if ((pid = fork()) < 0)
397                 perror("failed to fork");
398         if (!pid) {
399                 if (execvp(argv[0], argv)) {
400                         perror(argv[0]);
401                         exit(-1);
402                 }
403         }
404         while (wait(&status) >= 0)
405                 ;
406         prctl(PR_TASK_PERF_COUNTERS_DISABLE);
407         t1 = rdclock();
408
409         fflush(stdout);
410
411         fprintf(stderr, "\n");
412         fprintf(stderr, " Performance counter stats for \'%s\':\n",
413                 argv[0]);
414         fprintf(stderr, "\n");
415
416         for (counter = 0; counter < nr_counters; counter++) {
417                 int cpu;
418                 __u64 count, single_count;
419
420                 count = 0;
421                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
422                         res = read(fd[cpu][counter],
423                                         (char *) &single_count, sizeof(single_count));
424                         assert(res == sizeof(single_count));
425                         count += single_count;
426                 }
427
428                 if (!event_raw[counter] &&
429                     (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
430                      event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
431
432                         double msecs = (double)count / 1000000;
433
434                         fprintf(stderr, " %14.6f  %-20s (msecs)\n",
435                                 msecs, event_name(counter));
436                 } else {
437                         fprintf(stderr, " %14Ld  %-20s (events)\n",
438                                 count, event_name(counter));
439                 }
440                 if (!counter)
441                         fprintf(stderr, "\n");
442         }
443         fprintf(stderr, "\n");
444         fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
445                         (double)(t1-t0)/1e6);
446         fprintf(stderr, "\n");
447
448         return 0;
449 }
450
451 /*
452  * Symbols
453  */
454
/* Address range covered by the symbol table; samples outside it are
 * counted as userspace events. Set by parse_symbols(). */
static uint64_t                 min_ip;
static uint64_t                 max_ip = -1ll;

/*
 * One kernel text symbol.
 * addr is its start address and sym its name as read from /proc/kallsyms,
 * count[] holds the per-counter hit counts, skip marks idle-loop symbols
 * whose hits are not recorded, and source caches the first matching
 * objdump line once show_details() has looked it up.
 */
struct sym_entry {
        unsigned long long      addr;
        char                    *sym;
        unsigned long           count[MAX_COUNTERS];
        int                     skip;
        GList                   *source;
};

/* Capacity of sym_table[] and tmp[]. */
#define MAX_SYMS                100000

/* Number of valid entries in sym_table[]. */
static int sym_table_count;

/* Entry of the symbol selected for annotation, or NULL. */
struct sym_entry                *sym_filter_entry;

/* Symbol table, sorted by address once parse_symbols() has run. */
static struct sym_entry         sym_table[MAX_SYMS];

static void show_details(struct sym_entry *sym);
475
476 /*
477  * Ordering weight: count-1 * count-2 * ... / count-n
478  */
479 static double sym_weight(const struct sym_entry *sym)
480 {
481         double weight;
482         int counter;
483
484         weight = sym->count[0];
485
486         for (counter = 1; counter < nr_counters-1; counter++)
487                 weight *= sym->count[counter];
488
489         weight /= (sym->count[counter] + 1);
490
491         return weight;
492 }
493
/*
 * qsort() comparator ordering symbols by descending sym_weight().
 *
 * Returns a negative value when sym1 is heavier (sorts first), a positive
 * value when sym2 is heavier and 0 on equal weights. qsort() requires a
 * consistent three-way result; a plain "less than" test never reports
 * "sorts before" and leaves the resulting order unspecified.
 */
static int compare(const void *__sym1, const void *__sym2)
{
        const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
        double weight1 = sym_weight(sym1);
        double weight2 = sym_weight(sym2);

        return (weight1 < weight2) - (weight1 > weight2);
}
500
/* Time of the last completed display, recorded by print_sym_table(). */
static time_t                   last_refresh;
/* Samples seen, and how many of them fell outside the kernel symbol range. */
static long                     events;
static long                     userspace_events;
/* Escape sequence written before each display: cursor home, then clear. */
static const char               CONSOLE_CLEAR[] = "\e[H\e[2J";

/* Scratch copy of sym_table[] that print_sym_table() sorts by weight. */
static struct sym_entry         tmp[MAX_SYMS];
507
508 static void print_sym_table(void)
509 {
510         int i, printed;
511         int counter;
512         float events_per_sec = events/delay_secs;
513         float kevents_per_sec = (events-userspace_events)/delay_secs;
514
515         memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
516         qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
517
518         write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
519
520         printf(
521 "------------------------------------------------------------------------------\n");
522         printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
523                 events_per_sec,
524                 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
525                 nmi ? "NMI" : "IRQ");
526
527         if (nr_counters == 1)
528                 printf("%d ", event_count[0]);
529
530         for (counter = 0; counter < nr_counters; counter++) {
531                 if (counter)
532                         printf("/");
533
534                 printf("%s", event_name(counter));
535         }
536
537         printf( "], ");
538
539         if (tid != -1)
540                 printf(" (tid: %d", tid);
541         else
542                 printf(" (all");
543
544         if (profile_cpu != -1)
545                 printf(", cpu: %d)\n", profile_cpu);
546         else {
547                 if (tid != -1)
548                         printf(")\n");
549                 else
550                         printf(", %d CPUs)\n", nr_cpus);
551         }
552
553         printf("------------------------------------------------------------------------------\n\n");
554
555         if (nr_counters == 1)
556                 printf("             events");
557         else
558                 printf("  weight     events");
559
560         printf("         RIP          kernel function\n"
561                        "  ______     ______   ________________   _______________\n\n"
562         );
563
564         printed = 0;
565         for (i = 0; i < sym_table_count; i++) {
566                 int count;
567
568                 if (nr_counters == 1) {
569                         if (printed <= 18 &&
570                                         tmp[i].count[0] >= count_filter) {
571                                 printf("%19.2f - %016llx : %s\n",
572                                   sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
573                                 printed++;
574                         }
575                 } else {
576                         if (printed <= 18 &&
577                                         tmp[i].count[0] >= count_filter) {
578                                 printf("%8.1f %10ld - %016llx : %s\n",
579                                   sym_weight(tmp + i),
580                                   tmp[i].count[0],
581                                   tmp[i].addr, tmp[i].sym);
582                                 printed++;
583                         }
584                 }
585                 /*
586                  * Add decay to the counts:
587                  */
588                 for (count = 0; count < nr_counters; count++)
589                         sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
590         }
591
592         if (sym_filter_entry)
593                 show_details(sym_filter_entry);
594
595         last_refresh = time(NULL);
596
597         {
598                 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
599
600                 if (poll(&stdin_poll, 1, 0) == 1) {
601                         printf("key pressed - exiting.\n");
602                         exit(0);
603                 }
604         }
605 }
606
607 static int read_symbol(FILE *in, struct sym_entry *s)
608 {
609         static int filter_match = 0;
610         char *sym, stype;
611         char str[500];
612         int rc, pos;
613
614         rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
615         if (rc == EOF)
616                 return -1;
617
618         assert(rc == 3);
619
620         /* skip until end of line: */
621         pos = strlen(str);
622         do {
623                 rc = fgetc(in);
624                 if (rc == '\n' || rc == EOF || pos >= 499)
625                         break;
626                 str[pos] = rc;
627                 pos++;
628         } while (1);
629         str[pos] = 0;
630
631         sym = str;
632
633         /* Filter out known duplicates and non-text symbols. */
634         if (!strcmp(sym, "_text"))
635                 return 1;
636         if (!min_ip && !strcmp(sym, "_stext"))
637                 return 1;
638         if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
639                 return 1;
640         if (stype != 'T' && stype != 't')
641                 return 1;
642         if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
643                 return 1;
644         if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
645                 return 1;
646
647         s->sym = malloc(strlen(str));
648         assert(s->sym);
649
650         strcpy((char *)s->sym, str);
651         s->skip = 0;
652
653         /* Tag events to be skipped. */
654         if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
655                 s->skip = 1;
656         if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
657                 s->skip = 1;
658
659         if (filter_match == 1) {
660                 filter_end = s->addr;
661                 filter_match = -1;
662                 if (filter_end - filter_start > 10000) {
663                         printf("hm, too large filter symbol <%s> - skipping.\n",
664                                 sym_filter);
665                         printf("symbol filter start: %016lx\n", filter_start);
666                         printf("                end: %016lx\n", filter_end);
667                         filter_end = filter_start = 0;
668                         sym_filter = NULL;
669                         sleep(1);
670                 }
671         }
672         if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
673                 filter_match = 1;
674                 filter_start = s->addr;
675         }
676
677         return 0;
678 }
679
680 int compare_addr(const void *__sym1, const void *__sym2)
681 {
682         const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
683
684         return sym1->addr > sym2->addr;
685 }
686
687 static void sort_symbol_table(void)
688 {
689         int i, dups;
690
691         do {
692                 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
693                 for (i = 0, dups = 0; i < sym_table_count; i++) {
694                         if (sym_table[i].addr == sym_table[i+1].addr) {
695                                 sym_table[i+1].addr = -1ll;
696                                 dups++;
697                         }
698                 }
699                 sym_table_count -= dups;
700         } while(dups);
701 }
702
703 static void parse_symbols(void)
704 {
705         struct sym_entry *last;
706
707         FILE *kallsyms = fopen("/proc/kallsyms", "r");
708
709         if (!kallsyms) {
710                 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
711                 exit(-1);
712         }
713
714         while (!feof(kallsyms)) {
715                 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
716                         sym_table_count++;
717                         assert(sym_table_count <= MAX_SYMS);
718                 }
719         }
720
721         sort_symbol_table();
722         min_ip = sym_table[0].addr;
723         max_ip = sym_table[sym_table_count-1].addr;
724         last = sym_table + sym_table_count++;
725
726         last->addr = -1ll;
727         last->sym = "<end>";
728
729         if (filter_end) {
730                 int count;
731                 for (count=0; count < sym_table_count; count ++) {
732                         if (!strcmp(sym_table[count].sym, sym_filter)) {
733                                 sym_filter_entry = &sym_table[count];
734                                 break;
735                         }
736                 }
737         }
738         if (dump_symtab) {
739                 int i;
740
741                 for (i = 0; i < sym_table_count; i++)
742                         fprintf(stderr, "%llx %s\n",
743                                 sym_table[i].addr, sym_table[i].sym);
744         }
745 }
746
747 /*
748  * Source lines
749  */
750
751 static void parse_vmlinux(char *filename)
752 {
753         FILE *file;
754         char command[PATH_MAX*2];
755         if (!filename)
756                 return;
757
758         sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
759
760         file = popen(command, "r");
761         if (!file)
762                 return;
763
764         while (!feof(file)) {
765                 struct source_line *src;
766                 size_t dummy = 0;
767                 char *c;
768
769                 src = malloc(sizeof(struct source_line));
770                 assert(src != NULL);
771                 memset(src, 0, sizeof(struct source_line));
772
773                 if (getline(&src->line, &dummy, file) < 0)
774                         break;
775                 if (!src->line)
776                         break;
777
778                 c = strchr(src->line, '\n');
779                 if (c)
780                         *c = 0;
781
782                 lines = g_list_prepend(lines, src);
783
784                 if (strlen(src->line)>8 && src->line[8] == ':')
785                         src->EIP = strtoull(src->line, NULL, 16);
786                 if (strlen(src->line)>8 && src->line[16] == ':')
787                         src->EIP = strtoull(src->line, NULL, 16);
788         }
789         pclose(file);
790         lines = g_list_reverse(lines);
791 }
792
793 static void record_precise_ip(uint64_t ip)
794 {
795         struct source_line *line;
796         GList *item;
797
798         item = g_list_first(lines);
799         while (item) {
800                 line = item->data;
801                 if (line->EIP == ip)
802                         line->count++;
803                 if (line->EIP > ip)
804                         break;
805                 item = g_list_next(item);
806         }
807 }
808
809 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
810 {
811         struct source_line *line;
812         GList *item;
813         char pattern[PATH_MAX];
814         sprintf(pattern, "<%s>:", sym->sym);
815
816         item = g_list_first(lines);
817         while (item) {
818                 line = item->data;
819                 if (strstr(line->line, pattern)) {
820                         sym->source = item;
821                         break;
822                 }
823                 item = g_list_next(item);
824         }
825 }
826
827 void show_lines(GList *item_queue, int item_queue_count)
828 {
829         int i;
830         struct source_line *line;
831
832         for (i = 0; i < item_queue_count; i++) {
833                 line = item_queue->data;
834                 printf("%8li\t%s\n", line->count, line->line);
835                 item_queue = g_list_next(item_queue);
836         }
837 }
838
/* Maximum number of not-yet-printed lines show_details() keeps queued as
 * leading context before a line that passes the count filter. */
#define TRACE_COUNT     3
840
841 static void show_details(struct sym_entry *sym)
842 {
843         struct source_line *line;
844         GList *item;
845         int displayed = 0;
846         GList *item_queue = NULL;
847         int item_queue_count = 0;
848
849         if (!sym->source)
850                 lookup_sym_in_vmlinux(sym);
851         if (!sym->source)
852                 return;
853
854         printf("Showing details for %s\n", sym->sym);
855
856         item = sym->source;
857         while (item) {
858                 line = item->data;
859                 if (displayed && strstr(line->line, ">:"))
860                         break;
861
862                 if (!item_queue_count)
863                         item_queue = item;
864                 item_queue_count ++;
865
866                 if (line->count >= count_filter) {
867                         show_lines(item_queue, item_queue_count);
868                         item_queue_count = 0;
869                         item_queue = NULL;
870                 } else if (item_queue_count > TRACE_COUNT) {
871                         item_queue = g_list_next(item_queue);
872                         item_queue_count --;
873                 }
874
875                 line->count = 0;
876                 displayed++;
877                 if (displayed > 300)
878                         break;
879                 item = g_list_next(item);
880         }
881 }
882
883 /*
884  * Binary search in the histogram table and record the hit:
885  */
886 static void record_ip(uint64_t ip, int counter)
887 {
888         int left_idx, middle_idx, right_idx, idx;
889         unsigned long left, middle, right;
890
891         record_precise_ip(ip);
892
893         left_idx = 0;
894         right_idx = sym_table_count-1;
895         assert(ip <= max_ip && ip >= min_ip);
896
897         while (left_idx + 1 < right_idx) {
898                 middle_idx = (left_idx + right_idx) / 2;
899
900                 left   = sym_table[  left_idx].addr;
901                 middle = sym_table[middle_idx].addr;
902                 right  = sym_table[ right_idx].addr;
903
904                 if (!(left <= middle && middle <= right)) {
905                         printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
906                         printf("%d %d %d\n", left_idx, middle_idx, right_idx);
907                 }
908                 assert(left <= middle && middle <= right);
909                 if (!(left <= ip && ip <= right)) {
910                         printf(" left: %016lx\n", left);
911                         printf("   ip: %016lx\n", ip);
912                         printf("right: %016lx\n", right);
913                 }
914                 assert(left <= ip && ip <= right);
915                 /*
916                  * [ left .... target .... middle .... right ]
917                  *   => right := middle
918                  */
919                 if (ip < middle) {
920                         right_idx = middle_idx;
921                         continue;
922                 }
923                 /*
924                  * [ left .... middle ... target ... right ]
925                  *   => left := middle
926                  */
927                 left_idx = middle_idx;
928         }
929
930         idx = left_idx;
931
932         if (!sym_table[idx].skip)
933                 sym_table[idx].count[counter]++;
934         else events--;
935 }
936
937 static void process_event(uint64_t ip, int counter)
938 {
939         events++;
940
941         if (ip < min_ip || ip > max_ip) {
942                 userspace_events++;
943                 return;
944         }
945
946         record_ip(ip, counter);
947 }
948
949 static void process_options(int argc, char *argv[])
950 {
951         int error = 0, counter;
952
953         if (strstr(argv[0], "perfstat"))
954                 run_perfstat = 1;
955
956         for (;;) {
957                 int option_index = 0;
958                 /** Options for getopt */
959                 static struct option long_options[] = {
960                         {"count",       required_argument,      NULL, 'c'},
961                         {"cpu",         required_argument,      NULL, 'C'},
962                         {"delay",       required_argument,      NULL, 'd'},
963                         {"dump_symtab", no_argument,            NULL, 'D'},
964                         {"event",       required_argument,      NULL, 'e'},
965                         {"filter",      required_argument,      NULL, 'f'},
966                         {"group",       required_argument,      NULL, 'g'},
967                         {"help",        no_argument,            NULL, 'h'},
968                         {"nmi",         required_argument,      NULL, 'n'},
969                         {"pid",         required_argument,      NULL, 'p'},
970                         {"vmlinux",     required_argument,      NULL, 'x'},
971                         {"symbol",      required_argument,      NULL, 's'},
972                         {"stat",        no_argument,            NULL, 'S'},
973                         {"zero",        no_argument,            NULL, 'z'},
974                         {NULL,          0,                      NULL,  0 }
975                 };
976                 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
977                                     long_options, &option_index);
978                 if (c == -1)
979                         break;
980
981                 switch (c) {
982                 case 'a': system_wide                   =              1; break;
983                 case 'c': event_count[nr_counters]      =   atoi(optarg); break;
984                 case 'C':
985                         /* CPU and PID are mutually exclusive */
986                         if (tid != -1) {
987                                 printf("WARNING: CPU switch overriding PID\n");
988                                 sleep(1);
989                                 tid = -1;
990                         }
991                         profile_cpu                     =   atoi(optarg); break;
992                 case 'd': delay_secs                    =   atoi(optarg); break;
993                 case 'D': dump_symtab                   =              1; break;
994
995                 case 'e': error                         = parse_events(optarg); break;
996
997                 case 'f': count_filter                  =   atoi(optarg); break;
998                 case 'g': group                         =   atoi(optarg); break;
999                 case 'h':                                 display_help(); break;
1000                 case 'n': nmi                           =   atoi(optarg); break;
1001                 case 'p':
1002                         /* CPU and PID are mutually exclusive */
1003                         if (profile_cpu != -1) {
1004                                 printf("WARNING: PID switch overriding CPU\n");
1005                                 sleep(1);
1006                                 profile_cpu = -1;
1007                         }
1008                         tid                             =   atoi(optarg); break;
1009                 case 's': sym_filter                    = strdup(optarg); break;
1010                 case 'S': run_perfstat                  =              1; break;
1011                 case 'x': vmlinux                       = strdup(optarg); break;
1012                 case 'z': zero                          =              1; break;
1013                 default: error = 1; break;
1014                 }
1015         }
1016         if (error)
1017                 display_help();
1018
1019         if (!nr_counters) {
1020                 if (run_perfstat)
1021                         nr_counters = 8;
1022                 else {
1023                         nr_counters = 1;
1024                         event_id[0] = 0;
1025                 }
1026         }
1027
1028         for (counter = 0; counter < nr_counters; counter++) {
1029                 if (event_count[counter])
1030                         continue;
1031
1032                 if (event_id[counter] < PERF_HW_EVENTS_MAX)
1033                         event_count[counter] = default_count[event_id[counter]];
1034                 else
1035                         event_count[counter] = 100000;
1036         }
1037 }
1038
1039 int main(int argc, char *argv[])
1040 {
1041         struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
1042         struct perf_counter_hw_event hw_event;
1043         int i, counter, group_fd;
1044         unsigned int cpu;
1045         uint64_t ip;
1046         ssize_t res;
1047         int ret;
1048
1049         process_options(argc, argv);
1050
1051         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1052         assert(nr_cpus <= MAX_NR_CPUS);
1053         assert(nr_cpus >= 0);
1054
1055         if (run_perfstat)
1056                 return do_perfstat(argc, argv);
1057
1058         if (tid != -1 || profile_cpu != -1)
1059                 nr_cpus = 1;
1060
1061         for (i = 0; i < nr_cpus; i++) {
1062                 group_fd = -1;
1063                 for (counter = 0; counter < nr_counters; counter++) {
1064
1065                         cpu     = profile_cpu;
1066                         if (tid == -1 && profile_cpu == -1)
1067                                 cpu = i;
1068
1069                         memset(&hw_event, 0, sizeof(hw_event));
1070                         hw_event.type           = event_id[counter];
1071                         hw_event.raw            = event_raw[counter];
1072                         hw_event.irq_period     = event_count[counter];
1073                         hw_event.record_type    = PERF_RECORD_IRQ;
1074                         hw_event.nmi            = nmi;
1075
1076                         fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1077                         fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1078                         if (fd[i][counter] < 0) {
1079                                 printf("kerneltop error: syscall returned with %d (%s)\n",
1080                                         fd[i][counter], strerror(-fd[i][counter]));
1081                                 if (fd[i][counter] == -1)
1082                                         printf("Are you root?\n");
1083                                 exit(-1);
1084                         }
1085                         assert(fd[i][counter] >= 0);
1086
1087                         /*
1088                          * First counter acts as the group leader:
1089                          */
1090                         if (group && group_fd == -1)
1091                                 group_fd = fd[i][counter];
1092
1093                         event_array[i][counter].fd = fd[i][counter];
1094                         event_array[i][counter].events = POLLIN;
1095                 }
1096         }
1097
1098         parse_symbols();
1099         if (vmlinux && sym_filter_entry)
1100                 parse_vmlinux(vmlinux);
1101
1102         printf("KernelTop refresh period: %d seconds\n", delay_secs);
1103         last_refresh = time(NULL);
1104
1105         while (1) {
1106                 int hits = events;
1107
1108                 for (i = 0; i < nr_cpus; i++) {
1109                         for (counter = 0; counter < nr_counters; counter++) {
1110                                 res = read(fd[i][counter], (char *) &ip, sizeof(ip));
1111                                 if (res > 0) {
1112                                         assert(res == sizeof(ip));
1113
1114                                         process_event(ip, counter);
1115                                 }
1116                         }
1117                 }
1118
1119                 if (time(NULL) >= last_refresh + delay_secs) {
1120                         print_sym_table();
1121                         events = userspace_events = 0;
1122                 }
1123
1124                 if (hits == events)
1125                         ret = poll(event_array[0], nr_cpus, 1000);
1126                 hits = events;
1127         }
1128
1129         return 0;
1130 }