arch/x86/kernel/ftrace.c

   1 /*
   2  * Code for replacing ftrace calls with jumps.
   3  *
   4  * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
   5  *
   6  * Thanks goes to Ingo Molnar, for suggesting the idea.
   7  * Mathieu Desnoyers, for suggesting postponing the modifications.
   8  * Arjan van de Ven, for keeping me straight, and explaining to me
   9  * the dangers of modifying code on the run.
  10  */
  11
  12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  13
  14 #include <linux/spinlock.h>
  15 #include <linux/hardirq.h>
  16 #include <linux/uaccess.h>
  17 #include <linux/ftrace.h>
  18 #include <linux/percpu.h>
  19 #include <linux/sched.h>
  20 #include <linux/init.h>
  21 #include <linux/list.h>
  22
  23 #include <trace/syscall.h>
  24
  25 #include <asm/cacheflush.h>
  26 #include <asm/ftrace.h>
  27 #include <asm/nops.h>
  28 #include <asm/nmi.h>
  29
  30
  31 #ifdef CONFIG_DYNAMIC_FTRACE
  32
  33 /*
  34  * modifying_code is set to notify NMIs that they need to use
  35  * memory barriers when entering or exiting. But we don't want
  36  * to burden NMIs with unnecessary memory barriers when code
  37  * modification is not being done (which is most of the time).
  38  *
  39  * A mutex is already held when ftrace_arch_code_modify_prepare
  40  * and post_process are called. No locks need to be taken here.
  41  *
  42  * Stop machine will make sure currently running NMIs are done
  43  * and new NMIs will see the updated variable before we need
  44  * to worry about NMIs doing memory barriers.
  45  */
  46 static int modifying_code __read_mostly;
  47 static DEFINE_PER_CPU(int, save_modifying_code);
  48
  49 int ftrace_arch_code_modify_prepare(void)
  50 {
  51         set_kernel_text_rw();
  52         modifying_code = 1;
  53         return 0;
  54 }
  55
  56 int ftrace_arch_code_modify_post_process(void)
  57 {
  58         modifying_code = 0;
  59         set_kernel_text_ro();
  60         return 0;
  61 }
  62
  63 union ftrace_code_union {
  64         char code[MCOUNT_INSN_SIZE];
  65         struct {
  66                 char e8;
  67                 int offset;
  68         } __attribute__((packed));
  69 };
  70
  71 static int ftrace_calc_offset(long ip, long addr)
  72 {
  73         return (int)(addr - ip);
  74 }
  75
  76 static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
  77 {
  78         static union ftrace_code_union calc;
  79
  80         calc.e8         = 0xe8;
  81         calc.offset     = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
  82
  83         /*
  84          * No locking needed, this must be called via kstop_machine
  85          * which in essence is like running on a uniprocessor machine.
  86          */
  87         return calc.code;
  88 }
  89
  90 /*
  91  * Modifying code must take extra care. On an SMP machine, if
  92  * the code being modified is also being executed on another CPU
  93  * that CPU will have undefined results and possibly take a GPF.
 * We use kstop_machine to stop other CPUs from executing code.
  95  * But this does not stop NMIs from happening. We still need
  96  * to protect against that. We separate out the modification of
  97  * the code to take care of this.
  98  *
  99  * Two buffers are added: An IP buffer and a "code" buffer.
 100  *
 101  * 1) Put the instruction pointer into the IP buffer
 102  *    and the new code into the "code" buffer.
 103  * 2) Wait for any running NMIs to finish and set a flag that says
 104  *    we are modifying code, it is done in an atomic operation.
 105  * 3) Write the code
 106  * 4) clear the flag.
 107  * 5) Wait for any running NMIs to finish.
 108  *
 109  * If an NMI is executed, the first thing it does is to call
 110  * "ftrace_nmi_enter". This will check if the flag is set to write
 111  * and if it is, it will write what is in the IP and "code" buffers.
 112  *
 113  * The trick is, it does not matter if everyone is writing the same
 114  * content to the code location. Also, if a CPU is executing code
 115  * it is OK to write to that code location if the contents being written
 116  * are the same as what exists.
 117  */
 118
 119 #define MOD_CODE_WRITE_FLAG (1 << 31)   /* set when NMI should do the write */
 120 static atomic_t nmi_running = ATOMIC_INIT(0);
 121 static int mod_code_status;             /* holds return value of text write */
 122 static void *mod_code_ip;               /* holds the IP to write to */
 123 static void *mod_code_newcode;          /* holds the text to write to the IP */
 124
 125 static unsigned nmi_wait_count;
 126 static atomic_t nmi_update_count = ATOMIC_INIT(0);
 127
 128 int ftrace_arch_read_dyn_info(char *buf, int size)
 129 {
 130         int r;
 131
 132         r = snprintf(buf, size, "%u %u",
 133                      nmi_wait_count,
 134                      atomic_read(&nmi_update_count));
 135         return r;
 136 }
 137
 138 static void clear_mod_flag(void)
 139 {
 140         int old = atomic_read(&nmi_running);
 141
 142         for (;;) {
 143                 int new = old & ~MOD_CODE_WRITE_FLAG;
 144
 145                 if (old == new)
 146                         break;
 147
 148                 old = atomic_cmpxchg(&nmi_running, old, new);
 149         }
 150 }
 151
 152 static void ftrace_mod_code(void)
 153 {
 154         /*
 155          * Yes, more than one CPU process can be writing to mod_code_status.
 156          *    (and the code itself)
 157          * But if one were to fail, then they all should, and if one were
 158          * to succeed, then they all should.
 159          */
 160         mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
 161                                              MCOUNT_INSN_SIZE);
 162
 163         /* if we fail, then kill any new writers */
 164         if (mod_code_status)
 165                 clear_mod_flag();
 166 }
 167
 168 void ftrace_nmi_enter(void)
 169 {
 170         __get_cpu_var(save_modifying_code) = modifying_code;
 171
 172         if (!__get_cpu_var(save_modifying_code))
 173                 return;
 174
 175         if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
 176                 smp_rmb();
 177                 ftrace_mod_code();
 178                 atomic_inc(&nmi_update_count);
 179         }
 180         /* Must have previous changes seen before executions */
 181         smp_mb();
 182 }
 183
 184 void ftrace_nmi_exit(void)
 185 {
 186         if (!__get_cpu_var(save_modifying_code))
 187                 return;
 188
 189         /* Finish all executions before clearing nmi_running */
 190         smp_mb();
 191         atomic_dec(&nmi_running);
 192 }
 193
 194 static void wait_for_nmi_and_set_mod_flag(void)
 195 {
 196         if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
 197                 return;
 198
 199         do {
 200                 cpu_relax();
 201         } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
 202
 203         nmi_wait_count++;
 204 }
 205
 206 static void wait_for_nmi(void)
 207 {
 208         if (!atomic_read(&nmi_running))
 209                 return;
 210
 211         do {
 212                 cpu_relax();
 213         } while (atomic_read(&nmi_running));
 214
 215         nmi_wait_count++;
 216 }
 217
 218 static inline int
 219 within(unsigned long addr, unsigned long start, unsigned long end)
 220 {
 221         return addr >= start && addr < end;
 222 }
 223
 224 static int
 225 do_ftrace_mod_code(unsigned long ip, void *new_code)
 226 {
 227         /*
 228          * On x86_64, kernel text mappings are mapped read-only with
 229          * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
 230          * of the kernel text mapping to modify the kernel text.
 231          *
 232          * For 32bit kernels, these mappings are same and we can use
 233          * kernel identity mapping to modify code.
 234          */
 235         if (within(ip, (unsigned long)_text, (unsigned long)_etext))
 236                 ip = (unsigned long)__va(__pa(ip));
 237
 238         mod_code_ip = (void *)ip;
 239         mod_code_newcode = new_code;
 240
 241         /* The buffers need to be visible before we let NMIs write them */
 242         smp_mb();
 243
 244         wait_for_nmi_and_set_mod_flag();
 245
 246         /* Make sure all running NMIs have finished before we write the code */
 247         smp_mb();
 248
 249         ftrace_mod_code();
 250
 251         /* Make sure the write happens before clearing the bit */
 252         smp_mb();
 253
 254         clear_mod_flag();
 255         wait_for_nmi();
 256
 257         return mod_code_status;
 258 }
 259
 260
 261
 262
 263 static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
 264
 265 static unsigned char *ftrace_nop_replace(void)
 266 {
 267         return ftrace_nop;
 268 }
 269
 270 static int
 271 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 272                    unsigned char *new_code)
 273 {
 274         unsigned char replaced[MCOUNT_INSN_SIZE];
 275
 276         /*
 277          * Note: Due to modules and __init, code can
 278          *  disappear and change, we need to protect against faulting
 279          *  as well as code changing. We do this by using the
 280          *  probe_kernel_* functions.
 281          *
 282          * No real locking needed, this code is run through
 283          * kstop_machine, or before SMP starts.
 284          */
 285
 286         /* read the text we want to modify */
 287         if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
 288                 return -EFAULT;
 289
 290         /* Make sure it is what we expect it to be */
 291         if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
 292                 return -EINVAL;
 293
 294         /* replace the text with the new text */
 295         if (do_ftrace_mod_code(ip, new_code))
 296                 return -EPERM;
 297
 298         sync_core();
 299
 300         return 0;
 301 }
 302
 303 int ftrace_make_nop(struct module *mod,
 304                     struct dyn_ftrace *rec, unsigned long addr)
 305 {
 306         unsigned char *new, *old;
 307         unsigned long ip = rec->ip;
 308
 309         old = ftrace_call_replace(ip, addr);
 310         new = ftrace_nop_replace();
 311
 312         return ftrace_modify_code(rec->ip, old, new);
 313 }
 314
 315 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 316 {
 317         unsigned char *new, *old;
 318         unsigned long ip = rec->ip;
 319
 320         old = ftrace_nop_replace();
 321         new = ftrace_call_replace(ip, addr);
 322
 323         return ftrace_modify_code(rec->ip, old, new);
 324 }
 325
 326 int ftrace_update_ftrace_func(ftrace_func_t func)
 327 {
 328         unsigned long ip = (unsigned long)(&ftrace_call);
 329         unsigned char old[MCOUNT_INSN_SIZE], *new;
 330         int ret;
 331
 332         memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
 333         new = ftrace_call_replace(ip, (unsigned long)func);
 334         ret = ftrace_modify_code(ip, old, new);
 335
 336         return ret;
 337 }
 338
 339 int __init ftrace_dyn_arch_init(void *data)
 340 {
 341         extern const unsigned char ftrace_test_p6nop[];
 342         extern const unsigned char ftrace_test_nop5[];
 343         extern const unsigned char ftrace_test_jmp[];
 344         int faulted = 0;
 345
 346         /*
 347          * There is no good nop for all x86 archs.
 348          * We will default to using the P6_NOP5, but first we
 349          * will test to make sure that the nop will actually
 350          * work on this CPU. If it faults, we will then
 351          * go to a lesser efficient 5 byte nop. If that fails
 352          * we then just use a jmp as our nop. This isn't the most
 353          * efficient nop, but we can not use a multi part nop
 354          * since we would then risk being preempted in the middle
 355          * of that nop, and if we enabled tracing then, it might
 356          * cause a system crash.
 357          *
 358          * TODO: check the cpuid to determine the best nop.
 359          */
 360         asm volatile (
 361                 "ftrace_test_jmp:"
 362                 "jmp ftrace_test_p6nop\n"
 363                 "nop\n"
 364                 "nop\n"
 365                 "nop\n"  /* 2 byte jmp + 3 bytes */
 366                 "ftrace_test_p6nop:"
 367                 P6_NOP5
 368                 "jmp 1f\n"
 369                 "ftrace_test_nop5:"
 370                 ".byte 0x66,0x66,0x66,0x66,0x90\n"
 371                 "1:"
 372                 ".section .fixup, \"ax\"\n"
 373                 "2:     movl $1, %0\n"
 374                 "       jmp ftrace_test_nop5\n"
 375                 "3:     movl $2, %0\n"
 376                 "       jmp 1b\n"
 377                 ".previous\n"
 378                 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
 379                 _ASM_EXTABLE(ftrace_test_nop5, 3b)
 380                 : "=r"(faulted) : "0" (faulted));
 381
 382         switch (faulted) {
 383         case 0:
 384                 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
 385                 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
 386                 break;
 387         case 1:
 388                 pr_info("converting mcount calls to 66 66 66 66 90\n");
 389                 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
 390                 break;
 391         case 2:
 392                 pr_info("converting mcount calls to jmp . + 5\n");
 393                 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
 394                 break;
 395         }
 396
 397         /* The return code is retured via data */
 398         *(unsigned long *)data = 0;
 399
 400         return 0;
 401 }
 402 #endif
 403
 404 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 405
 406 #ifdef CONFIG_DYNAMIC_FTRACE
 407 extern void ftrace_graph_call(void);
 408
 409 static int ftrace_mod_jmp(unsigned long ip,
 410                           int old_offset, int new_offset)
 411 {
 412         unsigned char code[MCOUNT_INSN_SIZE];
 413
 414         if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
 415                 return -EFAULT;
 416
 417         if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
 418                 return -EINVAL;
 419
 420         *(int *)(&code[1]) = new_offset;
 421
 422         if (do_ftrace_mod_code(ip, &code))
 423                 return -EPERM;
 424
 425         return 0;
 426 }
 427
 428 int ftrace_enable_ftrace_graph_caller(void)
 429 {
 430         unsigned long ip = (unsigned long)(&ftrace_graph_call);
 431         int old_offset, new_offset;
 432
 433         old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
 434         new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
 435
 436         return ftrace_mod_jmp(ip, old_offset, new_offset);
 437 }
 438
 439 int ftrace_disable_ftrace_graph_caller(void)
 440 {
 441         unsigned long ip = (unsigned long)(&ftrace_graph_call);
 442         int old_offset, new_offset;
 443
 444         old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
 445         new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
 446
 447         return ftrace_mod_jmp(ip, old_offset, new_offset);
 448 }
 449
#endif /* CONFIG_DYNAMIC_FTRACE */
 451
 452 /*
 453  * Hook the return address and push it in the stack of return addrs
 454  * in current thread info.
 455  */
 456 void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
 457                            unsigned long frame_pointer)
 458 {
 459         unsigned long old;
 460         int faulted;
 461         struct ftrace_graph_ent trace;
 462         unsigned long return_hooker = (unsigned long)
 463                                 &return_to_handler;
 464
 465         if (unlikely(atomic_read(&current->tracing_graph_pause)))
 466                 return;
 467
 468         /*
 469          * Protect against fault, even if it shouldn't
 470          * happen. This tool is too much intrusive to
 471          * ignore such a protection.
 472          */
 473         asm volatile(
 474                 "1: " _ASM_MOV " (%[parent]), %[old]\n"
 475                 "2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
 476                 "   movl $0, %[faulted]\n"
 477                 "3:\n"
 478
 479                 ".section .fixup, \"ax\"\n"
 480                 "4: movl $1, %[faulted]\n"
 481                 "   jmp 3b\n"
 482                 ".previous\n"
 483
 484                 _ASM_EXTABLE(1b, 4b)
 485                 _ASM_EXTABLE(2b, 4b)
 486
 487                 : [old] "=&r" (old), [faulted] "=r" (faulted)
 488                 : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
 489                 : "memory"
 490         );
 491
 492         if (unlikely(faulted)) {
 493                 ftrace_graph_stop();
 494                 WARN_ON(1);
 495                 return;
 496         }
 497
 498         if (ftrace_push_return_trace(old, self_addr, &trace.depth,
 499                     frame_pointer) == -EBUSY) {
 500                 *parent = old;
 501                 return;
 502         }
 503
 504         trace.func = self_addr;
 505
 506         /* Only trace if the calling function expects to */
 507         if (!ftrace_graph_entry(&trace)) {
 508                 current->curr_ret_stack--;
 509                 *parent = old;
 510         }
 511 }
 512 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */