arch/x86/xen/time.c

   1 /*
   2  * Xen time implementation.
   3  *
   4  * This is implemented in terms of a clocksource driver which uses
   5  * the hypervisor clock as a nanosecond timebase, and a clockevent
   6  * driver which uses the hypervisor's timer mechanism.
   7  *
   8  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
   9  */
  10 #include <linux/kernel.h>
  11 #include <linux/interrupt.h>
  12 #include <linux/clocksource.h>
  13 #include <linux/clockchips.h>
  14 #include <linux/kernel_stat.h>
  15 #include <linux/math64.h>
  16
  17 #include <asm/pvclock.h>
  18 #include <asm/xen/hypervisor.h>
  19 #include <asm/xen/hypercall.h>
  20
  21 #include <xen/events.h>
  22 #include <xen/interface/xen.h>
  23 #include <xen/interface/vcpu.h>
  24
  25 #include "xen-ops.h"
  26
/*
 * Shift used by xen_clocksource below; its mult is 1 << XEN_SHIFT, so
 * clocksource values are taken directly as nanoseconds.
 */
#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP      100000
/* Length of one kernel tick in nanoseconds (10^9 ns divided by HZ) */
#define NS_PER_TICK     (1000000000LL / HZ)

/* Forward declaration: used by xen_sched_clock() before its definition */
static cycle_t xen_clocksource_read(void);

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info, as of the last do_stolen_accounting() call */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/*
 * Nanoseconds of stolen and blocked time that did not add up to a whole
 * tick in do_stolen_accounting() and are carried over to the next call.
 */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);
  44
  45 /* return an consistent snapshot of 64-bit time/counter value */
  46 static u64 get64(const u64 *p)
  47 {
  48         u64 ret;
  49
  50         if (BITS_PER_LONG < 64) {
  51                 u32 *p32 = (u32 *)p;
  52                 u32 h, l;
  53
  54                 /*
  55                  * Read high then low, and then make sure high is
  56                  * still the same; this will only loop if low wraps
  57                  * and carries into high.
  58                  * XXX some clean way to make this endian-proof?
  59                  */
  60                 do {
  61                         h = p32[1];
  62                         barrier();
  63                         l = p32[0];
  64                         barrier();
  65                 } while (p32[1] != h);
  66
  67                 ret = (((u64)h) << 32) | l;
  68         } else
  69                 ret = *p;
  70
  71         return ret;
  72 }
  73
  74 /*
  75  * Runstate accounting
  76  */
  77 static void get_runstate_snapshot(struct vcpu_runstate_info *res)
  78 {
  79         u64 state_time;
  80         struct vcpu_runstate_info *state;
  81
  82         BUG_ON(preemptible());
  83
  84         state = &__get_cpu_var(runstate);
  85
  86         /*
  87          * The runstate info is always updated by the hypervisor on
  88          * the current CPU, so there's no need to use anything
  89          * stronger than a compiler barrier when fetching it.
  90          */
  91         do {
  92                 state_time = get64(&state->state_entry_time);
  93                 barrier();
  94                 *res = *state;
  95                 barrier();
  96         } while (get64(&state->state_entry_time) != state_time);
  97 }
  98
  99 /* return true when a vcpu could run but has no real cpu to run on */
 100 bool xen_vcpu_stolen(int vcpu)
 101 {
 102         return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
 103 }
 104
 105 static void setup_runstate_info(int cpu)
 106 {
 107         struct vcpu_register_runstate_memory_area area;
 108
 109         area.addr.v = &per_cpu(runstate, cpu);
 110
 111         if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
 112                                cpu, &area))
 113                 BUG();
 114 }
 115
 116 static void do_stolen_accounting(void)
 117 {
 118         struct vcpu_runstate_info state;
 119         struct vcpu_runstate_info *snap;
 120         s64 blocked, runnable, offline, stolen;
 121         cputime_t ticks;
 122
 123         get_runstate_snapshot(&state);
 124
 125         WARN_ON(state.state != RUNSTATE_running);
 126
 127         snap = &__get_cpu_var(runstate_snapshot);
 128
 129         /* work out how much time the VCPU has not been runn*ing*  */
 130         blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
 131         runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
 132         offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
 133
 134         *snap = state;
 135
 136         /* Add the appropriate number of ticks of stolen time,
 137            including any left-overs from last time.  Passing NULL to
 138            account_steal_time accounts the time as stolen. */
 139         stolen = runnable + offline + __get_cpu_var(residual_stolen);
 140
 141         if (stolen < 0)
 142                 stolen = 0;
 143
 144         ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
 145         __get_cpu_var(residual_stolen) = stolen;
 146         account_steal_time(NULL, ticks);
 147
 148         /* Add the appropriate number of ticks of blocked time,
 149            including any left-overs from last time.  Passing idle to
 150            account_steal_time accounts the time as idle/wait. */
 151         blocked += __get_cpu_var(residual_blocked);
 152
 153         if (blocked < 0)
 154                 blocked = 0;
 155
 156         ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
 157         __get_cpu_var(residual_blocked) = blocked;
 158         account_steal_time(idle_task(smp_processor_id()), ticks);
 159 }
 160
 161 /*
 162  * Xen sched_clock implementation.  Returns the number of unstolen
 163  * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
 164  * states.
 165  */
 166 unsigned long long xen_sched_clock(void)
 167 {
 168         struct vcpu_runstate_info state;
 169         cycle_t now;
 170         u64 ret;
 171         s64 offset;
 172
 173         /*
 174          * Ideally sched_clock should be called on a per-cpu basis
 175          * anyway, so preempt should already be disabled, but that's
 176          * not current practice at the moment.
 177          */
 178         preempt_disable();
 179
 180         now = xen_clocksource_read();
 181
 182         get_runstate_snapshot(&state);
 183
 184         WARN_ON(state.state != RUNSTATE_running);
 185
 186         offset = now - state.state_entry_time;
 187         if (offset < 0)
 188                 offset = 0;
 189
 190         ret = state.time[RUNSTATE_blocked] +
 191                 state.time[RUNSTATE_running] +
 192                 offset;
 193
 194         preempt_enable();
 195
 196         return ret;
 197 }
 198
 199
 200 /* Get the TSC speed from Xen */
 201 unsigned long xen_tsc_khz(void)
 202 {
 203         u64 xen_khz = 1000000ULL << 32;
 204         const struct pvclock_vcpu_time_info *info =
 205                 &HYPERVISOR_shared_info->vcpu_info[0].time;
 206
 207         do_div(xen_khz, info->tsc_to_system_mul);
 208         if (info->tsc_shift < 0)
 209                 xen_khz <<= -info->tsc_shift;
 210         else
 211                 xen_khz >>= info->tsc_shift;
 212
 213         return xen_khz;
 214 }
 215
 216 static cycle_t xen_clocksource_read(void)
 217 {
 218         struct pvclock_vcpu_time_info *src;
 219         cycle_t ret;
 220
 221         src = &get_cpu_var(xen_vcpu)->time;
 222         ret = pvclock_clocksource_read(src);
 223         put_cpu_var(xen_vcpu);
 224         return ret;
 225 }
 226
 227 static void xen_read_wallclock(struct timespec *ts)
 228 {
 229         struct shared_info *s = HYPERVISOR_shared_info;
 230         struct pvclock_wall_clock *wall_clock = &(s->wc);
 231         struct pvclock_vcpu_time_info *vcpu_time;
 232
 233         vcpu_time = &get_cpu_var(xen_vcpu)->time;
 234         pvclock_read_wallclock(wall_clock, vcpu_time, ts);
 235         put_cpu_var(xen_vcpu);
 236 }
 237
 238 unsigned long xen_get_wallclock(void)
 239 {
 240         struct timespec ts;
 241
 242         xen_read_wallclock(&ts);
 243         return ts.tv_sec;
 244 }
 245
 246 int xen_set_wallclock(unsigned long now)
 247 {
 248         /* do nothing for domU */
 249         return -1;
 250 }
 251
 252 static struct clocksource xen_clocksource __read_mostly = {
 253         .name = "xen",
 254         .rating = 400,
 255         .read = xen_clocksource_read,
 256         .mask = ~0,
 257         .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
 258         .shift = XEN_SHIFT,
 259         .flags = CLOCK_SOURCE_IS_CONTINUOUS,
 260 };
 261
 262 /*
 263    Xen clockevent implementation
 264
 265    Xen has two clockevent implementations:
 266
 267    The old timer_op one works with all released versions of Xen prior
 268    to version 3.0.4.  This version of the hypervisor provides a
 269    single-shot timer with nanosecond resolution.  However, sharing the
 270    same event channel is a 100Hz tick which is delivered while the
 271    vcpu is running.  We don't care about or use this tick, but it will
 272    cause the core time code to think the timer fired too soon, and
 273    will end up resetting it each time.  It could be filtered, but
 274    doing so has complications when the ktime clocksource is not yet
 275    the xen clocksource (ie, at boot time).
 276
 277    The new vcpu_op-based timer interface allows the tick timer period
 278    to be changed or turned off.  The tick timer is not useful as a
 279    periodic timer because events are only delivered to running vcpus.
 280    The one-shot timer can report when a timeout is in the past, so
 281    set_next_event is capable of returning -ETIME when appropriate.
 282    This interface is used when available.
 283 */
 284
 285
 286 /*
 287   Get a hypervisor absolute time.  In theory we could maintain an
 288   offset between the kernel's time and the hypervisor's time, and
 289   apply that to a kernel's absolute timeout.  Unfortunately the
 290   hypervisor and kernel times can drift even if the kernel is using
 291   the Xen clocksource, because ntp can warp the kernel's clocksource.
 292 */
 293 static s64 get_abs_timeout(unsigned long delta)
 294 {
 295         return xen_clocksource_read() + delta;
 296 }
 297
 298 static void xen_timerop_set_mode(enum clock_event_mode mode,
 299                                  struct clock_event_device *evt)
 300 {
 301         switch (mode) {
 302         case CLOCK_EVT_MODE_PERIODIC:
 303                 /* unsupported */
 304                 WARN_ON(1);
 305                 break;
 306
 307         case CLOCK_EVT_MODE_ONESHOT:
 308         case CLOCK_EVT_MODE_RESUME:
 309                 break;
 310
 311         case CLOCK_EVT_MODE_UNUSED:
 312         case CLOCK_EVT_MODE_SHUTDOWN:
 313                 HYPERVISOR_set_timer_op(0);  /* cancel timeout */
 314                 break;
 315         }
 316 }
 317
 318 static int xen_timerop_set_next_event(unsigned long delta,
 319                                       struct clock_event_device *evt)
 320 {
 321         WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
 322
 323         if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
 324                 BUG();
 325
 326         /* We may have missed the deadline, but there's no real way of
 327            knowing for sure.  If the event was in the past, then we'll
 328            get an immediate interrupt. */
 329
 330         return 0;
 331 }
 332
 333 static const struct clock_event_device xen_timerop_clockevent = {
 334         .name = "xen",
 335         .features = CLOCK_EVT_FEAT_ONESHOT,
 336
 337         .max_delta_ns = 0xffffffff,
 338         .min_delta_ns = TIMER_SLOP,
 339
 340         .mult = 1,
 341         .shift = 0,
 342         .rating = 500,
 343
 344         .set_mode = xen_timerop_set_mode,
 345         .set_next_event = xen_timerop_set_next_event,
 346 };
 347
 348
 349
 350 static void xen_vcpuop_set_mode(enum clock_event_mode mode,
 351                                 struct clock_event_device *evt)
 352 {
 353         int cpu = smp_processor_id();
 354
 355         switch (mode) {
 356         case CLOCK_EVT_MODE_PERIODIC:
 357                 WARN_ON(1);     /* unsupported */
 358                 break;
 359
 360         case CLOCK_EVT_MODE_ONESHOT:
 361                 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
 362                         BUG();
 363                 break;
 364
 365         case CLOCK_EVT_MODE_UNUSED:
 366         case CLOCK_EVT_MODE_SHUTDOWN:
 367                 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
 368                     HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
 369                         BUG();
 370                 break;
 371         case CLOCK_EVT_MODE_RESUME:
 372                 break;
 373         }
 374 }
 375
 376 static int xen_vcpuop_set_next_event(unsigned long delta,
 377                                      struct clock_event_device *evt)
 378 {
 379         int cpu = smp_processor_id();
 380         struct vcpu_set_singleshot_timer single;
 381         int ret;
 382
 383         WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
 384
 385         single.timeout_abs_ns = get_abs_timeout(delta);
 386         single.flags = VCPU_SSHOTTMR_future;
 387
 388         ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
 389
 390         BUG_ON(ret != 0 && ret != -ETIME);
 391
 392         return ret;
 393 }
 394
 395 static const struct clock_event_device xen_vcpuop_clockevent = {
 396         .name = "xen",
 397         .features = CLOCK_EVT_FEAT_ONESHOT,
 398
 399         .max_delta_ns = 0xffffffff,
 400         .min_delta_ns = TIMER_SLOP,
 401
 402         .mult = 1,
 403         .shift = 0,
 404         .rating = 500,
 405
 406         .set_mode = xen_vcpuop_set_mode,
 407         .set_next_event = xen_vcpuop_set_next_event,
 408 };
 409
/*
 * Clockevent template currently in use.  Starts out as the timer_op
 * variant; xen_time_init() may switch it to the vcpu_op variant.
 */
static const struct clock_event_device *xen_clockevent =
        &xen_timerop_clockevent;
/* Per-CPU clockevent device, filled from *xen_clockevent in xen_setup_timer() */
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
 413
 414 static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 415 {
 416         struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
 417         irqreturn_t ret;
 418
 419         ret = IRQ_NONE;
 420         if (evt->event_handler) {
 421                 evt->event_handler(evt);
 422                 ret = IRQ_HANDLED;
 423         }
 424
 425         do_stolen_accounting();
 426
 427         return ret;
 428 }
 429
 430 void xen_setup_timer(int cpu)
 431 {
 432         const char *name;
 433         struct clock_event_device *evt;
 434         int irq;
 435
 436         printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
 437
 438         name = kasprintf(GFP_KERNEL, "timer%d", cpu);
 439         if (!name)
 440                 name = "<timer kasprintf failed>";
 441
 442         irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
 443                                       IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
 444                                       name, NULL);
 445
 446         evt = &per_cpu(xen_clock_events, cpu);
 447         memcpy(evt, xen_clockevent, sizeof(*evt));
 448
 449         evt->cpumask = cpumask_of_cpu(cpu);
 450         evt->irq = irq;
 451
 452         setup_runstate_info(cpu);
 453 }
 454
 455 void xen_setup_cpu_clockevents(void)
 456 {
 457         BUG_ON(preemptible());
 458
 459         clockevents_register_device(&__get_cpu_var(xen_clock_events));
 460 }
 461
 462 void xen_timer_resume(void)
 463 {
 464         int cpu;
 465
 466         if (xen_clockevent != &xen_vcpuop_clockevent)
 467                 return;
 468
 469         for_each_online_cpu(cpu) {
 470                 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
 471                         BUG();
 472         }
 473 }
 474
 475 __init void xen_time_init(void)
 476 {
 477         int cpu = smp_processor_id();
 478
 479         clocksource_register(&xen_clocksource);
 480
 481         if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
 482                 /* Successfully turned off 100Hz tick, so we have the
 483                    vcpuop-based timer interface */
 484                 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
 485                 xen_clockevent = &xen_vcpuop_clockevent;
 486         }
 487
 488         /* Set initial system time with full resolution */
 489         xen_read_wallclock(&xtime);
 490         set_normalized_timespec(&wall_to_monotonic,
 491                                 -xtime.tv_sec, -xtime.tv_nsec);
 492
 493         setup_force_cpu_cap(X86_FEATURE_TSC);
 494
 495         xen_setup_timer(cpu);
 496         xen_setup_cpu_clockevents();
 497 }