Merge tag 'sched-cputime-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel...

author Ingo Molnar <mingo@kernel.org>

Sat, 8 Dec 2012 14:44:43 +0000 (15:44 +0100)

committer Ingo Molnar <mingo@kernel.org>

Sat, 8 Dec 2012 14:44:43 +0000 (15:44 +0100)
author Ingo Molnar <mingo@kernel.org>
Sat, 8 Dec 2012 14:44:43 +0000 (15:44 +0100)
committer Ingo Molnar <mingo@kernel.org>
Sat, 8 Dec 2012 14:44:43 +0000 (15:44 +0100)
diff --combined include/linux/kvm_host.h

index 3738c26,f17158b..d5cddd8
--- 1/include/linux/kvm_host.h
--- 2/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@@ -42,8 -42,19 +42,8 @@@
    */
   #define KVM_MEMSLOT_INVALID   (1UL << 16)
   
- -/*
- - * If we support unaligned MMIO, at most one fragment will be split into two:
- - */
- -#ifdef KVM_UNALIGNED_MMIO
- -#  define KVM_EXTRA_MMIO_FRAGMENTS 1
- -#else
- -#  define KVM_EXTRA_MMIO_FRAGMENTS 0
- -#endif
- -
- -#define KVM_USER_MMIO_SIZE 8
- -
- -#define KVM_MAX_MMIO_FRAGMENTS \
- -      (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
+ +/* Two fragments, for an MMIO access that crosses a page boundary. */
+ +#define KVM_MAX_MMIO_FRAGMENTS        2
   
   /*
    * For the normal pfn, the highest 12 bits should be zero,
@@@ -730,7 -741,7 +730,7 @@@ static inline void kvm_guest_enter(void
          * This is running in ioctl context so we can avoid
          * the call to vtime_account() with its unnecessary idle check.
          */
-       vtime_account_system(current);
+       vtime_account_system_irqsafe(current);
         current->flags |= PF_VCPU;
         /* KVM does not hold any references to rcu protected data when it
          * switches CPU into a guest mode. In fact switching to a guest mode
@@@ -748,7 -759,7 +748,7 @@@ static inline void kvm_guest_exit(void
          * This is running in ioctl context so we can avoid
          * the call to vtime_account() with its unnecessary idle check.
          */
-       vtime_account_system(current);
+       vtime_account_system_irqsafe(current);
         current->flags &= ~PF_VCPU;
   }
   
diff --combined kernel/sched/cputime.c

index b7f7317,80b2fd5..293b202
--- 1/kernel/sched/cputime.c
--- 2/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@@ -288,34 -288,6 +288,34 @@@ static __always_inline bool steal_accou
         return false;
   }
   
+ +/*
+ + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ + * tasks (sum on group iteration) belonging to @tsk's group.
+ + */
+ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+ +{
+ +      struct signal_struct *sig = tsk->signal;
+ +      struct task_struct *t;
+ +
+ +      times->utime = sig->utime;
+ +      times->stime = sig->stime;
+ +      times->sum_exec_runtime = sig->sum_sched_runtime;
+ +
+ +      rcu_read_lock();
+ +      /* make sure we can trust tsk->thread_group list */
+ +      if (!likely(pid_alive(tsk)))
+ +              goto out;
+ +
+ +      t = tsk;
+ +      do {
+ +              times->utime += t->utime;
+ +              times->stime += t->stime;
+ +              times->sum_exec_runtime += task_sched_runtime(t);
+ +      } while_each_thread(tsk, t);
+ +out:
+ +      rcu_read_unlock();
+ +}
+ +
   #ifndef CONFIG_VIRT_CPU_ACCOUNTING
   
   #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@@ -445,13 -417,13 +445,13 @@@ void account_idle_ticks(unsigned long t
    * Use precise platform statistics if available:
    */
   #ifdef CONFIG_VIRT_CPU_ACCOUNTING
- -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+ +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
   {
         *ut = p->utime;
         *st = p->stime;
   }
   
- -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
   {
         struct task_cputime cputime;
   
@@@ -461,20 -433,33 +461,33 @@@
         *st = cputime.stime;
   }
   
- void vtime_account_system(struct task_struct *tsk)
+ void vtime_account_system_irqsafe(struct task_struct *tsk)
   {
         unsigned long flags;
   
         local_irq_save(flags);
-       __vtime_account_system(tsk);
+       vtime_account_system(tsk);
         local_irq_restore(flags);
   }
- EXPORT_SYMBOL_GPL(vtime_account_system);
+ EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
+ 
+ #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+ void vtime_task_switch(struct task_struct *prev)
+ {
+       if (is_idle_task(prev))
+               vtime_account_idle(prev);
+       else
+               vtime_account_system(prev);
+ 
+       vtime_account_user(prev);
+       arch_vtime_task_switch(prev);
+ }
+ #endif
   
   /*
    * Archs that account the whole time spent in the idle task
    * (outside irq) as idle time can rely on this and just implement
-  * __vtime_account_system() and __vtime_account_idle(). Archs that
+  * vtime_account_system() and vtime_account_idle(). Archs that
    * have other meaning of the idle time (s390 only includes the
    * time spent by the CPU when it's in low power mode) must override
    * vtime_account().
@@@ -482,16 -467,10 +495,10 @@@
   #ifndef __ARCH_HAS_VTIME_ACCOUNT
   void vtime_account(struct task_struct *tsk)
   {
-       unsigned long flags;
- 
-       local_irq_save(flags);
- 
         if (in_interrupt() || !is_idle_task(tsk))
-               __vtime_account_system(tsk);
+               vtime_account_system(tsk);
         else
-               __vtime_account_idle(tsk);
- 
-       local_irq_restore(flags);
+               vtime_account_idle(tsk);
   }
   EXPORT_SYMBOL_GPL(vtime_account);
   #endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@@ -516,30 -495,14 +523,30 @@@ static cputime_t scale_utime(cputime_t 
         return (__force cputime_t) temp;
   }
   
- -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+ +/*
+ + * Adjust tick based cputime random precision against scheduler
+ + * runtime accounting.
+ + */
+ +static void cputime_adjust(struct task_cputime *curr,
+ +                         struct cputime *prev,
+ +                         cputime_t *ut, cputime_t *st)
   {
- -      cputime_t rtime, utime = p->utime, total = utime + p->stime;
+ +      cputime_t rtime, utime, total;
+ +
+ +      utime = curr->utime;
+ +      total = utime + curr->stime;
   
         /*
- -       * Use CFS's precise accounting:
+ +       * Tick based cputime accounting depends on whether the random
+ +       * scheduling timeslices of a task are interrupted by the timer.
+ +       * Depending on these circumstances, the number of these interrupts
+ +       * may over- or under-estimate the real user and system cputime,
+ +       * matching it with a variable precision.
+ +       *
+ +       * Fix this by scaling these tick based values against the total
+ +       * runtime accounted by the CFS scheduler.
          */
- -      rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
+ +      rtime = nsecs_to_cputime(curr->sum_exec_runtime);
   
         if (total)
                 utime = scale_utime(utime, rtime, total);
@@@ -547,36 -510,38 +554,36 @@@
                 utime = rtime;
   
         /*
- -       * Compare with previous values, to keep monotonicity:
+ +       * If the tick based count grows faster than the scheduler one,
+ +       * the result of the scaling may go backward.
+ +       * Let's enforce monotonicity.
          */
- -      p->prev_utime = max(p->prev_utime, utime);
- -      p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+ +      prev->utime = max(prev->utime, utime);
+ +      prev->stime = max(prev->stime, rtime - prev->utime);
   
- -      *ut = p->prev_utime;
- -      *st = p->prev_stime;
+ +      *ut = prev->utime;
+ +      *st = prev->stime;
+ +}
+ +
+ +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+ +{
+ +      struct task_cputime cputime = {
+ +              .utime = p->utime,
+ +              .stime = p->stime,
+ +              .sum_exec_runtime = p->se.sum_exec_runtime,
+ +      };
+ +
+ +      cputime_adjust(&cputime, &p->prev_cputime, ut, st);
   }
   
   /*
    * Must be called with siglock held.
    */
- -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
   {
- -      struct signal_struct *sig = p->signal;
         struct task_cputime cputime;
- -      cputime_t rtime, utime, total;
   
         thread_group_cputime(p, &cputime);
- -
- -      total = cputime.utime + cputime.stime;
- -      rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
- -
- -      if (total)
- -              utime = scale_utime(cputime.utime, rtime, total);
- -      else
- -              utime = rtime;
- -
- -      sig->prev_utime = max(sig->prev_utime, utime);
- -      sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
- -
- -      *ut = sig->prev_utime;
- -      *st = sig->prev_stime;
+ +      cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
   }
   #endif
author	Ingo Molnar <mingo@kernel.org>
	Sat, 8 Dec 2012 14:44:43 +0000 (15:44 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Sat, 8 Dec 2012 14:44:43 +0000 (15:44 +0100)
		1	2
include/linux/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/cputime.c	patch \|	diff1 \|	diff2 \|	blob \| history