Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)
diff --combined arch/s390/Kconfig

index 958f0da,74a2f1b..75976a1
--- 1/arch/s390/Kconfig
--- 2/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@@ -95,7 -95,6 +95,7 @@@ config S39
         select HAVE_KVM if 64BIT
         select HAVE_ARCH_TRACEHOOK
         select INIT_ALL_POSSIBLE
+ +      select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
         select HAVE_KERNEL_GZIP
         select HAVE_KERNEL_BZIP2
@@@ -199,6 -198,13 +199,13 @@@ config HOTPLUG_CP
           can be controlled through /sys/devices/system/cpu/cpu#.
           Say N if you want to disable CPU hotplug.
   
+ config SCHED_BOOK
+       bool "Book scheduler support"
+       depends on SMP
+       help
+         Book scheduler support improves the CPU scheduler's decision making
+         when dealing with machines that have several books.
+ 
   config MATHEMU
         bool "IEEE FPU emulation"
         depends on MARCH_G5
diff --combined arch/x86/Kconfig

index fd227d6,f4c70c2..89b88e3
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -25,7 -25,6 +25,7 @@@ config X8
         select HAVE_IDE
         select HAVE_OPROFILE
         select HAVE_PERF_EVENTS if (!M386 && !M486)
+ +      select HAVE_IRQ_WORK
         select HAVE_IOREMAP_PROT
         select HAVE_KPROBES
         select ARCH_WANT_OPTIONAL_GPIOLIB
@@@ -34,7 -33,6 +34,7 @@@
         select HAVE_KRETPROBES
         select HAVE_OPTPROBES
         select HAVE_FTRACE_MCOUNT_RECORD
+ +      select HAVE_C_RECORDMCOUNT
         select HAVE_DYNAMIC_FTRACE
         select HAVE_FUNCTION_TRACER
         select HAVE_FUNCTION_GRAPH_TRACER
@@@ -61,8 -59,6 +61,8 @@@
         select ANON_INODES
         select HAVE_ARCH_KMEMCHECK
         select HAVE_USER_RETURN_NOTIFIER
+ +      select HAVE_ARCH_JUMP_LABEL
+ +      select HAVE_TEXT_POKE_SMP
   
   config INSTRUCTION_DECODER
         def_bool (KPROBES || PERF_EVENTS)
@@@ -799,6 -795,17 +799,17 @@@ config SCHED_M
           making when dealing with multi-core CPU chips at a cost of slightly
           increased overhead in some places. If unsure say N here.
   
+ config IRQ_TIME_ACCOUNTING
+       bool "Fine granularity task level IRQ time accounting"
+       default n
+       ---help---
+         Select this option to enable fine granularity task irq time
+         accounting. This is done by reading a timestamp on each
+         transition between softirq and hardirq state, so there can be a
+         small performance impact.
+ 
+         If in doubt, say N here.
+ 
   source "kernel/Kconfig.preempt"
   
   config X86_UP_APIC
@@@ -2129,10 -2136,6 +2140,10 @@@ config HAVE_ATOMIC_IOMA
         def_bool y
         depends on X86_32
   
+ +config HAVE_TEXT_POKE_SMP
+ +      bool
+ +      select STOP_MACHINE if SMP
+ +
   source "net/Kconfig"
   
   source "drivers/Kconfig"
diff --combined include/linux/hardirq.h

index 1f4517d,ff43e92..96c323a
--- 1/include/linux/hardirq.h
--- 2/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@@ -64,6 -64,8 +64,8 @@@
   #define HARDIRQ_OFFSET        (1UL << HARDIRQ_SHIFT)
   #define NMI_OFFSET    (1UL << NMI_SHIFT)
   
+ #define SOFTIRQ_DISABLE_OFFSET        (2 * SOFTIRQ_OFFSET)
+ 
   #ifndef PREEMPT_ACTIVE
   #define PREEMPT_ACTIVE_BITS   1
   #define PREEMPT_ACTIVE_SHIFT  (NMI_SHIFT + NMI_BITS)
@@@ -82,10 -84,13 +84,13 @@@
   /*
    * Are we doing bottom half or hardware interrupt processing?
    * Are we in a softirq context? Interrupt context?
+  * in_softirq - Are we currently processing softirq or have bh disabled?
+  * in_serving_softirq - Are we currently processing softirq?
    */
   #define in_irq()              (hardirq_count())
   #define in_softirq()          (softirq_count())
   #define in_interrupt()                (irq_count())
+ #define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
   
   /*
    * Are we in NMI context?
@@@ -132,14 -137,16 +137,16 @@@ extern void synchronize_irq(unsigned in
   
   struct task_struct;
   
- #ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ #if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
   static inline void account_system_vtime(struct task_struct *tsk)
   {
   }
+ #else
+ extern void account_system_vtime(struct task_struct *tsk);
   #endif
   
   #if defined(CONFIG_NO_HZ)
- -#if defined(CONFIG_TINY_RCU)
+ +#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
   extern void rcu_enter_nohz(void);
   extern void rcu_exit_nohz(void);
   
diff --combined include/linux/sched.h

index 61b4ecf,2cca9a9..0383601
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -875,6 -875,7 +875,7 @@@ enum sched_domain_level 
         SD_LV_NONE = 0,
         SD_LV_SIBLING,
         SD_LV_MC,
+       SD_LV_BOOK,
         SD_LV_CPU,
         SD_LV_NODE,
         SD_LV_ALLNODES,
@@@ -1160,13 -1161,6 +1161,13 @@@ struct sched_rt_entity 
   
   struct rcu_node;
   
+ +enum perf_event_task_context {
+ +      perf_invalid_context = -1,
+ +      perf_hw_context = 0,
+ +      perf_sw_context,
+ +      perf_nr_task_contexts,
+ +};
+ +
   struct task_struct {
         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
         void *stack;
@@@ -1209,13 -1203,11 +1210,13 @@@
         unsigned int policy;
         cpumask_t cpus_allowed;
   
- -#ifdef CONFIG_TREE_PREEMPT_RCU
+ +#ifdef CONFIG_PREEMPT_RCU
         int rcu_read_lock_nesting;
         char rcu_read_unlock_special;
- -      struct rcu_node *rcu_blocked_node;
         struct list_head rcu_node_entry;
+ +#endif /* #ifdef CONFIG_PREEMPT_RCU */
+ +#ifdef CONFIG_TREE_PREEMPT_RCU
+ +      struct rcu_node *rcu_blocked_node;
   #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
   
   #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@@ -1297,9 -1289,9 +1298,9 @@@
         struct list_head cpu_timers[3];
   
   /* process credentials */
- -      const struct cred *real_cred;   /* objective and real subjective task
+ +      const struct cred __rcu *real_cred; /* objective and real subjective task
                                          * credentials (COW) */
- -      const struct cred *cred;        /* effective (overridable) subjective task
+ +      const struct cred __rcu *cred;  /* effective (overridable) subjective task
                                          * credentials (COW) */
         struct mutex cred_guard_mutex;  /* guard against foreign influences on
                                          * credential calculations
@@@ -1427,7 -1419,7 +1428,7 @@@
   #endif
   #ifdef CONFIG_CGROUPS
         /* Control Group info protected by css_set_lock */
- -      struct css_set *cgroups;
+ +      struct css_set __rcu *cgroups;
         /* cg_list protected by css_set_lock and tsk->alloc_lock */
         struct list_head cg_list;
   #endif
@@@ -1440,7 -1432,7 +1441,7 @@@
         struct futex_pi_state *pi_state_cache;
   #endif
   #ifdef CONFIG_PERF_EVENTS
- -      struct perf_event_context *perf_event_ctxp;
+ +      struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
         struct mutex perf_event_mutex;
         struct list_head perf_event_list;
   #endif
@@@ -1690,8 -1682,7 +1691,7 @@@ extern void thread_group_times(struct t
   /*
    * Per process flags
    */
- #define PF_ALIGNWARN  0x00000001      /* Print alignment warning msgs */
-                                       /* Not implemented yet, only for 486*/
+ #define PF_KSOFTIRQD  0x00000001      /* I am ksoftirqd */
   #define PF_STARTING   0x00000002      /* being created */
   #define PF_EXITING    0x00000004      /* getting shut down */
   #define PF_EXITPIDONE 0x00000008      /* pi exit done on shut down */
@@@ -1749,7 -1740,7 +1749,7 @@@
   #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
   #define used_math() tsk_used_math(current)
   
- -#ifdef CONFIG_TREE_PREEMPT_RCU
+ +#ifdef CONFIG_PREEMPT_RCU
   
   #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
   #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
@@@ -1758,9 -1749,7 +1758,9 @@@ static inline void rcu_copy_process(str
   {
         p->rcu_read_lock_nesting = 0;
         p->rcu_read_unlock_special = 0;
+ +#ifdef CONFIG_TREE_PREEMPT_RCU
         p->rcu_blocked_node = NULL;
+ +#endif
         INIT_LIST_HEAD(&p->rcu_node_entry);
   }
   
@@@ -1837,6 -1826,19 +1837,19 @@@ extern void sched_clock_idle_sleep_even
   extern void sched_clock_idle_wakeup_event(u64 delta_ns);
   #endif
   
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ /*
+  * An interface for opting in, at runtime, to irq time accounting based on
+  * sched_clock. The opt-in is explicit so that systems with a slow
+  * sched_clock do not pay a performance penalty.
+  */
+ extern void enable_sched_clock_irqtime(void);
+ extern void disable_sched_clock_irqtime(void);
+ #else
+ static inline void enable_sched_clock_irqtime(void) {}
+ static inline void disable_sched_clock_irqtime(void) {}
+ #endif
+ 
   extern unsigned long long
   task_sched_runtime(struct task_struct *task);
   extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
@@@ -2378,9 -2380,9 +2391,9 @@@ extern int __cond_resched_lock(spinlock
   
   extern int __cond_resched_softirq(void);
   
- #define cond_resched_softirq() ({                             \
-       __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET);      \
-       __cond_resched_softirq();                               \
+ #define cond_resched_softirq() ({                                     \
+       __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);      \
+       __cond_resched_softirq();                                       \
   })
   
   /*
diff --combined kernel/sched.c

index 5a5cc33,5998222..d42992b
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -426,9 -426,7 +426,7 @@@ struct root_domain 
          */
         cpumask_var_t rto_mask;
         atomic_t rto_count;
- #ifdef CONFIG_SMP
         struct cpupri cpupri;
- #endif
   };
   
   /*
@@@ -437,7 -435,7 +435,7 @@@
    */
   static struct root_domain def_root_domain;
   
- #endif
+ #endif /* CONFIG_SMP */
   
   /*
    * This is the main, per-CPU runqueue data structure.
@@@ -488,11 -486,12 +486,12 @@@ struct rq 
          */
         unsigned long nr_uninterruptible;
   
-       struct task_struct *curr, *idle;
+       struct task_struct *curr, *idle, *stop;
         unsigned long next_balance;
         struct mm_struct *prev_mm;
   
         u64 clock;
+       u64 clock_task;
   
         atomic_t nr_iowait;
   
@@@ -520,6 -519,10 +519,10 @@@
         u64 avg_idle;
   #endif
   
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       u64 prev_irq_time;
+ #endif
+ 
         /* calc_load related fields */
         unsigned long calc_load_update;
         long calc_load_active;
@@@ -643,10 -646,22 +646,22 @@@ static inline struct task_group *task_g
   
   #endif /* CONFIG_CGROUP_SCHED */
   
+ static u64 irq_time_cpu(int cpu);
+ static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+ 
   inline void update_rq_clock(struct rq *rq)
   {
-       if (!rq->skip_clock_update)
-               rq->clock = sched_clock_cpu(cpu_of(rq));
+       if (!rq->skip_clock_update) {
+               int cpu = cpu_of(rq);
+               u64 irq_time;
+ 
+               rq->clock = sched_clock_cpu(cpu);
+               irq_time = irq_time_cpu(cpu);
+               if (rq->clock - irq_time > rq->clock_task)
+                       rq->clock_task = rq->clock - irq_time;
+ 
+               sched_irq_time_avg_update(rq, irq_time);
+       }
   }
   
   /*
@@@ -723,7 -738,7 +738,7 @@@ sched_feat_write(struct file *filp, con
                 size_t cnt, loff_t *ppos)
   {
         char buf[64];
-       char *cmp = buf;
+       char *cmp;
         int neg = 0;
         int i;
   
@@@ -734,6 -749,7 +749,7 @@@
                 return -EFAULT;
   
         buf[cnt] = 0;
+       cmp = strstrip(buf);
   
         if (strncmp(buf, "NO_", 3) == 0) {
                 neg = 1;
@@@ -741,9 -757,7 +757,7 @@@
         }
   
         for (i = 0; sched_feat_names[i]; i++) {
-               int len = strlen(sched_feat_names[i]);
- 
-               if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+               if (strcmp(cmp, sched_feat_names[i]) == 0) {
                         if (neg)
                                 sysctl_sched_features &= ~(1UL << i);
                         else
@@@ -1840,7 -1854,7 +1854,7 @@@ static inline void __set_task_cpu(struc
   
   static const struct sched_class rt_sched_class;
   
- #define sched_class_highest (&rt_sched_class)
+ #define sched_class_highest (&stop_sched_class)
   #define for_each_class(class) \
      for (class = sched_class_highest; class; class = class->next)
   
@@@ -1858,12 -1872,6 +1872,6 @@@ static void dec_nr_running(struct rq *r
   
   static void set_load_weight(struct task_struct *p)
   {
-       if (task_has_rt_policy(p)) {
-               p->se.load.weight = 0;
-               p->se.load.inv_weight = WMULT_CONST;
-               return;
-       }
- 
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
@@@ -1917,13 -1925,132 +1925,132 @@@ static void deactivate_task(struct rq *
         dec_nr_running(rq);
   }
   
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ 
+ /*
+  * There are no locks covering percpu hardirq/softirq time.
+  * They are only modified in account_system_vtime, on corresponding CPU
+  * with interrupts disabled. So, writes are safe.
+  * They are read and saved off onto struct rq in update_rq_clock().
+  * This may result in other CPU reading this CPU's irq time and can
+  * race with irq/account_system_vtime on this CPU. We would either get old
+  * or new value (or semi updated value on 32 bit) with a side effect of
+  * accounting a slice of irq time to wrong task when irq is in progress
+  * while we read rq->clock. That is a worthy compromise in place of having
+  * locks on each irq in account_system_vtime.
+  */
+ static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+ static DEFINE_PER_CPU(u64, cpu_softirq_time);
+ 
+ static DEFINE_PER_CPU(u64, irq_start_time);
+ static int sched_clock_irqtime;
+ 
+ void enable_sched_clock_irqtime(void)
+ {
+       sched_clock_irqtime = 1;
+ }
+ 
+ void disable_sched_clock_irqtime(void)
+ {
+       sched_clock_irqtime = 0;
+ }
+ 
+ static u64 irq_time_cpu(int cpu)
+ {
+       if (!sched_clock_irqtime)
+               return 0;
+ 
+       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+ }
+ 
+ void account_system_vtime(struct task_struct *curr)
+ {
+       unsigned long flags;
+       int cpu;
+       u64 now, delta;
+ 
+       if (!sched_clock_irqtime)
+               return;
+ 
+       local_irq_save(flags);
+ 
+       cpu = smp_processor_id();
+       now = sched_clock_cpu(cpu);
+       delta = now - per_cpu(irq_start_time, cpu);
+       per_cpu(irq_start_time, cpu) = now;
+       /*
+        * We do not account for softirq time from ksoftirqd here.
+        * We want to continue accounting softirq time to ksoftirqd thread
+        * in that case, so as not to confuse scheduler with a special task
+        * that does not consume any time, but still wants to run.
+        */
+       if (hardirq_count())
+               per_cpu(cpu_hardirq_time, cpu) += delta;
+       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+               per_cpu(cpu_softirq_time, cpu) += delta;
+ 
+       local_irq_restore(flags);
+ }
+ EXPORT_SYMBOL_GPL(account_system_vtime);
+ 
+ static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+ {
+       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+               u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+               rq->prev_irq_time = curr_irq_time;
+               sched_rt_avg_update(rq, delta_irq);
+       }
+ }
+ 
+ #else
+ 
+ static u64 irq_time_cpu(int cpu)
+ {
+       return 0;
+ }
+ 
+ static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+ 
+ #endif
+ 
   #include "sched_idletask.c"
   #include "sched_fair.c"
   #include "sched_rt.c"
+ #include "sched_stoptask.c"
   #ifdef CONFIG_SCHED_DEBUG
   # include "sched_debug.c"
   #endif
   
+ void sched_set_stop_task(int cpu, struct task_struct *stop)
+ {
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+       struct task_struct *old_stop = cpu_rq(cpu)->stop;
+ 
+       if (stop) {
+               /*
+                * Make it appear like a SCHED_FIFO task, it's something
+                * userspace knows about and won't get confused about.
+                *
+                * Also, it will make PI more or less work without too
+                * much confusion -- but then, stop work should not
+                * rely on PI working anyway.
+                */
+               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+ 
+               stop->sched_class = &stop_sched_class;
+       }
+ 
+       cpu_rq(cpu)->stop = stop;
+ 
+       if (old_stop) {
+               /*
+                * Reset it back to a normal scheduling class so that
+                * it can die in pieces.
+                */
+               old_stop->sched_class = &rt_sched_class;
+       }
+ }
+ 
   /*
    * __normal_prio - return the priority that is based on the static prio
    */
@@@ -2003,6 -2130,9 +2130,9 @@@ task_hot(struct task_struct *p, u64 now
         if (p->sched_class != &fair_sched_class)
                 return 0;
   
+       if (unlikely(p->policy == SCHED_IDLE))
+               return 0;
+ 
         /*
          * Buddy candidates are cache hot:
          */
@@@ -2852,14 -2982,14 +2982,14 @@@ context_switch(struct rq *rq, struct ta
          */
         arch_start_context_switch(prev);
   
-       if (likely(!mm)) {
+       if (!mm) {
                 next->active_mm = oldmm;
                 atomic_inc(&oldmm->mm_count);
                 enter_lazy_tlb(oldmm, next);
         } else
                 switch_mm(oldmm, mm, next);
   
-       if (likely(!prev->mm)) {
+       if (!prev->mm) {
                 prev->active_mm = NULL;
                 rq->prev_mm = oldmm;
         }
@@@ -3248,7 -3378,7 +3378,7 @@@ static u64 do_task_delta_exec(struct ta
   
         if (task_current(rq, p)) {
                 update_rq_clock(rq);
-               ns = rq->clock - p->se.exec_start;
+               ns = rq->clock_task - p->se.exec_start;
                 if ((s64)ns < 0)
                         ns = 0;
         }
@@@ -3397,7 -3527,7 +3527,7 @@@ void account_system_time(struct task_st
         tmp = cputime_to_cputime64(cputime);
         if (hardirq_count() - hardirq_offset)
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
-       else if (softirq_count())
+       else if (in_serving_softirq())
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
@@@ -3584,7 -3714,7 +3714,7 @@@ void scheduler_tick(void
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
   
- -      perf_event_task_tick(curr);
+ +      perf_event_task_tick();
   
   #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
@@@ -3723,17 -3853,13 +3853,13 @@@ pick_next_task(struct rq *rq
                         return p;
         }
   
-       class = sched_class_highest;
-       for ( ; ; ) {
+       for_each_class(class) {
                 p = class->pick_next_task(rq);
                 if (p)
                         return p;
-               /*
-                * Will never be NULL as the idle class always
-                * returns a non-NULL p:
-                */
-               class = class->next;
         }
+ 
+       BUG(); /* the idle class will always have a runnable task */
   }
   
   /*
@@@ -4358,6 -4484,7 +4484,7 @@@ void rt_mutex_setprio(struct task_struc
   
         rq = task_rq_lock(p, &flags);
   
+       trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
         on_rq = p->se.on_rq;
@@@ -4645,7 -4772,7 +4772,7 @@@ recheck
         }
   
         if (user) {
- -              retval = security_task_setscheduler(p, policy, param);
+ +              retval = security_task_setscheduler(p);
                 if (retval)
                         return retval;
         }
@@@ -4661,6 -4788,15 +4788,15 @@@
          */
         rq = __task_rq_lock(p);
   
+       /*
+        * Changing the policy of the stop threads is a very bad idea
+        */
+       if (p == rq->stop) {
+               __task_rq_unlock(rq);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               return -EINVAL;
+       }
+ 
   #ifdef CONFIG_RT_GROUP_SCHED
         if (user) {
                 /*
@@@ -4887,13 -5023,13 +5023,13 @@@ long sched_setaffinity(pid_t pid, cons
         if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
   
- -      retval = security_task_setscheduler(p, 0, NULL);
+ +      retval = security_task_setscheduler(p);
         if (retval)
                 goto out_unlock;
   
         cpuset_cpus_allowed(p, cpus_allowed);
         cpumask_and(new_mask, in_mask, cpus_allowed);
-  again:
+ again:
         retval = set_cpus_allowed_ptr(p, new_mask);
   
         if (!retval) {
@@@ -5337,19 -5473,7 +5473,19 @@@ void __cpuinit init_idle(struct task_st
         idle->se.exec_start = sched_clock();
   
         cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ +      /*
+ +       * We're having a chicken and egg problem, even though we are
+ +       * holding rq->lock, the cpu isn't yet set to this cpu so the
+ +       * lockdep check in task_group() will fail.
+ +       *
+ +       * Similar case to sched_fork(). / Alternatively we could
+ +       * use task_rq_lock() here and obtain the other rq->lock.
+ +       *
+ +       * Silence PROVE_RCU
+ +       */
+ +      rcu_read_lock();
         __set_task_cpu(idle, cpu);
+ +      rcu_read_unlock();
   
         rq->curr = rq->idle = idle;
   #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@@ -6526,6 -6650,7 +6662,7 @@@ struct s_data 
         cpumask_var_t           nodemask;
         cpumask_var_t           this_sibling_map;
         cpumask_var_t           this_core_map;
+       cpumask_var_t           this_book_map;
         cpumask_var_t           send_covered;
         cpumask_var_t           tmpmask;
         struct sched_group      **sched_group_nodes;
@@@ -6537,6 -6662,7 +6674,7 @@@ enum s_alloc 
         sa_rootdomain,
         sa_tmpmask,
         sa_send_covered,
+       sa_this_book_map,
         sa_this_core_map,
         sa_this_sibling_map,
         sa_nodemask,
@@@ -6572,31 -6698,48 +6710,48 @@@ cpu_to_cpu_group(int cpu, const struct 
   #ifdef CONFIG_SCHED_MC
   static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
   static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
- #endif /* CONFIG_SCHED_MC */
   
- #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
   static int
   cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
                   struct sched_group **sg, struct cpumask *mask)
   {
         int group;
- 
+ #ifdef CONFIG_SCHED_SMT
         cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
         group = cpumask_first(mask);
+ #else
+       group = cpu;
+ #endif
         if (sg)
                 *sg = &per_cpu(sched_group_core, group).sg;
         return group;
   }
- #elif defined(CONFIG_SCHED_MC)
+ #endif /* CONFIG_SCHED_MC */
+ 
+ /*
+  * book sched-domains:
+  */
+ #ifdef CONFIG_SCHED_BOOK
+ static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+ 
   static int
- cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
-                 struct sched_group **sg, struct cpumask *unused)
+ cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
   {
+       int group = cpu;
+ #ifdef CONFIG_SCHED_MC
+       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
+ #elif defined(CONFIG_SCHED_SMT)
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+       group = cpumask_first(mask);
+ #endif
         if (sg)
-               *sg = &per_cpu(sched_group_core, cpu).sg;
-       return cpu;
+               *sg = &per_cpu(sched_group_book, group).sg;
+       return group;
   }
- #endif
+ #endif /* CONFIG_SCHED_BOOK */
   
   static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
   static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@@ -6606,7 -6749,10 +6761,10 @@@ cpu_to_phys_group(int cpu, const struc
                   struct sched_group **sg, struct cpumask *mask)
   {
         int group;
- #ifdef CONFIG_SCHED_MC
+ #ifdef CONFIG_SCHED_BOOK
+       cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
+ #elif defined(CONFIG_SCHED_MC)
         cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
         group = cpumask_first(mask);
   #elif defined(CONFIG_SCHED_SMT)
@@@ -6867,6 -7013,9 +7025,9 @@@ SD_INIT_FUNC(CPU
   #ifdef CONFIG_SCHED_MC
    SD_INIT_FUNC(MC)
   #endif
+ #ifdef CONFIG_SCHED_BOOK
+  SD_INIT_FUNC(BOOK)
+ #endif
   
   static int default_relax_domain_level = -1;
   
@@@ -6916,6 -7065,8 +7077,8 @@@ static void __free_domain_allocs(struc
                 free_cpumask_var(d->tmpmask); /* fall through */
         case sa_send_covered:
                 free_cpumask_var(d->send_covered); /* fall through */
+       case sa_this_book_map:
+               free_cpumask_var(d->this_book_map); /* fall through */
         case sa_this_core_map:
                 free_cpumask_var(d->this_core_map); /* fall through */
         case sa_this_sibling_map:
@@@ -6962,8 -7113,10 +7125,10 @@@ static enum s_alloc __visit_domain_allo
                 return sa_nodemask;
         if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
                 return sa_this_sibling_map;
-       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+       if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
                 return sa_this_core_map;
+       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+               return sa_this_book_map;
         if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
                 return sa_send_covered;
         d->rd = alloc_rootdomain();
@@@ -7021,6 -7174,23 +7186,23 @@@ static struct sched_domain *__build_cpu
         return sd;
   }
   
+ static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)
+ {
+       struct sched_domain *sd = parent;
+ #ifdef CONFIG_SCHED_BOOK
+       sd = &per_cpu(book_domains, i).sd;
+       SD_INIT(sd, BOOK);
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+       sd->parent = parent;
+       parent->child = sd;
+       cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+ #endif
+       return sd;
+ }
+ 
   static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
         const struct cpumask *cpu_map, struct sched_domain_attr *attr,
         struct sched_domain *parent, int i)
@@@ -7077,6 -7247,15 +7259,15 @@@ static void build_sched_groups(struct s
                                                 &cpu_to_core_group,
                                                 d->send_covered, d->tmpmask);
                 break;
+ #endif
+ #ifdef CONFIG_SCHED_BOOK
+       case SD_LV_BOOK: /* set up book groups */
+               cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+               if (cpu == cpumask_first(d->this_book_map))
+                       init_sched_build_groups(d->this_book_map, cpu_map,
+                                               &cpu_to_book_group,
+                                               d->send_covered, d->tmpmask);
+               break;
   #endif
         case SD_LV_CPU: /* set up physical groups */
                 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
@@@ -7125,12 -7304,14 +7316,14 @@@ static int __build_sched_domains(const 
   
                 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
                 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+               sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
                 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
                 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
         }
   
         for_each_cpu(i, cpu_map) {
                 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+               build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
                 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
         }
   
@@@ -7161,6 -7342,12 +7354,12 @@@
                 init_sched_groups_power(i, sd);
         }
   #endif
+ #ifdef CONFIG_SCHED_BOOK
+       for_each_cpu(i, cpu_map) {
+               sd = &per_cpu(book_domains, i).sd;
+               init_sched_groups_power(i, sd);
+       }
+ #endif
   
         for_each_cpu(i, cpu_map) {
                 sd = &per_cpu(phys_domains, i).sd;
@@@ -7186,6 -7373,8 +7385,8 @@@
                 sd = &per_cpu(cpu_domains, i).sd;
   #elif defined(CONFIG_SCHED_MC)
                 sd = &per_cpu(core_domains, i).sd;
+ #elif defined(CONFIG_SCHED_BOOK)
+               sd = &per_cpu(book_domains, i).sd;
   #else
                 sd = &per_cpu(phys_domains, i).sd;
   #endif
@@@ -8090,9 -8279,9 +8291,9 @@@ int alloc_fair_sched_group(struct task_
   
         return 1;
   
-  err_free_rq:
+ err_free_rq:
         kfree(cfs_rq);
-  err:
+ err:
         return 0;
   }
   
@@@ -8180,9 -8369,9 +8381,9 @@@ int alloc_rt_sched_group(struct task_gr
   
         return 1;
   
-  err_free_rq:
+ err_free_rq:
         kfree(rt_rq);
-  err:
+ err:
         return 0;
   }
   
@@@ -8540,7 -8729,7 +8741,7 @@@ static int tg_set_bandwidth(struct task
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
         }
         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-  unlock:
+ unlock:
         read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);
   
diff --combined kernel/sched_fair.c

index 5f996d3,74cccfa..933f3d1
--- 1/kernel/sched_fair.c
--- 2/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@ -25,7 -25,7 +25,7 @@@
   
   /*
    * Targeted preemption latency for CPU-bound tasks:
-  * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
    *
    * NOTE: this latency value is not the same as the concept of
    * 'timeslice length' - timeslices in CFS are of variable length
@@@ -52,7 -52,7 +52,7 @@@ enum sched_tunable_scaling sysctl_sched
   
   /*
    * Minimal preemption granularity for CPU-bound tasks:
-  * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
    */
   unsigned int sysctl_sched_min_granularity = 750000ULL;
   unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@@ -519,7 -519,7 +519,7 @@@ __update_curr(struct cfs_rq *cfs_rq, st
   static void update_curr(struct cfs_rq *cfs_rq)
   {
         struct sched_entity *curr = cfs_rq->curr;
-       u64 now = rq_of(cfs_rq)->clock;
+       u64 now = rq_of(cfs_rq)->clock_task;
         unsigned long delta_exec;
   
         if (unlikely(!curr))
@@@ -602,7 -602,7 +602,7 @@@ update_stats_curr_start(struct cfs_rq *
         /*
          * We are starting a new run period:
          */
-       se->exec_start = rq_of(cfs_rq)->clock;
+       se->exec_start = rq_of(cfs_rq)->clock_task;
   }
   
   /**************************************************
@@@ -1764,6 -1764,10 +1764,10 @@@ static void pull_task(struct rq *src_rq
         set_task_cpu(p, this_cpu);
         activate_task(this_rq, p, 0);
         check_preempt_curr(this_rq, p, 0);
+ 
+       /* re-arm NEWIDLE balancing when moving tasks */
+       src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+       this_rq->idle_stamp = 0;
   }
   
   /*
@@@ -1798,7 -1802,7 +1802,7 @@@ int can_migrate_task(struct task_struc
          * 2) too many balance attempts have failed.
          */
   
-       tsk_cache_hot = task_hot(p, rq->clock, sd);
+       tsk_cache_hot = task_hot(p, rq->clock_task, sd);
         if (!tsk_cache_hot ||
                 sd->nr_balance_failed > sd->cache_nice_tries) {
   #ifdef CONFIG_SCHEDSTATS
@@@ -2030,12 -2034,14 +2034,14 @@@ struct sd_lb_stats 
         unsigned long this_load;
         unsigned long this_load_per_task;
         unsigned long this_nr_running;
+       unsigned long this_has_capacity;
   
         /* Statistics of the busiest group */
         unsigned long max_load;
         unsigned long busiest_load_per_task;
         unsigned long busiest_nr_running;
         unsigned long busiest_group_capacity;
+       unsigned long busiest_has_capacity;
   
         int group_imb; /* Is there imbalance in this sd */
   #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@@ -2058,6 -2064,7 +2064,7 @@@ struct sg_lb_stats 
         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
         unsigned long group_capacity;
         int group_imb; /* Is there an imbalance in the group ? */
+       int group_has_capacity; /* Is there extra capacity in the group? */
   };
   
   /**
@@@ -2268,7 -2275,13 +2275,13 @@@ unsigned long scale_rt_power(int cpu
         u64 total, available;
   
         total = sched_avg_period() + (rq->clock - rq->age_stamp);
-       available = total - rq->rt_avg;
+ 
+       if (unlikely(total < rq->rt_avg)) {
+               /* Ensures that power won't end up being negative */
+               available = 0;
+       } else {
+               available = total - rq->rt_avg;
+       }
   
         if (unlikely((s64)total < SCHED_LOAD_SCALE))
                 total = SCHED_LOAD_SCALE;
@@@ -2378,7 -2391,7 +2391,7 @@@ static inline void update_sg_lb_stats(s
                         int local_group, const struct cpumask *cpus,
                         int *balance, struct sg_lb_stats *sgs)
   {
-       unsigned long load, max_cpu_load, min_cpu_load;
+       unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
         int i;
         unsigned int balance_cpu = -1, first_idle_cpu = 0;
         unsigned long avg_load_per_task = 0;
@@@ -2389,6 -2402,7 +2402,7 @@@
         /* Tally up the load of all CPUs in the group */
         max_cpu_load = 0;
         min_cpu_load = ~0UL;
+       max_nr_running = 0;
   
         for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                 struct rq *rq = cpu_rq(i);
@@@ -2406,8 -2420,10 +2420,10 @@@
                         load = target_load(i, load_idx);
                 } else {
                         load = source_load(i, load_idx);
-                       if (load > max_cpu_load)
+                       if (load > max_cpu_load) {
                                 max_cpu_load = load;
+                               max_nr_running = rq->nr_running;
+                       }
                         if (min_cpu_load > load)
                                 min_cpu_load = load;
                 }
@@@ -2447,13 -2463,15 +2463,15 @@@
         if (sgs->sum_nr_running)
                 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
   
-       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
                 sgs->group_imb = 1;
   
-       sgs->group_capacity =
-               DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+       sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
         if (!sgs->group_capacity)
                 sgs->group_capacity = fix_small_capacity(sd, group);
+ 
+       if (sgs->group_capacity > sgs->sum_nr_running)
+               sgs->group_has_capacity = 1;
   }
   
   /**
@@@ -2542,9 -2560,14 +2560,14 @@@ static inline void update_sd_lb_stats(s
                 /*
                  * In case the child domain prefers tasks go to siblings
                  * first, lower the sg capacity to one so that we'll try
-                * and move all the excess tasks away.
+                * and move all the excess tasks away. We lower the capacity
+                * of a group only if the local group has the capacity to fit
+                * these excess tasks, i.e. nr_running < group_capacity. The
+                * extra check prevents the case where you always pull from the
+                * heaviest group when it is already under-utilized (possible
+                * when a large weight task outweighs the tasks on the system).
                  */
-               if (prefer_sibling)
+               if (prefer_sibling && !local_group && sds->this_has_capacity)
                         sgs.group_capacity = min(sgs.group_capacity, 1UL);
   
                 if (local_group) {
@@@ -2552,12 -2575,14 +2575,14 @@@
                         sds->this = sg;
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
+                       sds->this_has_capacity = sgs.group_has_capacity;
                 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                         sds->max_load = sgs.avg_load;
                         sds->busiest = sg;
                         sds->busiest_nr_running = sgs.sum_nr_running;
                         sds->busiest_group_capacity = sgs.group_capacity;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
+                       sds->busiest_has_capacity = sgs.group_has_capacity;
                         sds->group_imb = sgs.group_imb;
                 }
   
@@@ -2754,6 -2779,7 +2779,7 @@@ static inline void calculate_imbalance(
                 return fix_small_imbalance(sds, this_cpu, imbalance);
   
   }
+ 
   /******* find_busiest_group() helpers end here *********************/
   
   /**
@@@ -2805,6 -2831,11 +2831,11 @@@ find_busiest_group(struct sched_domain 
         * 4) This group is more busy than the avg busyness at this
          *    sched_domain.
          * 5) The imbalance is within the specified limit.
+        *
+        * Note: when doing newidle balance, if the local group has excess
+        * capacity (i.e. nr_running < group_capacity) and the busiest group
+        * does not have any capacity, we force a load balance to pull tasks
+        * to the local group. In this case, we skip past checks 3, 4 and 5.
          */
         if (!(*balance))
                 goto ret;
@@@ -2816,6 -2847,11 +2847,11 @@@
         if (!sds.busiest || sds.busiest_nr_running == 0)
                 goto out_balanced;
   
+       /*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+       if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+                       !sds.busiest_has_capacity)
+               goto force_balance;
+ 
         if (sds.this_load >= sds.max_load)
                 goto out_balanced;
   
@@@ -2827,6 -2863,7 +2863,7 @@@
         if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
                 goto out_balanced;
   
+ force_balance:
         /* Looks like there is an imbalance. Compute it */
         calculate_imbalance(&sds, this_cpu, imbalance);
         return sds.busiest;
@@@ -3031,7 -3068,14 +3068,14 @@@ redo
   
         if (!ld_moved) {
                 schedstat_inc(sd, lb_failed[idle]);
-               sd->nr_balance_failed++;
+               /*
+                * Increment the failure counter only on periodic balance.
+                * We do not want newidle balance, which can be very
+                * frequent, to pollute the failure counter, causing
+                * excessive cache_hot migrations and active balances.
+                */
+               if (idle != CPU_NEWLY_IDLE)
+                       sd->nr_balance_failed++;
   
                 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
                                         this_cpu)) {
@@@ -3153,10 -3197,8 +3197,8 @@@ static void idle_balance(int this_cpu, 
                 interval = msecs_to_jiffies(sd->balance_interval);
                 if (time_after(next_balance, sd->last_balance + interval))
                         next_balance = sd->last_balance + interval;
-               if (pulled_task) {
-                       this_rq->idle_stamp = 0;
+               if (pulled_task)
                         break;
-               }
         }
   
         raw_spin_lock(&this_rq->lock);
@@@ -3751,11 -3793,8 +3793,11 @@@ static void task_fork_fair(struct task_
   
         update_rq_clock(rq);
   
- -      if (unlikely(task_cpu(p) != this_cpu))
+ +      if (unlikely(task_cpu(p) != this_cpu)) {
+ +              rcu_read_lock();
                 __set_task_cpu(p, this_cpu);
+ +              rcu_read_unlock();
+ +      }
   
         update_curr(cfs_rq);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)
		1	2
arch/s390/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/hardirq.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_fair.c	patch \|	diff1 \|	diff2 \|	blob \| history