Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 24 Jul 2011 16:07:03 +0000 (09:07 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sun, 24 Jul 2011 16:07:03 +0000 (09:07 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Jul 2011 16:07:03 +0000 (09:07 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Jul 2011 16:07:03 +0000 (09:07 -0700)
diff --combined Documentation/kernel-parameters.txt

index aa47be7,1810a6b..40cc653
--- 1/Documentation/kernel-parameters.txt
--- 2/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -1159,10 -1159,6 +1159,6 @@@ bytes respectively. Such letter suffixe
                         for all guests.
                         Default is 1 (enabled) if in 64bit or 32bit-PAE mode
   
-       kvm-intel.bypass_guest_pf=
-                       [KVM,Intel] Disables bypassing of guest page faults
-                       on Intel chips. Default is 1 (enabled)
- 
         kvm-intel.ept=  [KVM,Intel] Disable extended page tables
                         (virtualized MMU) support on capable Intel chips.
                         Default is 1 (enabled)
@@@ -1737,6 -1733,10 +1733,10 @@@
         no-kvmapf       [X86,KVM] Disable paravirtualized asynchronous page
                         fault handling.
   
+       no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
+                       steal time is computed, but won't influence scheduler
+                       behaviour
+ 
         nolapic         [X86-32,APIC] Do not enable or use the local APIC.
   
         nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
@@@ -2015,8 -2015,6 +2015,8 @@@
                                 the default.
                                 off: Turn ECRC off
                                 on: Turn ECRC on.
+ +              realloc         reallocate PCI resources if allocations done by BIOS
+ +                              are erroneous.
   
         pcie_aspm=      [PCIE] Forcibly enable or disable PCIe Active State Power
                         Management.
diff --combined arch/powerpc/kvm/Kconfig

index 105b691,eeb42e0..78133de
--- 1/arch/powerpc/kvm/Kconfig
--- 2/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@@ -20,7 -20,6 +20,6 @@@ config KV
         bool
         select PREEMPT_NOTIFIERS
         select ANON_INODES
-       select KVM_MMIO
   
   config KVM_BOOK3S_HANDLER
         bool
@@@ -28,16 -27,22 +27,22 @@@
   config KVM_BOOK3S_32_HANDLER
         bool
         select KVM_BOOK3S_HANDLER
+       select KVM_MMIO
   
   config KVM_BOOK3S_64_HANDLER
         bool
         select KVM_BOOK3S_HANDLER
   
+ config KVM_BOOK3S_PR
+       bool
+       select KVM_MMIO
+ 
   config KVM_BOOK3S_32
         tristate "KVM support for PowerPC book3s_32 processors"
         depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
         select KVM
         select KVM_BOOK3S_32_HANDLER
+       select KVM_BOOK3S_PR
         ---help---
           Support running unmodified book3s_32 guest kernels
           in virtual machines on book3s_32 host processors.
@@@ -50,8 -55,8 +55,8 @@@
   config KVM_BOOK3S_64
         tristate "KVM support for PowerPC book3s_64 processors"
         depends on EXPERIMENTAL && PPC_BOOK3S_64
-       select KVM
         select KVM_BOOK3S_64_HANDLER
+       select KVM
         ---help---
           Support running unmodified book3s_64 and book3s_32 guest kernels
           in virtual machines on book3s_64 host processors.
@@@ -61,10 -66,34 +66,34 @@@
   
           If unsure, say N.
   
+ config KVM_BOOK3S_64_HV
+       bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
+       depends on KVM_BOOK3S_64
+       ---help---
+         Support running unmodified book3s_64 guest kernels in
+         virtual machines on POWER7 and PPC970 processors that have
+         hypervisor mode available to the host.
+ 
+         If you say Y here, KVM will use the hardware virtualization
+         facilities of POWER7 (and later) processors, meaning that
+         guest operating systems will run at full hardware speed
+         using supervisor and user modes.  However, this also means
+         that KVM is not usable under PowerVM (pHyp), is only usable
+         on POWER7 (or later) processors and PPC970-family processors,
+         and cannot emulate a different processor from the host processor.
+ 
+         If unsure, say N.
+ 
+ config KVM_BOOK3S_64_PR
+       def_bool y
+       depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV
+       select KVM_BOOK3S_PR
+ 
   config KVM_440
         bool "KVM support for PowerPC 440 processors"
         depends on EXPERIMENTAL && 44x
         select KVM
+       select KVM_MMIO
         ---help---
           Support running unmodified 440 guest kernels in virtual machines on
           440 host processors.
@@@ -89,6 -118,7 +118,7 @@@ config KVM_E50
         bool "KVM support for PowerPC E500 processors"
         depends on EXPERIMENTAL && E500
         select KVM
+       select KVM_MMIO
         ---help---
           Support running unmodified E500 guest kernels in virtual machines on
           E500 host processors.
@@@ -99,5 -129,6 +129,5 @@@
           If unsure, say N.
   
   source drivers/vhost/Kconfig
- -source drivers/virtio/Kconfig
   
   endif # VIRTUALIZATION
diff --combined arch/x86/Kconfig

index b212754,1f03e22..a67e014
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -20,7 -20,6 +20,7 @@@ config X8
         select HAVE_UNSTABLE_SCHED_CLOCK
         select HAVE_IDE
         select HAVE_OPROFILE
+ +      select HAVE_PCSPKR_PLATFORM
         select HAVE_PERF_EVENTS
         select HAVE_IRQ_WORK
         select HAVE_IOREMAP_PROT
@@@ -71,7 -70,6 +71,7 @@@
         select IRQ_FORCED_THREADING
         select USE_GENERIC_SMP_HELPERS if SMP
         select HAVE_BPF_JIT if (X86_64 && NET)
+ +      select CLKEVT_I8253
   
   config INSTRUCTION_DECODER
         def_bool (KPROBES || PERF_EVENTS)
@@@ -95,10 -93,6 +95,10 @@@ config CLOCKSOURCE_WATCHDO
   config GENERIC_CLOCKEVENTS
         def_bool y
   
+ +config ARCH_CLOCKSOURCE_DATA
+ +      def_bool y
+ +      depends on X86_64
+ +
   config GENERIC_CLOCKEVENTS_BROADCAST
         def_bool y
         depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
@@@ -390,21 -384,12 +390,21 @@@ config X86_INTEL_C
           This option compiles in support for the CE4100 SOC for settop
           boxes and media devices.
   
+ +config X86_INTEL_MID
+ +      bool "Intel MID platform support"
+ +      depends on X86_32
+ +      depends on X86_EXTENDED_PLATFORM
+ +      ---help---
+ +        Select to build a kernel capable of supporting Intel MID platform
+ +        systems which do not have the PCI legacy interfaces (Moorestown,
+ +        Medfield). If you are building for a PC class system say N here.
+ +
+ +if X86_INTEL_MID
+ +
   config X86_MRST
          bool "Moorestown MID platform"
         depends on PCI
         depends on PCI_GOANY
- -      depends on X86_32
- -      depends on X86_EXTENDED_PLATFORM
         depends on X86_IO_APIC
         select APB_TIMER
         select I2C
@@@ -419,8 -404,6 +419,8 @@@
           nor standard legacy replacement devices/features. e.g. Moorestown does
           not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
   
+ +endif
+ +
   config X86_RDC321X
         bool "RDC R-321x SoC"
         depends on X86_32
@@@ -529,6 -512,18 +529,18 @@@ menuconfig PARAVIRT_GUES
   
   if PARAVIRT_GUEST
   
+ config PARAVIRT_TIME_ACCOUNTING
+       bool "Paravirtual steal time accounting"
+       select PARAVIRT
+       default n
+       ---help---
+         Select this option to enable fine granularity task steal time
+         accounting. Time spent executing other tasks in parallel with
+         the current vCPU is discounted from the vCPU power. To account for
+         that, there can be a small performance impact.
+ 
+         If in doubt, say N here.
+ 
   source "arch/x86/xen/Kconfig"
   
   config KVM_CLOCK
@@@ -634,7 -629,6 +646,7 @@@ config HPET_EMULATE_RT
   config APB_TIMER
          def_bool y if MRST
          prompt "Langwell APB Timer Support" if X86_MRST
+ +       select DW_APB_TIMER
          help
            APB timer is the replacement for 8254, HPET on X86 MID platforms.
            The APBT provides a stable time base on SMP
@@@ -698,6 -692,33 +710,6 @@@ config CALGARY_IOMMU_ENABLED_BY_DEFAUL
           Calgary anyway, pass 'iommu=calgary' on the kernel command line.
           If unsure, say Y.
   
- -config AMD_IOMMU
- -      bool "AMD IOMMU support"
- -      select SWIOTLB
- -      select PCI_MSI
- -      select PCI_IOV
- -      depends on X86_64 && PCI && ACPI
- -      ---help---
- -        With this option you can enable support for AMD IOMMU hardware in
- -        your system. An IOMMU is a hardware component which provides
- -        remapping of DMA memory accesses from devices. With an AMD IOMMU you
- -        can isolate the the DMA memory of different devices and protect the
- -        system from misbehaving device drivers or hardware.
- -
- -        You can find out if your system has an AMD IOMMU if you look into
- -        your BIOS for an option to enable it or if you have an IVRS ACPI
- -        table.
- -
- -config AMD_IOMMU_STATS
- -      bool "Export AMD IOMMU statistics to debugfs"
- -      depends on AMD_IOMMU
- -      select DEBUG_FS
- -      ---help---
- -        This option enables code in the AMD IOMMU driver to collect various
- -        statistics about whats happening in the driver and exports that
- -        information to userspace via debugfs.
- -        If unsure, say N.
- -
   # need this always selected by IOMMU for the VIA workaround
   config SWIOTLB
         def_bool y if X86_64
@@@ -711,6 -732,9 +723,6 @@@
   config IOMMU_HELPER
         def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
   
- -config IOMMU_API
- -      def_bool (AMD_IOMMU || DMAR)
- -
   config MAXSMP
         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
         depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
@@@ -1158,7 -1182,7 +1170,7 @@@ comment "NUMA (Summit) requires SMP, 64
   config AMD_NUMA
         def_bool y
         prompt "Old style AMD Opteron NUMA detection"
- -      depends on NUMA && PCI
+ +      depends on X86_64 && NUMA && PCI
         ---help---
           Enable AMD NUMA node topology detection.  You should say Y here if
           you have a multi processor AMD system. This uses an old method to
@@@ -1930,6 -1954,55 +1942,6 @@@ config PCI_CNB20LE_QUIR
   
           You should say N unless you know you need this.
   
- -config DMAR
- -      bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
- -      depends on PCI_MSI && ACPI && EXPERIMENTAL
- -      help
- -        DMA remapping (DMAR) devices support enables independent address
- -        translations for Direct Memory Access (DMA) from devices.
- -        These DMA remapping devices are reported via ACPI tables
- -        and include PCI device scope covered by these DMA
- -        remapping devices.
- -
- -config DMAR_DEFAULT_ON
- -      def_bool y
- -      prompt "Enable DMA Remapping Devices by default"
- -      depends on DMAR
- -      help
- -        Selecting this option will enable a DMAR device at boot time if
- -        one is found. If this option is not selected, DMAR support can
- -        be enabled by passing intel_iommu=on to the kernel. It is
- -        recommended you say N here while the DMAR code remains
- -        experimental.
- -
- -config DMAR_BROKEN_GFX_WA
- -      bool "Workaround broken graphics drivers (going away soon)"
- -      depends on DMAR && BROKEN
- -      ---help---
- -        Current Graphics drivers tend to use physical address
- -        for DMA and avoid using DMA APIs. Setting this config
- -        option permits the IOMMU driver to set a unity map for
- -        all the OS-visible memory. Hence the driver can continue
- -        to use physical addresses for DMA, at least until this
- -        option is removed in the 2.6.32 kernel.
- -
- -config DMAR_FLOPPY_WA
- -      def_bool y
- -      depends on DMAR
- -      ---help---
- -        Floppy disk drivers are known to bypass DMA API calls
- -        thereby failing to work when IOMMU is enabled. This
- -        workaround will setup a 1:1 mapping for the first
- -        16MiB to make floppy (an ISA device) work.
- -
- -config INTR_REMAP
- -      bool "Support for Interrupt Remapping (EXPERIMENTAL)"
- -      depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
- -      ---help---
- -        Supports Interrupt remapping for IO-APIC and MSI devices.
- -        To use x2apic mode in the CPU's which support x2APIC enhancements or
- -        to support platforms with CPU's having > 8 bit APIC ID, say Y.
- -
   source "drivers/pci/pcie/Kconfig"
   
   source "drivers/pci/Kconfig"
diff --combined arch/x86/include/asm/msr-index.h

index d96bdb2,e3022cc..d52609a
--- 1/arch/x86/include/asm/msr-index.h
--- 2/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@@ -259,9 -259,6 +259,9 @@@
   #define MSR_IA32_TEMPERATURE_TARGET   0x000001a2
   
   #define MSR_IA32_ENERGY_PERF_BIAS     0x000001b0
+ +#define ENERGY_PERF_BIAS_PERFORMANCE  0
+ +#define ENERGY_PERF_BIAS_NORMAL               6
+ +#define ENERGY_PERF_BIAS_POWERSAVE    15
   
   #define MSR_IA32_PACKAGE_THERM_STATUS         0x000001b1
   
@@@ -441,6 -438,18 +441,18 @@@
   #define MSR_IA32_VMX_VMCS_ENUM          0x0000048a
   #define MSR_IA32_VMX_PROCBASED_CTLS2    0x0000048b
   #define MSR_IA32_VMX_EPT_VPID_CAP       0x0000048c
+ #define MSR_IA32_VMX_TRUE_PINBASED_CTLS  0x0000048d
+ #define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+ #define MSR_IA32_VMX_TRUE_EXIT_CTLS      0x0000048f
+ #define MSR_IA32_VMX_TRUE_ENTRY_CTLS     0x00000490
+ 
+ /* VMX_BASIC bits and bitmasks */
+ #define VMX_BASIC_VMCS_SIZE_SHIFT     32
+ #define VMX_BASIC_64          0x0001000000000000LLU
+ #define VMX_BASIC_MEM_TYPE_SHIFT      50
+ #define VMX_BASIC_MEM_TYPE_MASK       0x003c000000000000LLU
+ #define VMX_BASIC_MEM_TYPE_WB 6LLU
+ #define VMX_BASIC_INOUT               0x0040000000000000LLU
   
   /* AMD-V MSRs */
   
diff --combined arch/x86/kvm/Kconfig

index 65cf823,99c3f05..988724b
--- 1/arch/x86/kvm/Kconfig
--- 2/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@@ -31,6 -31,7 +31,7 @@@ config KV
         select KVM_ASYNC_PF
         select USER_RETURN_NOTIFIER
         select KVM_MMIO
+       select TASK_DELAY_ACCT
         ---help---
           Support hosting fully virtualized guest machines using hardware
           virtualization extensions.  You will need a fairly recent
@@@ -76,5 -77,6 +77,5 @@@ config KVM_MMU_AUDI
   # the virtualization menu.
   source drivers/vhost/Kconfig
   source drivers/lguest/Kconfig
- -source drivers/virtio/Kconfig
   
   endif # VIRTUALIZATION
diff --combined kernel/sched.c

index 9aaf567,b35ac50..751a7cc
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -75,6 -75,9 +75,9 @@@
   #include <asm/tlb.h>
   #include <asm/irq_regs.h>
   #include <asm/mutex.h>
+ #ifdef CONFIG_PARAVIRT
+ #include <asm/paravirt.h>
+ #endif
   
   #include "sched_cpupri.h"
   #include "workqueue_sched.h"
@@@ -124,7 -127,7 +127,7 @@@
   
   static inline int rt_policy(int policy)
   {
- -      if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+ +      if (policy == SCHED_FIFO || policy == SCHED_RR)
                 return 1;
         return 0;
   }
@@@ -292,8 -295,8 +295,8 @@@ static DEFINE_SPINLOCK(task_group_lock)
    * (The default weight is 1024 - so there's no practical
    *  limitation from this.)
    */
- -#define MIN_SHARES    2
- -#define MAX_SHARES    (1UL << (18 + SCHED_LOAD_RESOLUTION))
+ +#define MIN_SHARES    (1UL <<  1)
+ +#define MAX_SHARES    (1UL << 18)
   
   static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
   #endif
@@@ -422,7 -425,6 +425,7 @@@ struct rt_rq 
    */
   struct root_domain {
         atomic_t refcount;
+ +      atomic_t rto_count;
         struct rcu_head rcu;
         cpumask_var_t span;
         cpumask_var_t online;
@@@ -432,6 -434,7 +435,6 @@@
          * one runnable RT task.
          */
         cpumask_var_t rto_mask;
- -      atomic_t rto_count;
         struct cpupri cpupri;
   };
   
@@@ -528,6 -531,12 +531,12 @@@ struct rq 
   #ifdef CONFIG_IRQ_TIME_ACCOUNTING
         u64 prev_irq_time;
   #endif
+ #ifdef CONFIG_PARAVIRT
+       u64 prev_steal_time;
+ #endif
+ #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       u64 prev_steal_time_rq;
+ #endif
   
         /* calc_load related fields */
         unsigned long calc_load_update;
@@@ -1568,6 -1577,38 +1577,6 @@@ static unsigned long cpu_avg_load_per_t
         return rq->avg_load_per_task;
   }
   
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -
- -/*
- - * Compute the cpu's hierarchical load factor for each task group.
- - * This needs to be done in a top-down fashion because the load of a child
- - * group is a fraction of its parents load.
- - */
- -static int tg_load_down(struct task_group *tg, void *data)
- -{
- -      unsigned long load;
- -      long cpu = (long)data;
- -
- -      if (!tg->parent) {
- -              load = cpu_rq(cpu)->load.weight;
- -      } else {
- -              load = tg->parent->cfs_rq[cpu]->h_load;
- -              load *= tg->se[cpu]->load.weight;
- -              load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
- -      }
- -
- -      tg->cfs_rq[cpu]->h_load = load;
- -
- -      return 0;
- -}
- -
- -static void update_h_load(long cpu)
- -{
- -      walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
- -}
- -
- -#endif
- -
   #ifdef CONFIG_PREEMPT
   
   static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@@ -1921,10 -1962,28 +1930,28 @@@ void account_system_vtime(struct task_s
   }
   EXPORT_SYMBOL_GPL(account_system_vtime);
   
- static void update_rq_clock_task(struct rq *rq, s64 delta)
+ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+ 
+ #ifdef CONFIG_PARAVIRT
+ static inline u64 steal_ticks(u64 steal)
   {
-       s64 irq_delta;
+       if (unlikely(steal > NSEC_PER_SEC))
+               return div_u64(steal, TICK_NSEC);
   
+       return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+ }
+ #endif
+ 
+ static void update_rq_clock_task(struct rq *rq, s64 delta)
+ {
+ /*
+  * In theory, the compiler should just see 0 here, and optimize out the call
+  * to sched_rt_avg_update. But I don't trust it...
+  */
+ #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       s64 steal = 0, irq_delta = 0;
+ #endif
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
         irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
   
         /*
@@@ -1947,12 -2006,35 +1974,35 @@@
   
         rq->prev_irq_time += irq_delta;
         delta -= irq_delta;
+ #endif
+ #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       if (static_branch((&paravirt_steal_rq_enabled))) {
+               u64 st;
+ 
+               steal = paravirt_steal_clock(cpu_of(rq));
+               steal -= rq->prev_steal_time_rq;
+ 
+               if (unlikely(steal > delta))
+                       steal = delta;
+ 
+               st = steal_ticks(steal);
+               steal = st * TICK_NSEC;
+ 
+               rq->prev_steal_time_rq += steal;
+ 
+               delta -= steal;
+       }
+ #endif
+ 
         rq->clock_task += delta;
   
-       if (irq_delta && sched_feat(NONIRQ_POWER))
-               sched_rt_avg_update(rq, irq_delta);
+ #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+               sched_rt_avg_update(rq, irq_delta + steal);
+ #endif
   }
   
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
   static int irqtime_account_hi_update(void)
   {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@@ -1987,12 -2069,7 +2037,7 @@@ static int irqtime_account_si_update(vo
   
   #define sched_clock_irqtime   (0)
   
- static void update_rq_clock_task(struct rq *rq, s64 delta)
- {
-       rq->clock_task += delta;
- }
- 
- #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+ #endif
   
   #include "sched_idletask.c"
   #include "sched_fair.c"
@@@ -2188,7 -2265,7 +2233,7 @@@ void set_task_cpu(struct task_struct *p
   
         if (task_cpu(p) != new_cpu) {
                 p->se.nr_migrations++;
- -              perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+ +              perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
         }
   
         __set_task_cpu(p, new_cpu);
@@@ -2465,7 -2542,7 +2510,7 @@@ ttwu_do_wakeup(struct rq *rq, struct ta
         if (p->sched_class->task_woken)
                 p->sched_class->task_woken(rq, p);
   
- -      if (unlikely(rq->idle_stamp)) {
+ +      if (rq->idle_stamp) {
                 u64 delta = rq->clock - rq->idle_stamp;
                 u64 max = 2*sysctl_sched_migration_cost;
   
@@@ -2512,9 -2589,13 +2557,9 @@@ static int ttwu_remote(struct task_stru
   }
   
   #ifdef CONFIG_SMP
- -static void sched_ttwu_pending(void)
+ +static void sched_ttwu_do_pending(struct task_struct *list)
   {
         struct rq *rq = this_rq();
- -      struct task_struct *list = xchg(&rq->wake_list, NULL);
- -
- -      if (!list)
- -              return;
   
         raw_spin_lock(&rq->lock);
   
@@@ -2527,45 -2608,9 +2572,45 @@@
         raw_spin_unlock(&rq->lock);
   }
   
+ +#ifdef CONFIG_HOTPLUG_CPU
+ +
+ +static void sched_ttwu_pending(void)
+ +{
+ +      struct rq *rq = this_rq();
+ +      struct task_struct *list = xchg(&rq->wake_list, NULL);
+ +
+ +      if (!list)
+ +              return;
+ +
+ +      sched_ttwu_do_pending(list);
+ +}
+ +
+ +#endif /* CONFIG_HOTPLUG_CPU */
+ +
   void scheduler_ipi(void)
   {
- -      sched_ttwu_pending();
+ +      struct rq *rq = this_rq();
+ +      struct task_struct *list = xchg(&rq->wake_list, NULL);
+ +
+ +      if (!list)
+ +              return;
+ +
+ +      /*
+ +       * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+ +       * traditionally all their work was done from the interrupt return
+ +       * path. Now that we actually do some work, we need to make sure
+ +       * we do call them.
+ +       *
+ +       * Some archs already do call them, luckily irq_enter/exit nest
+ +       * properly.
+ +       *
+ +       * Arguably we should visit all archs and update all handlers,
+ +       * however a fair share of IPIs are still resched only so this would
+ +       * somewhat pessimize the simple resched case.
+ +       */
+ +      irq_enter();
+ +      sched_ttwu_do_pending(list);
+ +      irq_exit();
   }
   
   static void ttwu_queue_remote(struct task_struct *p, int cpu)
@@@ -2854,7 -2899,7 +2899,7 @@@ void sched_fork(struct task_struct *p
   #if defined(CONFIG_SMP)
         p->on_cpu = 0;
   #endif
- -#ifdef CONFIG_PREEMPT
+ +#ifdef CONFIG_PREEMPT_COUNT
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
   #endif
@@@ -3845,6 -3890,25 +3890,25 @@@ void account_idle_time(cputime_t cputim
                 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
   }
   
+ static __always_inline bool steal_account_process_tick(void)
+ {
+ #ifdef CONFIG_PARAVIRT
+       if (static_branch(&paravirt_steal_enabled)) {
+               u64 steal, st = 0;
+ 
+               steal = paravirt_steal_clock(smp_processor_id());
+               steal -= this_rq()->prev_steal_time;
+ 
+               st = steal_ticks(steal);
+               this_rq()->prev_steal_time += st * TICK_NSEC;
+ 
+               account_steal_time(st);
+               return st;
+       }
+ #endif
+       return false;
+ }
+ 
   #ifndef CONFIG_VIRT_CPU_ACCOUNTING
   
   #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@@ -3876,6 -3940,9 +3940,9 @@@ static void irqtime_account_process_tic
         cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
   
+       if (steal_account_process_tick())
+               return;
+ 
         if (irqtime_account_hi_update()) {
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
         } else if (irqtime_account_si_update()) {
@@@ -3929,6 -3996,9 +3996,9 @@@ void account_process_tick(struct task_s
                 return;
         }
   
+       if (steal_account_process_tick())
+               return;
+ 
         if (user_tick)
                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
         else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@@ -4306,8 -4376,11 +4376,8 @@@ EXPORT_SYMBOL(schedule)
   
   static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
   {
- -      bool ret = false;
- -
- -      rcu_read_lock();
         if (lock->owner != owner)
- -              goto fail;
+ +              return false;
   
         /*
          * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@@ -4317,7 -4390,11 +4387,7 @@@
          */
         barrier();
   
- -      ret = owner->on_cpu;
- -fail:
- -      rcu_read_unlock();
- -
- -      return ret;
+ +      return owner->on_cpu;
   }
   
   /*
@@@ -4329,21 -4406,21 +4399,21 @@@ int mutex_spin_on_owner(struct mutex *l
         if (!sched_feat(OWNER_SPIN))
                 return 0;
   
+ +      rcu_read_lock();
         while (owner_running(lock, owner)) {
                 if (need_resched())
- -                      return 0;
+ +                      break;
   
                 arch_mutex_cpu_relax();
         }
+ +      rcu_read_unlock();
   
         /*
- -       * If the owner changed to another task there is likely
- -       * heavy contention, stop spinning.
+ +       * We break out of the loop above on need_resched() and when the
+ +       * owner changed, which is a sign of heavy contention. Return
+ +       * success only when lock->owner is NULL.
          */
- -      if (lock->owner)
- -              return 0;
- -
- -      return 1;
+ +      return lock->owner == NULL;
   }
   #endif
   
@@@ -6550,7 -6627,7 +6620,7 @@@ static int sched_domain_debug_one(struc
                         break;
                 }
   
- -              if (!group->cpu_power) {
+ +              if (!group->sgp->power) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: domain->cpu_power not "
                                         "set\n");
@@@ -6574,9 -6651,9 +6644,9 @@@
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
   
                 printk(KERN_CONT " %s", str);
- -              if (group->cpu_power != SCHED_POWER_SCALE) {
+ +              if (group->sgp->power != SCHED_POWER_SCALE) {
                         printk(KERN_CONT " (cpu_power = %d)",
- -                              group->cpu_power);
+ +                              group->sgp->power);
                 }
   
                 group = group->next;
@@@ -6767,39 -6844,11 +6837,39 @@@ static struct root_domain *alloc_rootdo
         return rd;
   }
   
+ +static void free_sched_groups(struct sched_group *sg, int free_sgp)
+ +{
+ +      struct sched_group *tmp, *first;
+ +
+ +      if (!sg)
+ +              return;
+ +
+ +      first = sg;
+ +      do {
+ +              tmp = sg->next;
+ +
+ +              if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+ +                      kfree(sg->sgp);
+ +
+ +              kfree(sg);
+ +              sg = tmp;
+ +      } while (sg != first);
+ +}
+ +
   static void free_sched_domain(struct rcu_head *rcu)
   {
         struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
- -      if (atomic_dec_and_test(&sd->groups->ref))
+ +
+ +      /*
+ +       * If it's an overlapping domain it has private groups, iterate and
+ +       * nuke them all.
+ +       */
+ +      if (sd->flags & SD_OVERLAP) {
+ +              free_sched_groups(sd->groups, 1);
+ +      } else if (atomic_dec_and_test(&sd->groups->ref)) {
+ +              kfree(sd->groups->sgp);
                 kfree(sd->groups);
+ +      }
         kfree(sd);
   }
   
@@@ -6966,7 -7015,6 +7036,7 @@@ int sched_smt_power_savings = 0, sched_
   struct sd_data {
         struct sched_domain **__percpu sd;
         struct sched_group **__percpu sg;
+ +      struct sched_group_power **__percpu sgp;
   };
   
   struct s_data {
@@@ -6986,73 -7034,15 +7056,73 @@@ struct sched_domain_topology_level
   typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
   typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
   
+ +#define SDTL_OVERLAP  0x01
+ +
   struct sched_domain_topology_level {
         sched_domain_init_f init;
         sched_domain_mask_f mask;
+ +      int                 flags;
         struct sd_data      data;
   };
   
- -/*
- - * Assumes the sched_domain tree is fully constructed
- - */
+ +static int
+ +build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+ +{
+ +      struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ +      const struct cpumask *span = sched_domain_span(sd);
+ +      struct cpumask *covered = sched_domains_tmpmask;
+ +      struct sd_data *sdd = sd->private;
+ +      struct sched_domain *child;
+ +      int i;
+ +
+ +      cpumask_clear(covered);
+ +
+ +      for_each_cpu(i, span) {
+ +              struct cpumask *sg_span;
+ +
+ +              if (cpumask_test_cpu(i, covered))
+ +                      continue;
+ +
+ +              sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ +                              GFP_KERNEL, cpu_to_node(i));
+ +
+ +              if (!sg)
+ +                      goto fail;
+ +
+ +              sg_span = sched_group_cpus(sg);
+ +
+ +              child = *per_cpu_ptr(sdd->sd, i);
+ +              if (child->child) {
+ +                      child = child->child;
+ +                      cpumask_copy(sg_span, sched_domain_span(child));
+ +              } else
+ +                      cpumask_set_cpu(i, sg_span);
+ +
+ +              cpumask_or(covered, covered, sg_span);
+ +
+ +              sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ +              atomic_inc(&sg->sgp->ref);
+ +
+ +              if (cpumask_test_cpu(cpu, sg_span))
+ +                      groups = sg;
+ +
+ +              if (!first)
+ +                      first = sg;
+ +              if (last)
+ +                      last->next = sg;
+ +              last = sg;
+ +              last->next = first;
+ +      }
+ +      sd->groups = groups;
+ +
+ +      return 0;
+ +
+ +fail:
+ +      free_sched_groups(first, 0);
+ +
+ +      return -ENOMEM;
+ +}
+ +
   static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
   {
         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@@ -7061,24 -7051,24 +7131,24 @@@
         if (child)
                 cpu = cpumask_first(sched_domain_span(child));
   
- -      if (sg)
+ +      if (sg) {
                 *sg = *per_cpu_ptr(sdd->sg, cpu);
+ +              (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+ +              atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ +      }
   
         return cpu;
   }
   
   /*
- - * build_sched_groups takes the cpumask we wish to span, and a pointer
- - * to a function which identifies what group(along with sched group) a CPU
- - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- - * (due to the fact that we keep track of groups covered with a struct cpumask).
- - *
    * build_sched_groups will build a circular linked list of the groups
    * covered by the given span, and will set each group's ->cpumask correctly,
    * and ->cpu_power to 0.
+ + *
+ + * Assumes the sched_domain tree is fully constructed
    */
- -static void
- -build_sched_groups(struct sched_domain *sd)
+ +static int
+ +build_sched_groups(struct sched_domain *sd, int cpu)
   {
         struct sched_group *first = NULL, *last = NULL;
         struct sd_data *sdd = sd->private;
@@@ -7086,12 -7076,6 +7156,12 @@@
         struct cpumask *covered;
         int i;
   
+ +      get_group(cpu, sdd, &sd->groups);
+ +      atomic_inc(&sd->groups->ref);
+ +
+ +      if (cpu != cpumask_first(sched_domain_span(sd)))
+ +              return 0;
+ +
         lockdep_assert_held(&sched_domains_mutex);
         covered = sched_domains_tmpmask;
   
@@@ -7106,7 -7090,7 +7176,7 @@@
                         continue;
   
                 cpumask_clear(sched_group_cpus(sg));
- -              sg->cpu_power = 0;
+ +              sg->sgp->power = 0;
   
                 for_each_cpu(j, span) {
                         if (get_group(j, sdd, NULL) != group)
@@@ -7123,8 -7107,6 +7193,8 @@@
                 last = sg;
         }
         last->next = first;
+ +
+ +      return 0;
   }
   
   /*
@@@ -7139,17 -7121,12 +7209,17 @@@
    */
   static void init_sched_groups_power(int cpu, struct sched_domain *sd)
   {
- -      WARN_ON(!sd || !sd->groups);
+ +      struct sched_group *sg = sd->groups;
   
- -      if (cpu != group_first_cpu(sd->groups))
- -              return;
+ +      WARN_ON(!sd || !sg);
   
- -      sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+ +      do {
+ +              sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ +              sg = sg->next;
+ +      } while (sg != sd->groups);
+ +
+ +      if (cpu != group_first_cpu(sg))
+ +              return;
   
         update_group_power(sd, cpu);
   }
@@@ -7270,15 -7247,15 +7340,15 @@@ static enum s_alloc __visit_domain_allo
   static void claim_allocations(int cpu, struct sched_domain *sd)
   {
         struct sd_data *sdd = sd->private;
- -      struct sched_group *sg = sd->groups;
   
         WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
         *per_cpu_ptr(sdd->sd, cpu) = NULL;
   
- -      if (cpu == cpumask_first(sched_group_cpus(sg))) {
- -              WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ +      if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
                 *per_cpu_ptr(sdd->sg, cpu) = NULL;
- -      }
+ +
+ +      if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+ +              *per_cpu_ptr(sdd->sgp, cpu) = NULL;
   }
   
   #ifdef CONFIG_SCHED_SMT
@@@ -7303,7 -7280,7 +7373,7 @@@ static struct sched_domain_topology_lev
   #endif
         { sd_init_CPU, cpu_cpu_mask, },
   #ifdef CONFIG_NUMA
- -      { sd_init_NODE, cpu_node_mask, },
+ +      { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
         { sd_init_ALLNODES, cpu_allnodes_mask, },
   #endif
         { NULL, },
@@@ -7327,14 -7304,9 +7397,14 @@@ static int __sdt_alloc(const struct cpu
                 if (!sdd->sg)
                         return -ENOMEM;
   
+ +              sdd->sgp = alloc_percpu(struct sched_group_power *);
+ +              if (!sdd->sgp)
+ +                      return -ENOMEM;
+ +
                 for_each_cpu(j, cpu_map) {
                         struct sched_domain *sd;
                         struct sched_group *sg;
+ +                      struct sched_group_power *sgp;
   
                         sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
                                         GFP_KERNEL, cpu_to_node(j));
@@@ -7349,13 -7321,6 +7419,13 @@@
                                 return -ENOMEM;
   
                         *per_cpu_ptr(sdd->sg, j) = sg;
+ +
+ +                      sgp = kzalloc_node(sizeof(struct sched_group_power),
+ +                                      GFP_KERNEL, cpu_to_node(j));
+ +                      if (!sgp)
+ +                              return -ENOMEM;
+ +
+ +                      *per_cpu_ptr(sdd->sgp, j) = sgp;
                 }
         }
   
@@@ -7371,15 -7336,11 +7441,15 @@@ static void __sdt_free(const struct cpu
                 struct sd_data *sdd = &tl->data;
   
                 for_each_cpu(j, cpu_map) {
- -                      kfree(*per_cpu_ptr(sdd->sd, j));
+ +                      struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+ +                      if (sd && (sd->flags & SD_OVERLAP))
+ +                              free_sched_groups(sd->groups, 0);
                         kfree(*per_cpu_ptr(sdd->sg, j));
+ +                      kfree(*per_cpu_ptr(sdd->sgp, j));
                 }
                 free_percpu(sdd->sd);
                 free_percpu(sdd->sg);
+ +              free_percpu(sdd->sgp);
         }
   }
   
@@@ -7425,13 -7386,8 +7495,13 @@@ static int build_sched_domains(const st
                 struct sched_domain_topology_level *tl;
   
                 sd = NULL;
- -              for (tl = sched_domain_topology; tl->init; tl++)
+ +              for (tl = sched_domain_topology; tl->init; tl++) {
                         sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ +                      if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ +                              sd->flags |= SD_OVERLAP;
+ +                      if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ +                              break;
+ +              }
   
                 while (sd->child)
                         sd = sd->child;
@@@ -7443,13 -7399,13 +7513,13 @@@
         for_each_cpu(i, cpu_map) {
                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
                         sd->span_weight = cpumask_weight(sched_domain_span(sd));
- -                      get_group(i, sd->private, &sd->groups);
- -                      atomic_inc(&sd->groups->ref);
- -
- -                      if (i != cpumask_first(sched_domain_span(sd)))
- -                              continue;
- -
- -                      build_sched_groups(sd);
+ +                      if (sd->flags & SD_OVERLAP) {
+ +                              if (build_overlap_sched_groups(sd, i))
+ +                                      goto error;
+ +                      } else {
+ +                              if (build_sched_groups(sd, i))
+ +                                      goto error;
+ +                      }
                 }
         }
   
@@@ -7859,14 -7815,18 +7929,14 @@@ int in_sched_functions(unsigned long ad
                 && addr < (unsigned long)__sched_text_end);
   }
   
- -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+ +static void init_cfs_rq(struct cfs_rq *cfs_rq)
   {
         cfs_rq->tasks_timeline = RB_ROOT;
         INIT_LIST_HEAD(&cfs_rq->tasks);
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -      cfs_rq->rq = rq;
- -      /* allow initial update_cfs_load() to truncate */
- -#ifdef CONFIG_SMP
- -      cfs_rq->load_stamp = 1;
- -#endif
- -#endif
         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+ +#ifndef CONFIG_64BIT
+ +      cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+ +#endif
   }
   
   static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@@ -7882,18 -7842,27 +7952,18 @@@
         /* delimiter for bitsearch: */
         __set_bit(MAX_RT_PRIO, array->bitmap);
   
- -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ +#if defined CONFIG_SMP
         rt_rq->highest_prio.curr = MAX_RT_PRIO;
- -#ifdef CONFIG_SMP
         rt_rq->highest_prio.next = MAX_RT_PRIO;
- -#endif
- -#endif
- -#ifdef CONFIG_SMP
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
- -      plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
+ +      plist_head_init(&rt_rq->pushable_tasks);
   #endif
   
         rt_rq->rt_time = 0;
         rt_rq->rt_throttled = 0;
         rt_rq->rt_runtime = 0;
         raw_spin_lock_init(&rt_rq->rt_runtime_lock);
- -
- -#ifdef CONFIG_RT_GROUP_SCHED
- -      rt_rq->rt_nr_boosted = 0;
- -      rt_rq->rq = rq;
- -#endif
   }
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -7902,17 -7871,11 +7972,17 @@@ static void init_tg_cfs_entry(struct ta
                                 struct sched_entity *parent)
   {
         struct rq *rq = cpu_rq(cpu);
- -      tg->cfs_rq[cpu] = cfs_rq;
- -      init_cfs_rq(cfs_rq, rq);
+ +
         cfs_rq->tg = tg;
+ +      cfs_rq->rq = rq;
+ +#ifdef CONFIG_SMP
+ +      /* allow initial update_cfs_load() to truncate */
+ +      cfs_rq->load_stamp = 1;
+ +#endif
   
+ +      tg->cfs_rq[cpu] = cfs_rq;
         tg->se[cpu] = se;
+ +
         /* se could be NULL for root_task_group */
         if (!se)
                 return;
@@@ -7935,14 -7898,12 +8005,14 @@@ static void init_tg_rt_entry(struct tas
   {
         struct rq *rq = cpu_rq(cpu);
   
- -      tg->rt_rq[cpu] = rt_rq;
- -      init_rt_rq(rt_rq, rq);
+ +      rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ +      rt_rq->rt_nr_boosted = 0;
+ +      rt_rq->rq = rq;
         rt_rq->tg = tg;
- -      rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
   
+ +      tg->rt_rq[cpu] = rt_rq;
         tg->rt_se[cpu] = rt_se;
+ +
         if (!rt_se)
                 return;
   
@@@ -8024,7 -7985,7 +8094,7 @@@ void __init sched_init(void
                 rq->nr_running = 0;
                 rq->calc_load_active = 0;
                 rq->calc_load_update = jiffies + LOAD_FREQ;
- -              init_cfs_rq(&rq->cfs, rq);
+ +              init_cfs_rq(&rq->cfs);
                 init_rt_rq(&rq->rt, rq);
   #ifdef CONFIG_FAIR_GROUP_SCHED
                 root_task_group.shares = root_task_group_load;
@@@ -8095,7 -8056,7 +8165,7 @@@
   #endif
   
   #ifdef CONFIG_RT_MUTEXES
- -      plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
+ +      plist_head_init(&init_task.pi_waiters);
   #endif
   
         /*
@@@ -8138,7 -8099,7 +8208,7 @@@
         scheduler_running = 1;
   }
   
- -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+ +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
   static inline int preempt_count_equals(int preempt_offset)
   {
         int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@@ -8148,6 -8109,7 +8218,6 @@@
   
   void __might_sleep(const char *file, int line, int preempt_offset)
   {
- -#ifdef in_atomic
         static unsigned long prev_jiffy;        /* ratelimiting */
   
         if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@@ -8169,6 -8131,7 +8239,6 @@@
         if (irqs_disabled())
                 print_irqtrace_events(current);
         dump_stack();
- -#endif
   }
   EXPORT_SYMBOL(__might_sleep);
   #endif
@@@ -8327,7 -8290,6 +8397,7 @@@ int alloc_fair_sched_group(struct task_
                 if (!se)
                         goto err_free_rq;
   
+ +              init_cfs_rq(cfs_rq);
                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
         }
   
@@@ -8355,7 -8317,7 +8425,7 @@@ static inline void unregister_fair_sche
         list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
   }
- -#else /* !CONFG_FAIR_GROUP_SCHED */
+ +#else /* !CONFIG_FAIR_GROUP_SCHED */
   static inline void free_fair_sched_group(struct task_group *tg)
   {
   }
@@@ -8376,8 -8338,7 +8446,8 @@@ static void free_rt_sched_group(struct 
   {
         int i;
   
- -      destroy_rt_bandwidth(&tg->rt_bandwidth);
+ +      if (tg->rt_se)
+ +              destroy_rt_bandwidth(&tg->rt_bandwidth);
   
         for_each_possible_cpu(i) {
                 if (tg->rt_rq)
@@@ -8418,8 -8379,6 +8488,8 @@@ int alloc_rt_sched_group(struct task_gr
                 if (!rt_se)
                         goto err_free_rq;
   
+ +              init_rt_rq(rt_rq, cpu_rq(i));
+ +              rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
                 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
         }
   
@@@ -8561,7 -8520,10 +8631,7 @@@ int sched_group_set_shares(struct task_
         if (!tg->se[0])
                 return -EINVAL;
   
- -      if (shares < MIN_SHARES)
- -              shares = MIN_SHARES;
- -      else if (shares > MAX_SHARES)
- -              shares = MAX_SHARES;
+ +      shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
   
         mutex_lock(&shares_mutex);
         if (tg->shares == shares)
diff --combined kernel/sched_features.h

index 1e7066d,ca3b025..2e74677
--- 1/kernel/sched_features.h
--- 2/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@@ -61,14 -61,12 +61,14 @@@ SCHED_FEAT(LB_BIAS, 1
   SCHED_FEAT(OWNER_SPIN, 1)
   
   /*
-  * Decrement CPU power based on irq activity
+  * Decrement CPU power based on time not spent running tasks
    */
- SCHED_FEAT(NONIRQ_POWER, 1)
+ SCHED_FEAT(NONTASK_POWER, 1)
   
   /*
    * Queue remote wakeups on the target CPU and process them
    * using the scheduler IPI. Reduces rq->lock contention/bounces.
    */
   SCHED_FEAT(TTWU_QUEUE, 1)
+ +
+ +SCHED_FEAT(FORCE_SD_OVERLAP, 0)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 24 Jul 2011 16:07:03 +0000 (09:07 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 24 Jul 2011 16:07:03 +0000 (09:07 -0700)
		1	2
Documentation/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kvm/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/msr-index.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_features.h	patch \|	diff1 \|	diff2 \|	blob \| history