Merge branch 'stable-3.2' into pandora-3.2

author Grazvydas Ignotas <notasas@gmail.com>

Sat, 26 Oct 2013 22:47:09 +0000 (01:47 +0300)

committer Grazvydas Ignotas <notasas@gmail.com>

Sat, 26 Oct 2013 22:47:09 +0000 (01:47 +0300)
author Grazvydas Ignotas <notasas@gmail.com>
Sat, 26 Oct 2013 22:47:09 +0000 (01:47 +0300)
committer Grazvydas Ignotas <notasas@gmail.com>
Sat, 26 Oct 2013 22:47:09 +0000 (01:47 +0300)
diff --combined Documentation/kernel-parameters.txt

index 74f6fdd,2ba8272..7b17bb3
--- 1/Documentation/kernel-parameters.txt
--- 2/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -503,11 -503,6 +503,11 @@@ bytes respectively. Such letter suffixe
                         Also note the kernel might malfunction if you disable
                         some critical bits.
   
+ +      cma=nn[MG]      [ARM,KNL]
+ +                      Sets the size of kernel global memory area for contiguous
+ +                      memory allocations. For more information, see
+ +                      include/linux/dma-contiguous.h
+ +
         cmo_free_hint=  [PPC] Format: { yes | no }
                         Specify whether pages are marked as being inactive
                         when they are freed.  This is used in CMO environments
@@@ -515,10 -510,6 +515,10 @@@
                         a hypervisor.
                         Default: yes
   
+ +      coherent_pool=nn[KMG]   [ARM,KNL]
+ +                      Sets the size of memory pool for coherent, atomic dma
+ +                      allocations if Contiguous Memory Allocator (CMA) is used.
+ +
         code_bytes      [X86] How many bytes of object code to print
                         in an oops report.
                         Range: 0 - 8192
@@@ -634,25 -625,6 +634,25 @@@
         no_debug_objects
                         [KNL] Disable object debugging
   
+ +      debug_guardpage_minorder=
+ +                      [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
+ +                      parameter allows control of the order of pages that will
+ +                      be intentionally kept free (and hence protected) by the
+ +                      buddy allocator. Bigger values increase the probability
+ +                      of catching random memory corruption, but reduce the
+ +                      amount of memory for normal system use. The maximum
+ +                      possible value is MAX_ORDER/2.  Setting this parameter
+ +                      to 1 or 2 should be enough to identify most random
+ +                      memory corruption problems caused by bugs in kernel or
+ +                      driver code when a CPU writes to (or reads from) a
+ +                      random memory location. Note that there exists a class
+ +                      of memory corruption problems caused by buggy H/W or
+ +                      F/W or by drivers badly programming DMA (basically when
+ +                      memory is written at bus level and the CPU MMU is
+ +                      bypassed) which are not detectable by
+ +                      CONFIG_DEBUG_PAGEALLOC, hence this option will not help
+ +                      tracking down these problems.
+ +
         debugpat        [X86] Enable PAT debugging
   
         decnet.addr=    [HW,NET]
@@@ -762,6 -734,12 +762,12 @@@
         edd=            [EDD]
                         Format: {"off" | "on" | "skip[mbr]"}
   
+       efi_no_storage_paranoia [EFI; X86]
+                       Using this parameter you can use more than 50% of
+                       your EFI variable storage. Use this parameter only if
+                       you are really sure that your UEFI does sane gc and
+                       fulfills the spec; otherwise your board may brick.
+ 
         eisa_irq_edge=  [PARISC,HW]
                         See header of drivers/parisc/eisa.c.
   
diff --combined arch/arm/Kconfig

index 733f9b3,790ea68..7e8ab4e
--- 1/arch/arm/Kconfig
--- 2/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@@ -1,11 -1,8 +1,10 @@@
   config ARM
         bool
         default y
-       select HAVE_AOUT
         select HAVE_DMA_API_DEBUG
         select HAVE_IDE if PCI || ISA || PCMCIA
+ +      select HAVE_DMA_ATTRS
+ +      select HAVE_DMA_CONTIGUOUS if MMU
         select HAVE_MEMBLOCK
         select RTC_LIB
         select SYS_SUPPORTS_APM_EMULATION
@@@ -43,14 -40,6 +42,14 @@@
   config ARM_HAS_SG_CHAIN
         bool
   
+ +config NEED_SG_DMA_LENGTH
+ +      bool
+ +
+ +config ARM_DMA_USE_IOMMU
+ +      select NEED_SG_DMA_LENGTH
+ +      select ARM_HAS_SG_CHAIN
+ +      bool
+ +
   config HAVE_PWM
         bool
   
@@@ -188,9 -177,6 +187,9 @@@ config ZONE_DM
   config NEED_DMA_MAP_STATE
          def_bool y
   
+ +config ARCH_HAS_DMA_SET_COHERENT_MASK
+ +      bool
+ +
   config GENERIC_ISA_DMA
         bool
   
@@@ -531,7 -517,6 +530,7 @@@ config ARCH_IXP200
   config ARCH_IXP4XX
         bool "IXP4xx-based"
         depends on MMU
+ +      select ARCH_HAS_DMA_SET_COHERENT_MASK
         select CLKSRC_MMIO
         select CPU_XSCALE
         select ARCH_REQUIRE_GPIOLIB
@@@ -1718,14 -1703,6 +1717,14 @@@ config HW_PERF_EVENT
           Enable hardware performance counter support for perf events. If
           disabled, perf events will use software events only.
   
+ +config SYS_SUPPORTS_HUGETLBFS
+ +       def_bool y
+ +       depends on ARM_LPAE || (!CPU_USE_DOMAINS && !MEMORY_FAILURE)
+ +
+ +config HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ +       def_bool y
+ +       depends on SYS_SUPPORTS_HUGETLBFS
+ +
   source "mm/Kconfig"
   
   config FORCE_MAX_ZONEORDER
@@@ -1798,7 -1775,7 +1797,7 @@@ config LEDS_CP
           will overrule the CPU usage LED.
   
   config ALIGNMENT_TRAP
- -      bool
+ +      bool "Enable alignment trap"
         depends on CPU_CP15_MMU
         default y if !ARCH_EBSA110
         select HAVE_PROC_CPU if PROC_FS
@@@ -1860,11 -1837,6 +1859,11 @@@ config DEPRECATED_PARAM_STRUC
           This was deprecated in 2001 and announced to live on for 5 years.
           Some old boot loaders still use this way.
   
+ +config CPU_V7_SYSFS
+ +      bool
+ +      depends on CPU_V7 && SYSFS
+ +      default y
+ +
   endmenu
   
   menu "Boot options"
@@@ -2009,7 -1981,7 +2008,7 @@@ endchoic
   
   config XIP_KERNEL
         bool "Kernel Execute-In-Place from ROM"
- -      depends on !ZBOOT_ROM
+ +      depends on !ZBOOT_ROM && !ARM_LPAE
         help
           Execute-In-Place allows the kernel to run from non-volatile storage
           directly addressable by the CPU, such as NOR flash. This saves RAM
@@@ -2039,7 -2011,7 +2038,7 @@@ config XIP_PHYS_ADD
   
   config KEXEC
         bool "Kexec system call (EXPERIMENTAL)"
- -      depends on EXPERIMENTAL
+ +      depends on EXPERIMENTAL && (!SMP || HOTPLUG_CPU)
         help
           kexec is a system call that implements the ability to shutdown your
           current kernel, and to start another kernel.  It is like a reboot
diff --combined arch/arm/boot/compressed/head.S

index a35bbd8,8c57359..db712ad
--- 1/arch/arm/boot/compressed/head.S
--- 2/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@@ -10,6 -10,7 +10,7 @@@
    */
   #include <linux/linkage.h>
   
+       .arch   armv7-a
   /*
    * Debugging stuff
    *
@@@ -660,7 -661,6 +661,7 @@@ __armv7_mmu_cache_on
                 mcrne   p15, 0, r3, c2, c0, 0   @ load page table pointer
                 mcrne   p15, 0, r1, c3, c0, 0   @ load domain access control
   #endif
+ +              mcr     p15, 0, r0, c7, c5, 4   @ ISB
                 mcr     p15, 0, r0, c1, c0, 0   @ load control register
                 mrc     p15, 0, r0, c1, c0, 0   @ and read it back
                 mov     r0, #0
diff --combined arch/arm/mm/flush.c

index 711c842,fe61cab..ac5416a
--- 1/arch/arm/mm/flush.c
--- 2/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@@ -18,7 -18,6 +18,7 @@@
   #include <asm/smp_plat.h>
   #include <asm/system.h>
   #include <asm/tlbflush.h>
+ +#include <linux/hugetlb.h>
   
   #include "mm.h"
   
@@@ -174,22 -173,17 +174,22 @@@ void __flush_dcache_page(struct address
          * coherent with the kernels mapping.
          */
         if (!PageHighMem(page)) {
- -              __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
+ +              size_t page_size = PAGE_SIZE << compound_order(page);
+ +              __cpuc_flush_dcache_area(page_address(page), page_size);
         } else {
- -              void *addr = kmap_high_get(page);
- -              if (addr) {
- -                      __cpuc_flush_dcache_area(addr, PAGE_SIZE);
- -                      kunmap_high(page);
- -              } else if (cache_is_vipt()) {
- -                      /* unmapped pages might still be cached */
- -                      addr = kmap_atomic(page);
- -                      __cpuc_flush_dcache_area(addr, PAGE_SIZE);
- -                      kunmap_atomic(addr);
+ +              unsigned long i;
+ +              for(i = 0; i < (1 << compound_order(page)); i++) {
+ +                      struct page *cpage = page + i;
+ +                      void *addr = kmap_high_get(cpage);
+ +                      if (addr) {
+ +                              __cpuc_flush_dcache_area(addr, PAGE_SIZE);
+ +                              kunmap_high(cpage);
+ +                      } else if (cache_is_vipt()) {
+ +                              /* unmapped pages might still be cached */
+ +                              addr = kmap_atomic(cpage);
+ +                              __cpuc_flush_dcache_area(addr, PAGE_SIZE);
+ +                              kunmap_atomic(addr);
+ +                      }
                 }
         }
   
@@@ -296,7 -290,7 +296,7 @@@ void flush_dcache_page(struct page *pag
         mapping = page_mapping(page);
   
         if (!cache_ops_need_broadcast() &&
- -          mapping && !mapping_mapped(mapping))
+ +          mapping && !page_mapped(page))
                 clear_bit(PG_dcache_clean, &page->flags);
         else {
                 __flush_dcache_page(mapping, page);
@@@ -308,6 -302,39 +308,39 @@@
         }
   }
   EXPORT_SYMBOL(flush_dcache_page);
+ 
+ /*
+  * Ensure cache coherency for the kernel mapping of this page. We can
+  * assume that the page is pinned via kmap.
+  *
+  * If the page only exists in the page cache and there are no user
+  * space mappings, this is a no-op since the page was already marked
+  * dirty at creation.  Otherwise, we need to flush the dirty kernel
+  * cache lines directly.
+  */
+ void flush_kernel_dcache_page(struct page *page)
+ {
+       if (cache_is_vivt() || cache_is_vipt_aliasing()) {
+               struct address_space *mapping;
+ 
+               mapping = page_mapping(page);
+ 
+               if (!mapping || mapping_mapped(mapping)) {
+                       void *addr;
+ 
+                       addr = page_address(page);
+                       /*
+                        * kmap_atomic() doesn't set the page virtual
+                        * address for highmem pages, and
+                        * kunmap_atomic() takes care of cache
+                        * flushing already.
+                        */
+                       if (!IS_ENABLED(CONFIG_HIGHMEM) || addr)
+                               __cpuc_flush_dcache_area(addr, PAGE_SIZE);
+               }
+       }
+ }
+ EXPORT_SYMBOL(flush_kernel_dcache_page);
   
   /*
    * Flush an anonymous page so that users of get_user_pages()
diff --combined arch/arm/mm/init.c

index 8848136,cc3f35d..6386e70
--- 1/arch/arm/mm/init.c
--- 2/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@@ -20,7 -20,7 +20,7 @@@
   #include <linux/highmem.h>
   #include <linux/gfp.h>
   #include <linux/memblock.h>
- -#include <linux/sort.h>
+ +#include <linux/dma-contiguous.h>
   
   #include <asm/mach-types.h>
   #include <asm/prom.h>
@@@ -98,6 -98,9 +98,9 @@@ void show_mem(unsigned int filter
         printk("Mem-info:\n");
         show_free_areas(filter);
   
+       if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
+               return;
+ 
         for_each_bank (i, mi) {
                 struct membank *bank = &mi->bank[i];
                 unsigned int pfn1, pfn2;
@@@ -134,18 -137,30 +137,18 @@@
   }
   
   static void __init find_limits(unsigned long *min, unsigned long *max_low,
- -      unsigned long *max_high)
+ +                             unsigned long *max_high)
   {
         struct meminfo *mi = &meminfo;
         int i;
   
- -      *min = -1UL;
- -      *max_low = *max_high = 0;
- -
- -      for_each_bank (i, mi) {
- -              struct membank *bank = &mi->bank[i];
- -              unsigned long start, end;
- -
- -              start = bank_pfn_start(bank);
- -              end = bank_pfn_end(bank);
- -
- -              if (*min > start)
- -                      *min = start;
- -              if (*max_high < end)
- -                      *max_high = end;
- -              if (bank->highmem)
- -                      continue;
- -              if (*max_low < end)
- -                      *max_low = end;
- -      }
+ +      /* This assumes the meminfo array is properly sorted */
+ +      *min = bank_pfn_start(&mi->bank[0]);
+ +      for_each_bank (i, mi)
+ +              if (mi->bank[i].highmem)
+ +                              break;
+ +      *max_low = bank_pfn_end(&mi->bank[i - 1]);
+ +      *max_high = bank_pfn_end(&mi->bank[mi->nr_banks - 1]);
   }
   
   static void __init arm_bootmem_init(unsigned long start_pfn,
@@@ -211,7 -226,7 +214,7 @@@ EXPORT_SYMBOL(arm_dma_zone_size)
    * allocations.  This must be the smallest DMA mask in the system,
    * so a successful GFP_DMA allocation will always satisfy this.
    */
- -u32 arm_dma_limit;
+ +phys_addr_t arm_dma_limit;
   
   static void __init arm_adjust_dma_zone(unsigned long *size, unsigned long *hole,
         unsigned long dma_size)
@@@ -226,17 -241,6 +229,17 @@@
   }
   #endif
   
+ +void __init setup_dma_zone(struct machine_desc *mdesc)
+ +{
+ +#ifdef CONFIG_ZONE_DMA
+ +      if (mdesc->dma_zone_size) {
+ +              arm_dma_zone_size = mdesc->dma_zone_size;
+ +              arm_dma_limit = PHYS_OFFSET + arm_dma_zone_size - 1;
+ +      } else
+ +              arm_dma_limit = 0xffffffff;
+ +#endif
+ +}
+ +
   static void __init arm_bootmem_free(unsigned long min, unsigned long max_low,
         unsigned long max_high)
   {
@@@ -284,9 -288,12 +287,9 @@@
          * Adjust the sizes according to any special requirements for
          * this machine type.
          */
- -      if (arm_dma_zone_size) {
+ +      if (arm_dma_zone_size)
                 arm_adjust_dma_zone(zone_size, zhole_size,
                         arm_dma_zone_size >> PAGE_SHIFT);
- -              arm_dma_limit = PHYS_OFFSET + arm_dma_zone_size - 1;
- -      } else
- -              arm_dma_limit = 0xffffffff;
   #endif
   
         free_area_init_node(0, zone_size, min, zhole_size);
@@@ -315,10 -322,19 +318,10 @@@ static void arm_memory_present(void
   }
   #endif
   
- -static int __init meminfo_cmp(const void *_a, const void *_b)
- -{
- -      const struct membank *a = _a, *b = _b;
- -      long cmp = bank_pfn_start(a) - bank_pfn_start(b);
- -      return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
- -}
- -
   void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
   {
         int i;
   
- -      sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
- -
         memblock_init();
         for (i = 0; i < mi->nr_banks; i++)
                 memblock_add(mi->bank[i].start, mi->bank[i].size);
@@@ -358,12 -374,6 +361,12 @@@
         if (mdesc->reserve)
                 mdesc->reserve();
   
+ +      /*
+ +       * reserve memory for DMA contiguous allocations,
+ +       * must come from DMA area inside low memory
+ +       */
+ +      dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));
+ +
         memblock_analyze();
         memblock_dump_all();
   }
@@@ -396,6 -406,8 +399,6 @@@ void __init bootmem_init(void
          */
         arm_bootmem_free(min, max_low, max_high);
   
- -      high_memory = __va(((phys_addr_t)max_low << PAGE_SHIFT) - 1) + 1;
- -
         /*
          * This doesn't seem to be used by the Linux memory manager any
          * more, but is used by ll_rw_block.  If we can get rid of it, we
diff --combined arch/arm/mm/nommu.c

index 4fc6794,a5018fb..385171e
--- 1/arch/arm/mm/nommu.c
--- 2/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@@ -29,8 -29,6 +29,8 @@@ void __init arm_mm_memblock_reserve(voi
   
   void __init sanity_check_meminfo(void)
   {
+ +      phys_addr_t end = bank_phys_end(&meminfo.bank[meminfo.nr_banks - 1]);
+ +      high_memory = __va(end - 1) + 1;
   }
   
   /*
@@@ -45,7 -43,7 +45,7 @@@ void __init paging_init(struct machine_
   /*
    * We don't need to do anything here for nommu machines.
    */
- -void setup_mm_for_reboot(char mode)
+ +void setup_mm_for_reboot(void)
   {
   }
   
@@@ -55,6 -53,12 +55,12 @@@ void flush_dcache_page(struct page *pag
   }
   EXPORT_SYMBOL(flush_dcache_page);
   
+ void flush_kernel_dcache_page(struct page *page)
+ {
+       __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
+ }
+ EXPORT_SYMBOL(flush_kernel_dcache_page);
+ 
   void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
                        unsigned long uaddr, void *dst, const void *src,
                        unsigned long len)
diff --combined arch/powerpc/kernel/sysfs.c

index 2a2e155,ca683a1..876c52b
--- 1/arch/powerpc/kernel/sysfs.c
--- 2/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@@ -18,6 -18,7 +18,7 @@@
   #include <asm/machdep.h>
   #include <asm/smp.h>
   #include <asm/pmc.h>
+ #include <asm/firmware.h>
   
   #include "cacheinfo.h"
   
@@@ -178,14 -179,24 +179,24 @@@ SYSFS_PMCSETUP(purr, SPRN_PURR)
   SYSFS_PMCSETUP(spurr, SPRN_SPURR);
   SYSFS_PMCSETUP(dscr, SPRN_DSCR);
   
+ /*
+   Let's only enable read for phyp resources and
+   enable write when needed with a separate function.
+   Let's be conservative and default to pseries.
+ */
   static SYSDEV_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
   static SYSDEV_ATTR(spurr, 0600, show_spurr, NULL);
   static SYSDEV_ATTR(dscr, 0600, show_dscr, store_dscr);
- static SYSDEV_ATTR(purr, 0600, show_purr, store_purr);
+ static SYSDEV_ATTR(purr, 0400, show_purr, store_purr);
   
   unsigned long dscr_default = 0;
   EXPORT_SYMBOL(dscr_default);
   
+ static void add_write_permission_dev_attr(struct sysdev_attribute *attr)
+ {
+       attr->attr.mode |= 0200;
+ }
+ 
   static ssize_t show_dscr_default(struct sysdev_class *class,
                 struct sysdev_class_attribute *attr, char *buf)
   {
@@@ -394,8 -405,11 +405,11 @@@ static void __cpuinit register_cpu_onli
         if (cpu_has_feature(CPU_FTR_MMCRA))
                 sysdev_create_file(s, &attr_mmcra);
   
-       if (cpu_has_feature(CPU_FTR_PURR))
+       if (cpu_has_feature(CPU_FTR_PURR)) {
+               if (!firmware_has_feature(FW_FEATURE_LPAR))
+                       add_write_permission_dev_attr(&attr_purr);
                 sysdev_create_file(s, &attr_purr);
+       }
   
         if (cpu_has_feature(CPU_FTR_SPURR))
                 sysdev_create_file(s, &attr_spurr);
@@@ -603,7 -617,7 +617,7 @@@ static void register_nodes(void
   int sysfs_add_device_to_node(struct sys_device *dev, int nid)
   {
         struct node *node = &node_devices[nid];
- -      return sysfs_create_link(&node->sysdev.kobj, &dev->kobj,
+ +      return sysfs_create_link(&node->dev.kobj, &dev->kobj,
                         kobject_name(&dev->kobj));
   }
   EXPORT_SYMBOL_GPL(sysfs_add_device_to_node);
@@@ -611,7 -625,7 +625,7 @@@
   void sysfs_remove_device_from_node(struct sys_device *dev, int nid)
   {
         struct node *node = &node_devices[nid];
- -      sysfs_remove_link(&node->sysdev.kobj, kobject_name(&dev->kobj));
+ +      sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj));
   }
   EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node);
   
diff --combined arch/x86/Kconfig

index 2b1b88e,fb2e69d..a939180
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -73,7 -73,6 +73,7 @@@ config X8
         select IRQ_FORCED_THREADING
         select USE_GENERIC_SMP_HELPERS if SMP
         select HAVE_BPF_JIT if (X86_64 && NET)
+ +      select HAVE_ARCH_TRANSPARENT_HUGEPAGE
         select CLKEVT_I8253
         select ARCH_HAVE_NMI_SAFE_CMPXCHG
   
@@@ -2121,6 -2120,7 +2121,7 @@@ source "fs/Kconfig.binfmt
   config IA32_EMULATION
         bool "IA32 Emulation"
         depends on X86_64
+       select BINFMT_ELF
         select COMPAT_BINFMT_ELF
         ---help---
           Include code to run 32-bit programs under a 64-bit kernel. You should
diff --combined drivers/base/memory.c

index f17e3ea,732ad0d..38d0a34
--- 1/drivers/base/memory.c
--- 2/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@@ -1,5 -1,5 +1,5 @@@
   /*
- - * drivers/base/memory.c - basic Memory class support
+ + * Memory subsystem support
    *
    * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
    *            Dave Hansen <haveblue@us.ibm.com>
@@@ -10,6 -10,7 +10,6 @@@
    * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
    */
   
- -#include <linux/sysdev.h>
   #include <linux/module.h>
   #include <linux/init.h>
   #include <linux/topology.h>
@@@ -37,9 -38,26 +37,9 @@@ static inline int base_memory_block_id(
         return section_nr / sections_per_block;
   }
   
- -static struct sysdev_class memory_sysdev_class = {
+ +static struct bus_type memory_subsys = {
         .name = MEMORY_CLASS_NAME,
- -};
- -
- -static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
- -{
- -      return MEMORY_CLASS_NAME;
- -}
- -
- -static int memory_uevent(struct kset *kset, struct kobject *obj,
- -                      struct kobj_uevent_env *env)
- -{
- -      int retval = 0;
- -
- -      return retval;
- -}
- -
- -static const struct kset_uevent_ops memory_uevent_ops = {
- -      .name           = memory_uevent_name,
- -      .uevent         = memory_uevent,
+ +      .dev_name = MEMORY_CLASS_NAME,
   };
   
   static BLOCKING_NOTIFIER_HEAD(memory_chain);
@@@ -78,21 -96,21 +78,21 @@@ int register_memory(struct memory_bloc
   {
         int error;
   
- -      memory->sysdev.cls = &memory_sysdev_class;
- -      memory->sysdev.id = memory->start_section_nr / sections_per_block;
+ +      memory->dev.bus = &memory_subsys;
+ +      memory->dev.id = memory->start_section_nr / sections_per_block;
   
- -      error = sysdev_register(&memory->sysdev);
+ +      error = device_register(&memory->dev);
         return error;
   }
   
   static void
   unregister_memory(struct memory_block *memory)
   {
- -      BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
+ +      BUG_ON(memory->dev.bus != &memory_subsys);
   
         /* drop the ref. we got in remove_memory_block() */
- -      kobject_put(&memory->sysdev.kobj);
- -      sysdev_unregister(&memory->sysdev);
+ +      kobject_put(&memory->dev.kobj);
+ +      device_unregister(&memory->dev);
   }
   
   unsigned long __weak memory_block_size_bytes(void)
@@@ -120,22 -138,22 +120,22 @@@ static unsigned long get_memory_block_s
    * uses.
    */
   
- -static ssize_t show_mem_start_phys_index(struct sys_device *dev,
- -                      struct sysdev_attribute *attr, char *buf)
+ +static ssize_t show_mem_start_phys_index(struct device *dev,
+ +                      struct device_attribute *attr, char *buf)
   {
         struct memory_block *mem =
- -              container_of(dev, struct memory_block, sysdev);
+ +              container_of(dev, struct memory_block, dev);
         unsigned long phys_index;
   
         phys_index = mem->start_section_nr / sections_per_block;
         return sprintf(buf, "%08lx\n", phys_index);
   }
   
- -static ssize_t show_mem_end_phys_index(struct sys_device *dev,
- -                      struct sysdev_attribute *attr, char *buf)
+ +static ssize_t show_mem_end_phys_index(struct device *dev,
+ +                      struct device_attribute *attr, char *buf)
   {
         struct memory_block *mem =
- -              container_of(dev, struct memory_block, sysdev);
+ +              container_of(dev, struct memory_block, dev);
         unsigned long phys_index;
   
         phys_index = mem->end_section_nr / sections_per_block;
@@@ -145,15 -163,17 +145,17 @@@
   /*
    * Show whether the section of memory is likely to be hot-removable
    */
- -static ssize_t show_mem_removable(struct sys_device *dev,
- -                      struct sysdev_attribute *attr, char *buf)
+ +static ssize_t show_mem_removable(struct device *dev,
+ +                      struct device_attribute *attr, char *buf)
   {
         unsigned long i, pfn;
         int ret = 1;
         struct memory_block *mem =
- -              container_of(dev, struct memory_block, sysdev);
+ +              container_of(dev, struct memory_block, dev);
   
         for (i = 0; i < sections_per_block; i++) {
+               if (!present_section_nr(mem->start_section_nr + i))
+                       continue;
                 pfn = section_nr_to_pfn(mem->start_section_nr + i);
                 ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
         }
@@@ -164,11 -184,11 +166,11 @@@
   /*
    * online, offline, going offline, etc.
    */
- -static ssize_t show_mem_state(struct sys_device *dev,
- -                      struct sysdev_attribute *attr, char *buf)
+ +static ssize_t show_mem_state(struct device *dev,
+ +                      struct device_attribute *attr, char *buf)
   {
         struct memory_block *mem =
- -              container_of(dev, struct memory_block, sysdev);
+ +              container_of(dev, struct memory_block, dev);
         ssize_t len = 0;
   
         /*
@@@ -306,13 -326,13 +308,13 @@@ out
   }
   
   static ssize_t
- -store_mem_state(struct sys_device *dev,
- -              struct sysdev_attribute *attr, const char *buf, size_t count)
+ +store_mem_state(struct device *dev,
+ +              struct device_attribute *attr, const char *buf, size_t count)
   {
         struct memory_block *mem;
         int ret = -EINVAL;
   
- -      mem = container_of(dev, struct memory_block, sysdev);
+ +      mem = container_of(dev, struct memory_block, dev);
   
         if (!strncmp(buf, "online", min((int)count, 6)))
                 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
@@@ -333,41 -353,41 +335,41 @@@
    * s.t. if I offline all of these sections I can then
    * remove the physical device?
    */
- -static ssize_t show_phys_device(struct sys_device *dev,
- -                              struct sysdev_attribute *attr, char *buf)
+ +static ssize_t show_phys_device(struct device *dev,
+ +                              struct device_attribute *attr, char *buf)
   {
         struct memory_block *mem =
- -              container_of(dev, struct memory_block, sysdev);
+ +              container_of(dev, struct memory_block, dev);
         return sprintf(buf, "%d\n", mem->phys_device);
   }
   
- -static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
- -static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
- -static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
- -static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
- -static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
+ +static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
+ +static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
+ +static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
+ +static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
+ +static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);
   
   #define mem_create_simple_file(mem, attr_name)        \
- -      sysdev_create_file(&mem->sysdev, &attr_##attr_name)
+ +      device_create_file(&mem->dev, &dev_attr_##attr_name)
   #define mem_remove_simple_file(mem, attr_name)        \
- -      sysdev_remove_file(&mem->sysdev, &attr_##attr_name)
+ +      device_remove_file(&mem->dev, &dev_attr_##attr_name)
   
   /*
    * Block size attribute stuff
    */
   static ssize_t
- -print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
+ +print_block_size(struct device *dev, struct device_attribute *attr,
                  char *buf)
   {
         return sprintf(buf, "%lx\n", get_memory_block_size());
   }
   
- -static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
+ +static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
   
   static int block_size_init(void)
   {
- -      return sysfs_create_file(&memory_sysdev_class.kset.kobj,
- -                              &attr_block_size_bytes.attr);
+ +      return device_create_file(memory_subsys.dev_root,
+ +                                &dev_attr_block_size_bytes);
   }
   
   /*
@@@ -378,7 -398,7 +380,7 @@@
    */
   #ifdef CONFIG_ARCH_MEMORY_PROBE
   static ssize_t
- -memory_probe_store(struct class *class, struct class_attribute *attr,
+ +memory_probe_store(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
   {
         u64 phys_addr;
@@@ -405,11 -425,12 +407,11 @@@
   out:
         return ret;
   }
- -static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
+ +static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
   
   static int memory_probe_init(void)
   {
- -      return sysfs_create_file(&memory_sysdev_class.kset.kobj,
- -                              &class_attr_probe.attr);
+ +      return device_create_file(memory_subsys.dev_root, &dev_attr_probe);
   }
   #else
   static inline int memory_probe_init(void)
@@@ -425,8 -446,8 +427,8 @@@
   
   /* Soft offline a page */
   static ssize_t
- -store_soft_offline_page(struct class *class,
- -                      struct class_attribute *attr,
+ +store_soft_offline_page(struct device *dev,
+ +                      struct device_attribute *attr,
                         const char *buf, size_t count)
   {
         int ret;
@@@ -444,8 -465,8 +446,8 @@@
   
   /* Forcibly offline a page, including killing processes. */
   static ssize_t
- -store_hard_offline_page(struct class *class,
- -                      struct class_attribute *attr,
+ +store_hard_offline_page(struct device *dev,
+ +                      struct device_attribute *attr,
                         const char *buf, size_t count)
   {
         int ret;
@@@ -459,18 -480,18 +461,18 @@@
         return ret ? ret : count;
   }
   
- -static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
- -static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);
+ +static DEVICE_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
+ +static DEVICE_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);
   
   static __init int memory_fail_init(void)
   {
         int err;
   
- -      err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
- -                              &class_attr_soft_offline_page.attr);
+ +      err = device_create_file(memory_subsys.dev_root,
+ +                              &dev_attr_soft_offline_page);
         if (!err)
- -              err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
- -                              &class_attr_hard_offline_page.attr);
+ +              err = device_create_file(memory_subsys.dev_root,
+ +                              &dev_attr_hard_offline_page);
         return err;
   }
   #else
@@@ -490,23 -511,31 +492,23 @@@ int __weak arch_get_memory_phys_device(
         return 0;
   }
   
+ +/*
+ + * A reference for the returned object is held and the reference for the
+ + * hinted object is released.
+ + */
   struct memory_block *find_memory_block_hinted(struct mem_section *section,
                                               struct memory_block *hint)
   {
- -      struct kobject *kobj;
- -      struct sys_device *sysdev;
- -      struct memory_block *mem;
- -      char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
         int block_id = base_memory_block_id(__section_nr(section));
+ +      struct device *hintdev = hint ? &hint->dev : NULL;
+ +      struct device *dev;
   
- -      kobj = hint ? &hint->sysdev.kobj : NULL;
- -
- -      /*
- -       * This only works because we know that section == sysdev->id
- -       * slightly redundant with sysdev_register()
- -       */
- -      sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id);
- -
- -      kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj);
- -      if (!kobj)
+ +      dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
+ +      if (hint)
+ +              put_device(&hint->dev);
+ +      if (!dev)
                 return NULL;
- -
- -      sysdev = container_of(kobj, struct sys_device, kobj);
- -      mem = container_of(sysdev, struct memory_block, sysdev);
- -
- -      return mem;
+ +      return container_of(dev, struct memory_block, dev);
   }
   
   /*
@@@ -515,7 -544,7 +517,7 @@@
    * this gets to be a real problem, we can always use a radix
    * tree or something here.
    *
- - * This could be made generic for all sysdev classes.
+ + * This could be made generic for all device subsystems.
    */
   struct memory_block *find_memory_block(struct mem_section *section)
   {
@@@ -571,7 -600,7 +573,7 @@@ static int add_memory_section(int nid, 
         mem = find_memory_block(section);
         if (mem) {
                 mem->section_count++;
- -              kobject_put(&mem->sysdev.kobj);
+ +              kobject_put(&mem->dev.kobj);
         } else
                 ret = init_memory_block(&mem, section, state);
   
@@@ -604,7 -633,7 +606,7 @@@ int remove_memory_block(unsigned long n
                 unregister_memory(mem);
                 kfree(mem);
         } else
- -              kobject_put(&mem->sysdev.kobj);
+ +              kobject_put(&mem->dev.kobj);
   
         mutex_unlock(&mem_sysfs_mutex);
         return 0;
@@@ -637,7 -666,8 +639,7 @@@ int __init memory_dev_init(void
         int err;
         unsigned long block_sz;
   
- -      memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
- -      ret = sysdev_class_register(&memory_sysdev_class);
+ +      ret = subsys_system_register(&memory_subsys, NULL);
         if (ret)
                 goto out;
   
diff --combined fs/debugfs/inode.c

index 8676415,fb001cd..d813d6f
--- 1/fs/debugfs/inode.c
--- 2/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@@ -30,7 -30,7 +30,7 @@@ static struct vfsmount *debugfs_mount
   static int debugfs_mount_count;
   static bool debugfs_registered;
   
- -static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev,
+ +static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev,
                                        void *data, const struct file_operations *fops)
   
   {
@@@ -69,7 -69,7 +69,7 @@@
   
   /* SMP-safe */
   static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
- -                       int mode, dev_t dev, void *data,
+ +                       umode_t mode, dev_t dev, void *data,
                          const struct file_operations *fops)
   {
         struct inode *inode;
@@@ -87,7 -87,7 +87,7 @@@
         return error;
   }
   
- -static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
+ +static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode,
                          void *data, const struct file_operations *fops)
   {
         int res;
@@@ -101,14 -101,14 +101,14 @@@
         return res;
   }
   
- -static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode,
+ +static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
                         void *data, const struct file_operations *fops)
   {
         mode = (mode & S_IALLUGO) | S_IFLNK;
         return debugfs_mknod(dir, dentry, mode, 0, data, fops);
   }
   
- -static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode,
+ +static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                           void *data, const struct file_operations *fops)
   {
         int res;
@@@ -146,7 -146,7 +146,7 @@@ static struct file_system_type debug_fs
         .kill_sb =      kill_litter_super,
   };
   
- -static int debugfs_create_by_name(const char *name, mode_t mode,
+ +static int debugfs_create_by_name(const char *name, umode_t mode,
                                   struct dentry *parent,
                                   struct dentry **dentry,
                                   void *data,
@@@ -214,7 -214,7 +214,7 @@@
    * If debugfs is not enabled in the kernel, the value -%ENODEV will be
    * returned.
    */
- -struct dentry *debugfs_create_file(const char *name, mode_t mode,
+ +struct dentry *debugfs_create_file(const char *name, umode_t mode,
                                    struct dentry *parent, void *data,
                                    const struct file_operations *fops)
   {
@@@ -380,8 -380,7 +380,7 @@@ EXPORT_SYMBOL_GPL(debugfs_remove)
    */
   void debugfs_remove_recursive(struct dentry *dentry)
   {
-       struct dentry *child;
-       struct dentry *parent;
+       struct dentry *child, *next, *parent;
   
         if (!dentry)
                 return;
@@@ -391,61 -390,37 +390,37 @@@
                 return;
   
         parent = dentry;
+  down:
         mutex_lock(&parent->d_inode->i_mutex);
+       list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
+               if (!debugfs_positive(child))
+                       continue;
   
-       while (1) {
-               /*
-                * When all dentries under "parent" has been removed,
-                * walk up the tree until we reach our starting point.
-                */
-               if (list_empty(&parent->d_subdirs)) {
-                       mutex_unlock(&parent->d_inode->i_mutex);
-                       if (parent == dentry)
-                               break;
-                       parent = parent->d_parent;
-                       mutex_lock(&parent->d_inode->i_mutex);
-               }
-               child = list_entry(parent->d_subdirs.next, struct dentry,
-                               d_u.d_child);
-  next_sibling:
- 
-               /*
-                * If "child" isn't empty, walk down the tree and
-                * remove all its descendants first.
-                */
+               /* perhaps simple_empty(child) makes more sense */
                 if (!list_empty(&child->d_subdirs)) {
                         mutex_unlock(&parent->d_inode->i_mutex);
                         parent = child;
-                       mutex_lock(&parent->d_inode->i_mutex);
-                       continue;
-               }
-               __debugfs_remove(child, parent);
-               if (parent->d_subdirs.next == &child->d_u.d_child) {
-                       /*
-                        * Try the next sibling.
-                        */
-                       if (child->d_u.d_child.next != &parent->d_subdirs) {
-                               child = list_entry(child->d_u.d_child.next,
-                                                  struct dentry,
-                                                  d_u.d_child);
-                               goto next_sibling;
-                       }
- 
-                       /*
-                        * Avoid infinite loop if we fail to remove
-                        * one dentry.
-                        */
-                       mutex_unlock(&parent->d_inode->i_mutex);
-                       break;
+                       goto down;
                 }
-               simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+  up:
+               if (!__debugfs_remove(child, parent))
+                       simple_release_fs(&debugfs_mount, &debugfs_mount_count);
         }
   
-       parent = dentry->d_parent;
+       mutex_unlock(&parent->d_inode->i_mutex);
+       child = parent;
+       parent = parent->d_parent;
         mutex_lock(&parent->d_inode->i_mutex);
-       __debugfs_remove(dentry, parent);
+ 
+       if (child != dentry) {
+               next = list_entry(child->d_u.d_child.next, struct dentry,
+                                       d_u.d_child);
+               goto up;
+       }
+ 
+       if (!__debugfs_remove(child, parent))
+               simple_release_fs(&debugfs_mount, &debugfs_mount_count);
         mutex_unlock(&parent->d_inode->i_mutex);
-       simple_release_fs(&debugfs_mount, &debugfs_mount_count);
   }
   EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
   
diff --combined fs/exec.c

index 3aa5c56,a2d0e51..ad963af
--- 1/fs/exec.c
--- 2/fs/exec.c
+++ b/fs/exec.c
@@@ -1159,13 -1159,6 +1159,6 @@@ void setup_new_exec(struct linux_binpr
                         set_dumpable(current->mm, suid_dumpable);
         }
   
-       /*
-        * Flush performance counters when crossing a
-        * security domain:
-        */
-       if (!get_dumpable(current->mm))
-               perf_event_exit_task(current);
- 
         /* An exec changes our domain. We are no longer part of the thread
            group */
   
@@@ -1229,6 -1222,15 +1222,15 @@@ void install_exec_creds(struct linux_bi
   
         commit_creds(bprm->cred);
         bprm->cred = NULL;
+ 
+       /*
+        * Disable monitoring for regular users
+        * when executing setuid binaries. Must
+        * wait until new credentials are committed
+        * by commit_creds() above
+        */
+       if (get_dumpable(current->mm) != SUID_DUMP_USER)
+               perf_event_exit_task(current);
         /*
          * cred_guard_mutex must be held at least to this point to prevent
          * ptrace_attach() from altering our determination of the task's
@@@ -2092,8 -2094,8 +2094,8 @@@ static int umh_pipe_setup(struct subpro
         fd_install(0, rp);
         spin_lock(&cf->file_lock);
         fdt = files_fdtable(cf);
- -      FD_SET(0, fdt->open_fds);
- -      FD_CLR(0, fdt->close_on_exec);
+ +      __set_open_fd(0, fdt);
+ +      __clear_close_on_exec(0, fdt);
         spin_unlock(&cf->file_lock);
   
         /* and disallow core files too */
diff --combined fs/fat/inode.c

index 1f66735,fc33ca1..04997de
--- 1/fs/fat/inode.c
--- 2/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@@ -1238,6 -1238,19 +1238,19 @@@ static int fat_read_root(struct inode *
         return 0;
   }
   
+ static unsigned long calc_fat_clusters(struct super_block *sb)
+ {
+       struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ 
+       /* Divide first to avoid overflow */
+       if (sbi->fat_bits != 12) {
+               unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits;
+               return ent_per_sec * sbi->fat_length;
+       }
+ 
+       return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
+ }
+ 
   /*
    * Read the super block of an MS-DOS FS.
    */
@@@ -1247,7 -1260,6 +1260,7 @@@ int fat_fill_super(struct super_block *
         struct inode *root_inode = NULL, *fat_inode = NULL;
         struct buffer_head *bh;
         struct fat_boot_sector *b;
+ +      struct fat_boot_bsx *bsx;
         struct msdos_sb_info *sbi;
         u16 logical_sector_size;
         u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors;
@@@ -1392,8 -1404,6 +1405,8 @@@
                         goto out_fail;
                 }
   
+ +              bsx = (struct fat_boot_bsx *)(bh->b_data + FAT32_BSX_OFFSET);
+ +
                 fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data;
                 if (!IS_FSINFO(fsinfo)) {
                         fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: "
@@@ -1409,14 -1419,8 +1422,14 @@@
                 }
   
                 brelse(fsinfo_bh);
+ +      } else {
+ +              bsx = (struct fat_boot_bsx *)(bh->b_data + FAT16_BSX_OFFSET);
         }
   
+ +      /* interpret volume ID as a little endian 32 bit integer */
+ +      sbi->vol_id = (((u32)bsx->vol_id[0]) | ((u32)bsx->vol_id[1] << 8) |
+ +              ((u32)bsx->vol_id[2] << 16) | ((u32)bsx->vol_id[3] << 24));
+ +
         sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
         sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
   
@@@ -1443,7 -1447,7 +1456,7 @@@
                 sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12;
   
         /* check that FAT table does not overflow */
-       fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
+       fat_clusters = calc_fat_clusters(sb);
         total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
         if (total_clusters > MAX_FAT(sb)) {
                 if (!silent)
diff --combined fs/ubifs/dir.c

index 1f1028d,aaebf0f..32fbe62
--- 1/fs/ubifs/dir.c
--- 2/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@@ -170,6 -170,8 +170,6 @@@ struct inode *ubifs_new_inode(struct ub
         return inode;
   }
   
- -#ifdef CONFIG_UBIFS_FS_DEBUG
- -
   static int dbg_check_name(const struct ubifs_info *c,
                           const struct ubifs_dent_node *dent,
                           const struct qstr *nm)
@@@ -183,6 -185,12 +183,6 @@@
         return 0;
   }
   
- -#else
- -
- -#define dbg_check_name(c, dent, nm) 0
- -
- -#endif
- -
   static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
                                    struct nameidata *nd)
   {
@@@ -349,31 -357,50 +349,50 @@@ static unsigned int vfs_dent_type(uint8
   static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
   {
         int err, over = 0;
+       loff_t pos = file->f_pos;
         struct qstr nm;
         union ubifs_key key;
         struct ubifs_dent_node *dent;
         struct inode *dir = file->f_path.dentry->d_inode;
         struct ubifs_info *c = dir->i_sb->s_fs_info;
   
-       dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
+       dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, pos);
   
-       if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
+       if (pos > UBIFS_S_KEY_HASH_MASK || pos == 2)
                 /*
                  * The directory was seek'ed to a senseless position or there
                  * are no more entries.
                  */
                 return 0;
   
+       if (file->f_version == 0) {
+               /*
+                * The file was seek'ed, which means that @file->private_data
+                * is now invalid. This may also be just the first
+                * 'ubifs_readdir()' invocation, in which case
+                * @file->private_data is NULL, and the below code is
+                * basically a no-op.
+                */
+               kfree(file->private_data);
+               file->private_data = NULL;
+       }
+ 
+       /*
+        * 'generic_file_llseek()' unconditionally sets @file->f_version to
+        * zero, and we use this for detecting whether the file was seek'ed.
+        */
+       file->f_version = 1;
+ 
         /* File positions 0 and 1 correspond to "." and ".." */
-       if (file->f_pos == 0) {
+       if (pos == 0) {
                 ubifs_assert(!file->private_data);
                 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
                 if (over)
                         return 0;
-               file->f_pos = 1;
+               file->f_pos = pos = 1;
         }
   
-       if (file->f_pos == 1) {
+       if (pos == 1) {
                 ubifs_assert(!file->private_data);
                 over = filldir(dirent, "..", 2, 1,
                                parent_ino(file->f_path.dentry), DT_DIR);
@@@ -389,7 -416,7 +408,7 @@@
                         goto out;
                 }
   
-               file->f_pos = key_hash_flash(c, &dent->key);
+               file->f_pos = pos = key_hash_flash(c, &dent->key);
                 file->private_data = dent;
         }
   
@@@ -397,17 -424,16 +416,16 @@@
         if (!dent) {
                 /*
                  * The directory was seek'ed to and is now readdir'ed.
-                * Find the entry corresponding to @file->f_pos or the
-                * closest one.
+                * Find the entry corresponding to @pos or the closest one.
                  */
-               dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
+               dent_key_init_hash(c, &key, dir->i_ino, pos);
                 nm.name = NULL;
                 dent = ubifs_tnc_next_ent(c, &key, &nm);
                 if (IS_ERR(dent)) {
                         err = PTR_ERR(dent);
                         goto out;
                 }
-               file->f_pos = key_hash_flash(c, &dent->key);
+               file->f_pos = pos = key_hash_flash(c, &dent->key);
                 file->private_data = dent;
         }
   
@@@ -419,7 -445,7 +437,7 @@@
                              ubifs_inode(dir)->creat_sqnum);
   
                 nm.len = le16_to_cpu(dent->nlen);
-               over = filldir(dirent, dent->name, nm.len, file->f_pos,
+               over = filldir(dirent, dent->name, nm.len, pos,
                                le64_to_cpu(dent->inum),
                                vfs_dent_type(dent->type));
                 if (over)
@@@ -435,9 -461,17 +453,17 @@@
                 }
   
                 kfree(file->private_data);
-               file->f_pos = key_hash_flash(c, &dent->key);
+               file->f_pos = pos = key_hash_flash(c, &dent->key);
                 file->private_data = dent;
                 cond_resched();
+ 
+               if (file->f_version == 0)
+                       /*
+                        * The file was seek'ed meanwhile, let's return and start
+                        * reading direntries from the new position on the next
+                        * invocation.
+                        */
+                       return 0;
         }
   
   out:
@@@ -448,15 -482,13 +474,13 @@@
   
         kfree(file->private_data);
         file->private_data = NULL;
+       /* 2 is a special value indicating that there are no more direntries */
         file->f_pos = 2;
         return 0;
   }
   
- /* If a directory is seeked, we have to free saved readdir() state */
   static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin)
   {
-       kfree(file->private_data);
-       file->private_data = NULL;
         return generic_file_llseek(file, offset, origin);
   }
   
@@@ -558,7 -590,6 +582,7 @@@ static int ubifs_unlink(struct inode *d
         int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
         int err, budgeted = 1;
         struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+ +      unsigned int saved_nlink = inode->i_nlink;
   
         /*
          * Budget request settings: deletion direntry, deletion inode (+1 for
@@@ -606,7 -637,7 +630,7 @@@
   out_cancel:
         dir->i_size += sz_change;
         dir_ui->ui_size = dir->i_size;
- -      inc_nlink(inode);
+ +      set_nlink(inode, saved_nlink);
         unlock_2_inodes(dir, inode);
         if (budgeted)
                 ubifs_release_budget(c, &req);
@@@ -697,7 -728,8 +721,7 @@@ out_cancel
         dir->i_size += sz_change;
         dir_ui->ui_size = dir->i_size;
         inc_nlink(dir);
- -      inc_nlink(inode);
- -      inc_nlink(inode);
+ +      set_nlink(inode, 2);
         unlock_2_inodes(dir, inode);
         if (budgeted)
                 ubifs_release_budget(c, &req);
@@@ -969,7 -1001,6 +993,7 @@@ static int ubifs_rename(struct inode *o
         struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
                         .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
         struct timespec time;
+ +      unsigned int saved_nlink;
   
         /*
          * Budget request settings: deletion direntry, new direntry, removing
@@@ -980,8 -1011,8 +1004,8 @@@
          * separately.
          */
   
- -      dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in "
- -              "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
+ +      dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in dir ino %lu",
+ +              old_dentry->d_name.len, old_dentry->d_name.name,
                 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
                 new_dentry->d_name.name, new_dir->i_ino);
         ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
@@@ -1052,14 -1083,13 +1076,14 @@@
         if (unlink) {
                 /*
                  * Directories cannot have hard-links, so if this is a
- -               * directory, decrement its @i_nlink twice because an empty
- -               * directory has @i_nlink 2.
+ +               * directory, just clear @i_nlink.
                  */
+ +              saved_nlink = new_inode->i_nlink;
                 if (is_dir)
+ +                      clear_nlink(new_inode);
+ +              else
                         drop_nlink(new_inode);
                 new_inode->i_ctime = time;
- -              drop_nlink(new_inode);
         } else {
                 new_dir->i_size += new_sz;
                 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
@@@ -1096,7 -1126,9 +1120,7 @@@
   
   out_cancel:
         if (unlink) {
- -              if (is_dir)
- -                      inc_nlink(new_inode);
- -              inc_nlink(new_inode);
+ +              set_nlink(new_inode, saved_nlink);
         } else {
                 new_dir->i_size -= new_sz;
                 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
@@@ -1179,10 -1211,12 +1203,10 @@@ const struct inode_operations ubifs_dir
         .rename      = ubifs_rename,
         .setattr     = ubifs_setattr,
         .getattr     = ubifs_getattr,
- -#ifdef CONFIG_UBIFS_FS_XATTR
         .setxattr    = ubifs_setxattr,
         .getxattr    = ubifs_getxattr,
         .listxattr   = ubifs_listxattr,
         .removexattr = ubifs_removexattr,
- -#endif
   };
   
   const struct file_operations ubifs_dir_operations = {
diff --combined include/linux/mm.h

index e5f83b1,305fd75..8c3d2bf
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -865,7 -865,8 +865,8 @@@ extern void pagefault_out_of_memory(voi
    * Flags passed to show_mem() and show_free_areas() to suppress output in
    * various contexts.
    */
- #define SHOW_MEM_FILTER_NODES (0x0001u)       /* filter disallowed nodes */
+ #define SHOW_MEM_FILTER_NODES         (0x0001u)       /* disallowed nodes */
+ #define SHOW_MEM_FILTER_PAGE_COUNT    (0x0002u)       /* page type count */
   
   extern void show_free_areas(unsigned int flags);
   extern bool skip_free_areas_node(unsigned int flags, int nid);
@@@ -1630,22 -1631,5 +1631,22 @@@ extern void copy_user_huge_page(struct 
                                 unsigned int pages_per_huge_page);
   #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
   
+ +#ifdef CONFIG_DEBUG_PAGEALLOC
+ +extern unsigned int _debug_guardpage_minorder;
+ +
+ +static inline unsigned int debug_guardpage_minorder(void)
+ +{
+ +      return _debug_guardpage_minorder;
+ +}
+ +
+ +static inline bool page_is_guard(struct page *page)
+ +{
+ +      return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+ +}
+ +#else
+ +static inline unsigned int debug_guardpage_minorder(void) { return 0; }
+ +static inline bool page_is_guard(struct page *page) { return false; }
+ +#endif /* CONFIG_DEBUG_PAGEALLOC */
+ +
   #endif /* __KERNEL__ */
   #endif /* _LINUX_MM_H */
diff --combined kernel/sched.c

index 6ab532c,d93369a..10dad8e
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -5296,7 -5296,6 +5296,7 @@@ int can_nice(const struct task_struct *
         return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
                 capable(CAP_SYS_NICE));
   }
+ +EXPORT_SYMBOL_GPL(can_nice);
   
   #ifdef __ARCH_WANT_SYS_NICE
   
@@@ -6673,16 -6672,25 +6673,25 @@@ static void sd_free_ctl_entry(struct ct
         *tablep = NULL;
   }
   
+ static int min_load_idx = 0;
+ static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+ 
   static void
   set_table_entry(struct ctl_table *entry,
                 const char *procname, void *data, int maxlen,
-               mode_t mode, proc_handler *proc_handler)
+               mode_t mode, proc_handler *proc_handler,
+               bool load_idx)
   {
         entry->procname = procname;
         entry->data = data;
         entry->maxlen = maxlen;
         entry->mode = mode;
         entry->proc_handler = proc_handler;
+ 
+       if (load_idx) {
+               entry->extra1 = &min_load_idx;
+               entry->extra2 = &max_load_idx;
+       }
   }
   
   static struct ctl_table *
@@@ -6694,30 -6702,30 +6703,30 @@@ sd_alloc_ctl_domain_table(struct sched_
                 return NULL;
   
         set_table_entry(&table[0], "min_interval", &sd->min_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax);
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
         set_table_entry(&table[1], "max_interval", &sd->max_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax);
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
         set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
         set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
         set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
         set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
         set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
         set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, false);
         set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, false);
         set_table_entry(&table[9], "cache_nice_tries",
                 &sd->cache_nice_tries,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, false);
         set_table_entry(&table[10], "flags", &sd->flags,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, false);
         set_table_entry(&table[11], "name", sd->name,
-               CORENAME_MAX_SIZE, 0444, proc_dostring);
+               CORENAME_MAX_SIZE, 0444, proc_dostring, false);
         /* &table[12] is terminator */
   
         return table;
diff --combined kernel/trace/trace.c

index 0c99b15,ce1067f..6264daa
--- 1/kernel/trace/trace.c
--- 2/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@@ -631,7 -631,15 +631,15 @@@ __update_max_tr(struct trace_array *tr
   
         memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
         max_data->pid = tsk->pid;
-       max_data->uid = task_uid(tsk);
+       /*
+        * If tsk == current, then use current_uid(), as that does not use
+        * RCU. The irq tracer can be called out of RCU scope.
+        */
+       if (tsk == current)
+               max_data->uid = current_uid();
+       else
+               max_data->uid = task_uid(tsk);
+ 
         max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
         max_data->policy = tsk->policy;
         max_data->rt_priority = tsk->rt_priority;
@@@ -3367,6 -3375,7 +3375,7 @@@ waitagain
         memset(&iter->seq, 0,
                sizeof(struct trace_iterator) -
                offsetof(struct trace_iterator, seq));
+       cpumask_clear(iter->started);
         iter->pos = -1;
   
         trace_event_read_lock();
@@@ -4426,7 -4435,7 +4435,7 @@@ static const struct file_operations tra
   };
   
   struct dentry *trace_create_file(const char *name,
- -                               mode_t mode,
+ +                               umode_t mode,
                                  struct dentry *parent,
                                  void *data,
                                  const struct file_operations *fops)
diff --combined mm/huge_memory.c

index 0e5d01f,ed0ed8a..b43721f
--- 1/mm/huge_memory.c
--- 2/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@@ -538,7 -538,7 +538,7 @@@ static int __init hugepage_init(void
          * where the extra memory used could hurt more than TLB overhead
          * is likely to save.  The admin can still enable it through /sys.
          */
- -      if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+ +      if (totalram_pages < (200 << (20 - PAGE_SHIFT)))
                 transparent_hugepage_flags = 0;
   
         start_khugepaged();
@@@ -790,28 -790,6 +790,28 @@@ pgtable_t get_pmd_huge_pte(struct mm_st
         return pgtable;
   }
   
+ +void huge_pmd_set_accessed(struct mm_struct *mm,
+ +                         struct vm_area_struct *vma,
+ +                         unsigned long address,
+ +                         pmd_t *pmd, pmd_t orig_pmd,
+ +                         int dirty)
+ +{
+ +      pmd_t entry;
+ +      unsigned long haddr;
+ +
+ +      spin_lock(&mm->page_table_lock);
+ +      if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ +              goto unlock;
+ +
+ +      entry = pmd_mkyoung(orig_pmd);
+ +      haddr = address & HPAGE_PMD_MASK;
+ +      if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
+ +              update_mmu_cache_pmd(vma, address, pmd);
+ +
+ +unlock:
+ +      spin_unlock(&mm->page_table_lock);
+ +}
+ +
   static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                         struct vm_area_struct *vma,
                                         unsigned long address,
@@@ -925,7 -903,7 +925,7 @@@ int do_huge_pmd_wp_page(struct mm_struc
                 entry = pmd_mkyoung(orig_pmd);
                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                 if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
- -                      update_mmu_cache(vma, address, entry);
+ +                      update_mmu_cache(vma, address, pmd);
                 ret |= VM_FAULT_WRITE;
                 goto out_unlock;
         }
@@@ -975,7 -953,7 +975,7 @@@
                 pmdp_clear_flush_notify(vma, haddr, pmd);
                 page_add_new_anon_rmap(new_page, vma, haddr);
                 set_pmd_at(mm, haddr, pmd, entry);
- -              update_mmu_cache(vma, address, entry);
+ +              update_mmu_cache(vma, address, pmd);
                 page_remove_rmap(page);
                 put_page(page);
                 ret |= VM_FAULT_WRITE;
@@@ -1904,6 -1882,8 +1904,8 @@@ static void collapse_huge_page(struct m
                 goto out;
   
         vma = find_vma(mm, address);
+       if (!vma)
+               goto out;
         hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
         hend = vma->vm_end & HPAGE_PMD_MASK;
         if (address < hstart || address + HPAGE_PMD_SIZE > hend)
@@@ -1959,7 -1939,12 +1961,12 @@@
                 pte_unmap(pte);
                 spin_lock(&mm->page_table_lock);
                 BUG_ON(!pmd_none(*pmd));
-               set_pmd_at(mm, address, pmd, _pmd);
+               /*
+                * We can only use set_pmd_at when establishing
+                * hugepmds and never for establishing regular pmds that
+                * point to regular pagetables. Use pmd_populate for that.
+                */
+               pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                 spin_unlock(&mm->page_table_lock);
                 anon_vma_unlock(vma->anon_vma);
                 goto out;
@@@ -1993,7 -1978,7 +2000,7 @@@
         BUG_ON(!pmd_none(*pmd));
         page_add_new_anon_rmap(new_page, vma, address);
         set_pmd_at(mm, address, pmd, _pmd);
- -      update_mmu_cache(vma, address, _pmd);
+ +      update_mmu_cache(vma, address, pmd);
         prepare_pmd_huge_pte(pgtable, mm);
         spin_unlock(&mm->page_table_lock);
   
diff --combined mm/hugetlb.c

index 581d985,ddf2128..267135d
--- 1/mm/hugetlb.c
--- 2/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@@ -633,7 -633,6 +633,7 @@@ static void free_huge_page(struct page 
                 h->surplus_huge_pages--;
                 h->surplus_huge_pages_node[nid]--;
         } else {
+ +              arch_clear_hugepage_flags(page);
                 enqueue_huge_page(h, page);
         }
         spin_unlock(&hugetlb_lock);
@@@ -680,6 -679,23 +680,23 @@@ int PageHuge(struct page *page
   }
   EXPORT_SYMBOL_GPL(PageHuge);
   
+ pgoff_t __basepage_index(struct page *page)
+ {
+       struct page *page_head = compound_head(page);
+       pgoff_t index = page_index(page_head);
+       unsigned long compound_idx;
+ 
+       if (!PageHuge(page_head))
+               return page_index(page);
+ 
+       if (compound_order(page_head) >= MAX_ORDER)
+               compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+       else
+               compound_idx = page - page_head;
+ 
+       return (index << compound_order(page_head)) + compound_idx;
+ }
+ 
   static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
   {
         struct page *page;
@@@ -1678,9 -1694,9 +1695,9 @@@ static void __init hugetlb_sysfs_init(v
   
   /*
    * node_hstate/s - associate per node hstate attributes, via their kobjects,
- - * with node sysdevs in node_devices[] using a parallel array.  The array
- - * index of a node sysdev or _hstate == node id.
- - * This is here to avoid any static dependency of the node sysdev driver, in
+ + * with node devices in node_devices[] using a parallel array.  The array
+ + * index of a node device or _hstate == node id.
+ + * This is here to avoid any static dependency of the node device driver, in
    * the base kernel, on the hugetlb module.
    */
   struct node_hstate {
@@@ -1690,7 -1706,7 +1707,7 @@@
   struct node_hstate node_hstates[MAX_NUMNODES];
   
   /*
- - * A subset of global hstate attributes for node sysdevs
+ + * A subset of global hstate attributes for node devices
    */
   static struct attribute *per_node_hstate_attrs[] = {
         &nr_hugepages_attr.attr,
@@@ -1704,7 -1720,7 +1721,7 @@@ static struct attribute_group per_node_
   };
   
   /*
- - * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
+ + * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
    * Returns node id via non-NULL nidp.
    */
   static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
@@@ -1727,13 -1743,13 +1744,13 @@@
   }
   
   /*
- - * Unregister hstate attributes from a single node sysdev.
+ + * Unregister hstate attributes from a single node device.
    * No-op if no hstate attributes attached.
    */
   void hugetlb_unregister_node(struct node *node)
   {
         struct hstate *h;
- -      struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ +      struct node_hstate *nhs = &node_hstates[node->dev.id];
   
         if (!nhs->hugepages_kobj)
                 return;         /* no hstate attributes */
@@@ -1749,7 -1765,7 +1766,7 @@@
   }
   
   /*
- - * hugetlb module exit:  unregister hstate attributes from node sysdevs
+ + * hugetlb module exit:  unregister hstate attributes from node devices
    * that have them.
    */
   static void hugetlb_unregister_all_nodes(void)
@@@ -1757,7 -1773,7 +1774,7 @@@
         int nid;
   
         /*
- -       * disable node sysdev registrations.
+ +       * disable node device registrations.
          */
         register_hugetlbfs_with_node(NULL, NULL);
   
@@@ -1769,20 -1785,20 +1786,20 @@@
   }
   
   /*
- - * Register hstate attributes for a single node sysdev.
+ + * Register hstate attributes for a single node device.
    * No-op if attributes already registered.
    */
   void hugetlb_register_node(struct node *node)
   {
         struct hstate *h;
- -      struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ +      struct node_hstate *nhs = &node_hstates[node->dev.id];
         int err;
   
         if (nhs->hugepages_kobj)
                 return;         /* already allocated */
   
         nhs->hugepages_kobj = kobject_create_and_add("hugepages",
- -                                                      &node->sysdev.kobj);
+ +                                                      &node->dev.kobj);
         if (!nhs->hugepages_kobj)
                 return;
   
@@@ -1793,7 -1809,7 +1810,7 @@@
                 if (err) {
                         printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
                                         " for node %d\n",
- -                                              h->name, node->sysdev.id);
+ +                                              h->name, node->dev.id);
                         hugetlb_unregister_node(node);
                         break;
                 }
@@@ -1802,8 -1818,8 +1819,8 @@@
   
   /*
    * hugetlb init time:  register hstate attributes for all registered node
- - * sysdevs of nodes that have memory.  All on-line nodes should have
- - * registered their associated sysdev by this time.
+ + * devices of nodes that have memory.  All on-line nodes should have
+ + * registered their associated device by this time.
    */
   static void hugetlb_register_all_nodes(void)
   {
@@@ -1811,12 -1827,12 +1828,12 @@@
   
         for_each_node_state(nid, N_HIGH_MEMORY) {
                 struct node *node = &node_devices[nid];
- -              if (node->sysdev.id == nid)
+ +              if (node->dev.id == nid)
                         hugetlb_register_node(node);
         }
   
         /*
- -       * Let the node sysdev driver know we're here so it can
+ +       * Let the node device driver know we're here so it can
          * [un]register hstate attributes on node hotplug.
          */
         register_hugetlbfs_with_node(hugetlb_register_node,
@@@ -2752,7 -2768,7 +2769,7 @@@ int hugetlb_fault(struct mm_struct *mm
         if (ptep) {
                 entry = huge_ptep_get(ptep);
                 if (unlikely(is_hugetlb_entry_migration(entry))) {
-                       migration_entry_wait(mm, (pmd_t *)ptep, address);
+                       migration_entry_wait_huge(mm, ptep);
                         return 0;
                 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
                         return VM_FAULT_HWPOISON_LARGE |
diff --combined mm/page_alloc.c

index dceb1be,d8762b2..38cd47d
--- 1/mm/page_alloc.c
--- 2/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -57,8 -57,6 +57,8 @@@
   #include <linux/ftrace_event.h>
   #include <linux/memcontrol.h>
   #include <linux/prefetch.h>
+ +#include <linux/migrate.h>
+ +#include <linux/page-debug-flags.h>
   
   #include <asm/tlbflush.h>
   #include <asm/div64.h>
@@@ -98,14 -96,6 +98,14 @@@ EXPORT_SYMBOL(node_states)
   
   unsigned long totalram_pages __read_mostly;
   unsigned long totalreserve_pages __read_mostly;
+ +/*
+ + * When calculating the number of globally allowed dirty pages, there
+ + * is a certain number of per-zone reserves that should not be
+ + * considered dirtyable memory.  This is the sum of those reserves
+ + * over all existing zones that contribute dirtyable memory.
+ + */
+ +unsigned long dirty_balance_reserve __read_mostly;
+ +
   int percpu_pagelist_fraction;
   gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
   
@@@ -137,13 -127,6 +137,13 @@@ void pm_restrict_gfp_mask(void
         saved_gfp_mask = gfp_allowed_mask;
         gfp_allowed_mask &= ~GFP_IOFS;
   }
+ +
+ +bool pm_suspended_storage(void)
+ +{
+ +      if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+ +              return false;
+ +      return true;
+ +}
   #endif /* CONFIG_PM_SLEEP */
   
   #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@@ -420,37 -403,6 +420,37 @@@ static inline void prep_zero_page(struc
                 clear_highpage(page + i);
   }
   
+ +#ifdef CONFIG_DEBUG_PAGEALLOC
+ +unsigned int _debug_guardpage_minorder;
+ +
+ +static int __init debug_guardpage_minorder_setup(char *buf)
+ +{
+ +      unsigned long res;
+ +
+ +      if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
+ +              printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ +              return 0;
+ +      }
+ +      _debug_guardpage_minorder = res;
+ +      printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ +      return 0;
+ +}
+ +__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+ +
+ +static inline void set_page_guard_flag(struct page *page)
+ +{
+ +      __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+ +}
+ +
+ +static inline void clear_page_guard_flag(struct page *page)
+ +{
+ +      __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+ +}
+ +#else
+ +static inline void set_page_guard_flag(struct page *page) { }
+ +static inline void clear_page_guard_flag(struct page *page) { }
+ +#endif
+ +
   static inline void set_page_order(struct page *page, int order)
   {
         set_page_private(page, order);
@@@ -508,11 -460,6 +508,11 @@@ static inline int page_is_buddy(struct 
         if (page_zone_id(page) != page_zone_id(buddy))
                 return 0;
   
+ +      if (page_is_guard(buddy) && page_order(buddy) == order) {
+ +              VM_BUG_ON(page_count(buddy) != 0);
+ +              return 1;
+ +      }
+ +
         if (PageBuddy(buddy) && page_order(buddy) == order) {
                 VM_BUG_ON(page_count(buddy) != 0);
                 return 1;
@@@ -536,10 -483,10 +536,10 @@@
    * free pages of length of (1 << order) and marked with _mapcount -2. Page's
    * order is recorded in page_private(page) field.
    * So when we are allocating or freeing one, we can derive the state of the
- - * other.  That is, if we allocate a small block, and both were   
- - * free, the remainder of the region must be split into blocks.   
+ + * other.  That is, if we allocate a small block, and both were
+ + * free, the remainder of the region must be split into blocks.
    * If a block is freed, and its buddy is also free, then this
- - * triggers coalescing into a block of larger size.            
+ + * triggers coalescing into a block of larger size.
    *
    * -- wli
    */
@@@ -569,19 -516,11 +569,19 @@@ static inline void __free_one_page(stru
                 buddy = page + (buddy_idx - page_idx);
                 if (!page_is_buddy(page, buddy, order))
                         break;
- -
- -              /* Our buddy is free, merge with it and move up one order. */
- -              list_del(&buddy->lru);
- -              zone->free_area[order].nr_free--;
- -              rmv_page_order(buddy);
+ +              /*
+ +               * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard
+ +               * page; merge with it and move up one order.
+ +               */
+ +              if (page_is_guard(buddy)) {
+ +                      clear_page_guard_flag(buddy);
+ +                      set_page_private(page, 0);
+ +                      __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+ +              } else {
+ +                      list_del(&buddy->lru);
+ +                      zone->free_area[order].nr_free--;
+ +                      rmv_page_order(buddy);
+ +              }
                 combined_idx = buddy_idx & page_idx;
                 page = page + (combined_idx - page_idx);
                 page_idx = combined_idx;
@@@ -715,7 -654,7 +715,7 @@@ static bool free_pages_prepare(struct p
         int i;
         int bad = 0;
   
- -      trace_mm_page_free_direct(page, order);
+ +      trace_mm_page_free(page, order);
         kmemcheck_free_shadow(page, order);
   
         if (PageAnon(page))
@@@ -781,24 -720,6 +781,24 @@@ void __meminit __free_pages_bootmem(str
         }
   }
   
+ +#ifdef CONFIG_CMA
+ +/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
+ +void __init init_cma_reserved_pageblock(struct page *page)
+ +{
+ +      unsigned i = pageblock_nr_pages;
+ +      struct page *p = page;
+ +
+ +      do {
+ +              __ClearPageReserved(p);
+ +              set_page_count(p, 0);
+ +      } while (++p, --i);
+ +
+ +      set_page_refcounted(page);
+ +      set_pageblock_migratetype(page, MIGRATE_CMA);
+ +      __free_pages(page, pageblock_order);
+ +      totalram_pages += pageblock_nr_pages;
+ +}
+ +#endif
   
   /*
    * The order of subdivision here is critical for the IO subsystem.
@@@ -825,23 -746,6 +825,23 @@@ static inline void expand(struct zone *
                 high--;
                 size >>= 1;
                 VM_BUG_ON(bad_range(zone, &page[size]));
+ +
+ +#ifdef CONFIG_DEBUG_PAGEALLOC
+ +              if (high < debug_guardpage_minorder()) {
+ +                      /*
+ +                       * Mark as guard pages (or page) so that they can be
+ +                       * merged back into the allocator when buddy is freed.
+ +                       * Corresponding page table entries will not be touched;
+ +                       * the pages stay not present in virtual address space.
+ +                       */
+ +                      INIT_LIST_HEAD(&page[size].lru);
+ +                      set_page_guard_flag(&page[size]);
+ +                      set_page_private(&page[size], high);
+ +                      /* Guard pages are not available for any usage */
+ +                      __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+ +                      continue;
+ +              }
+ +#endif
                 list_add(&page[size].lru, &area->free_list[migratetype]);
                 area->nr_free++;
                 set_page_order(&page[size], high);
@@@ -924,17 -828,11 +924,17 @@@ struct page *__rmqueue_smallest(struct 
    * This array describes the order lists are fallen back to when
    * the free lists for the desirable migrate type are depleted
    */
- -static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
- -      [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
- -      [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
- -      [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
- -      [MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
+ +static int fallbacks[MIGRATE_TYPES][4] = {
+ +      [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
+ +      [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
+ +#ifdef CONFIG_CMA
+ +      [MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+ +      [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
+ +#else
+ +      [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
+ +#endif
+ +      [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
+ +      [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
   };
   
   /*
@@@ -1029,12 -927,12 +1029,12 @@@ __rmqueue_fallback(struct zone *zone, i
         /* Find the largest possible block of pages in the other list */
         for (current_order = MAX_ORDER-1; current_order >= order;
                                                 --current_order) {
- -              for (i = 0; i < MIGRATE_TYPES - 1; i++) {
+ +              for (i = 0;; i++) {
                         migratetype = fallbacks[start_migratetype][i];
   
                         /* MIGRATE_RESERVE handled later if necessary */
                         if (migratetype == MIGRATE_RESERVE)
- -                              continue;
+ +                              break;
   
                         area = &(zone->free_area[current_order]);
                         if (list_empty(&area->free_list[migratetype]))
@@@ -1049,18 -947,11 +1049,18 @@@
                          * pages to the preferred allocation list. If falling
                          * back for a reclaimable kernel allocation, be more
                          * aggressive about taking ownership of free pages
+ +                       *
+ +                       * On the other hand, never change migration
+ +                       * type of MIGRATE_CMA pageblocks nor move CMA
+ +                       * pages on different free lists. We don't
+ +                       * want unmovable pages to be allocated from
+ +                       * MIGRATE_CMA areas.
                          */
- -                      if (unlikely(current_order >= (pageblock_order >> 1)) ||
- -                                      start_migratetype == MIGRATE_RECLAIMABLE ||
- -                                      page_group_by_mobility_disabled) {
- -                              unsigned long pages;
+ +                      if (!is_migrate_cma(migratetype) &&
+ +                          (unlikely(current_order >= pageblock_order / 2) ||
+ +                           start_migratetype == MIGRATE_RECLAIMABLE ||
+ +                           page_group_by_mobility_disabled)) {
+ +                              int pages;
                                 pages = move_freepages_block(zone, page,
                                                                 start_migratetype);
   
@@@ -1078,14 -969,11 +1078,14 @@@
                         rmv_page_order(page);
   
                         /* Take ownership for orders >= pageblock_order */
- -                      if (current_order >= pageblock_order)
+ +                      if (current_order >= pageblock_order &&
+ +                          !is_migrate_cma(migratetype))
                                 change_pageblock_range(page, current_order,
                                                         start_migratetype);
   
- -                      expand(zone, page, order, current_order, area, migratetype);
+ +                      expand(zone, page, order, current_order, area,
+ +                             is_migrate_cma(migratetype)
+ +                           ? migratetype : start_migratetype);
   
                         trace_mm_page_alloc_extfrag(page, order, current_order,
                                 start_migratetype, migratetype);
@@@ -1127,17 -1015,17 +1127,17 @@@ retry_reserve
         return page;
   }
   
- -/* 
+ +/*
    * Obtain a specified number of elements from the buddy allocator, all under
    * a single hold of the lock, for efficiency.  Add them to the supplied list.
    * Returns the number of new pages which were placed at *list.
    */
- -static int rmqueue_bulk(struct zone *zone, unsigned int order, 
+ +static int rmqueue_bulk(struct zone *zone, unsigned int order,
                         unsigned long count, struct list_head *list,
                         int migratetype, int cold)
   {
- -      int i;
- -      
+ +      int mt = migratetype, i;
+ +
         spin_lock(&zone->lock);
         for (i = 0; i < count; ++i) {
                 struct page *page = __rmqueue(zone, order, migratetype);
@@@ -1157,12 -1045,7 +1157,12 @@@
                         list_add(&page->lru, list);
                 else
                         list_add_tail(&page->lru, list);
- -              set_page_private(page, migratetype);
+ +              if (IS_ENABLED(CONFIG_CMA)) {
+ +                      mt = get_pageblock_migratetype(page);
+ +                      if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
+ +                              mt = migratetype;
+ +              }
+ +              set_page_private(page, mt);
                 list = &page->lru;
         }
         __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
@@@ -1327,19 -1210,6 +1327,19 @@@ out
         local_irq_restore(flags);
   }
   
+ +/*
+ + * Free a list of 0-order pages
+ + */
+ +void free_hot_cold_page_list(struct list_head *list, int cold)
+ +{
+ +      struct page *page, *next;
+ +
+ +      list_for_each_entry_safe(page, next, list, lru) {
+ +              trace_mm_page_free_batched(page, cold);
+ +              free_hot_cold_page(page, cold);
+ +      }
+ +}
+ +
   /*
    * split_page takes a non-compound higher-order page, and splits it into
    * n (1<<order) sub-pages: page[0..n]
@@@ -1406,12 -1276,8 +1406,12 @@@ int split_free_page(struct page *page
   
         if (order >= pageblock_order - 1) {
                 struct page *endpage = page + (1 << order) - 1;
- -              for (; page < endpage; page += pageblock_nr_pages)
- -                      set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ +              for (; page < endpage; page += pageblock_nr_pages) {
+ +                      int mt = get_pageblock_migratetype(page);
+ +                      if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
+ +                              set_pageblock_migratetype(page,
+ +                                                        MIGRATE_MOVABLE);
+ +              }
         }
   
         return 1 << order;
@@@ -1542,7 -1408,7 +1542,7 @@@ static int should_fail_alloc_page(gfp_
   
   static int __init fail_page_alloc_debugfs(void)
   {
- -      mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ +      umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
         struct dentry *dir;
   
         dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
@@@ -1591,7 -1457,7 +1591,7 @@@ static bool __zone_watermark_ok(struct 
         long min = mark;
         int o;
   
- -      free_pages -= (1 << order) + 1;
+ +      free_pages -= (1 << order) - 1;
         if (alloc_flags & ALLOC_HIGH)
                 min -= min / 2;
         if (alloc_flags & ALLOC_HARDER)
@@@ -1890,10 -1756,16 +1890,17 @@@ void warn_alloc_failed(gfp_t gfp_mask, 
   {
         unsigned int filter = SHOW_MEM_FILTER_NODES;
   
- -      if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ +      if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ +          debug_guardpage_minorder() > 0)
                 return;
   
+       /*
+        * Walking all memory to count page types is very expensive and should
+        * be inhibited in non-blockable contexts.
+        */
+       if (!(gfp_mask & __GFP_WAIT))
+               filter |= SHOW_MEM_FILTER_PAGE_COUNT;
+ 
         /*
          * This documents exceptions given to allocations in certain
          * contexts that are allowed to allocate outside current's set
@@@ -1930,25 -1802,12 +1937,25 @@@
   
   static inline int
   should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+ +                              unsigned long did_some_progress,
                                 unsigned long pages_reclaimed)
   {
         /* Do not loop if specifically requested */
         if (gfp_mask & __GFP_NORETRY)
                 return 0;
   
+ +      /* Always retry if specifically requested */
+ +      if (gfp_mask & __GFP_NOFAIL)
+ +              return 1;
+ +
+ +      /*
+ +       * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+ +       * making forward progress without invoking OOM. Suspend also disables
+ +       * storage devices so kswapd will not help. Bail if we are suspending.
+ +       */
+ +      if (!did_some_progress && pm_suspended_storage())
+ +              return 0;
+ +
         /*
          * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
          * means __GFP_NOFAIL, but that may not be true in other
@@@ -1967,6 -1826,13 +1974,6 @@@
         if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
                 return 1;
   
- -      /*
- -       * Don't let big-order allocations loop unless the caller
- -       * explicitly requests that.
- -       */
- -      if (gfp_mask & __GFP_NOFAIL)
- -              return 1;
- -
         return 0;
   }
   
@@@ -2094,13 -1960,16 +2101,13 @@@ __alloc_pages_direct_compact(gfp_t gfp_
   }
   #endif /* CONFIG_COMPACTION */
   
- -/* The really slow allocator path where we enter direct reclaim */
- -static inline struct page *
- -__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
- -      struct zonelist *zonelist, enum zone_type high_zoneidx,
- -      nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- -      int migratetype, unsigned long *did_some_progress)
+ +/* Perform direct synchronous page reclaim */
+ +static int
+ +__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
+ +                nodemask_t *nodemask)
   {
- -      struct page *page = NULL;
         struct reclaim_state reclaim_state;
- -      bool drained = false;
+ +      int progress;
   
         cond_resched();
   
@@@ -2111,7 -1980,7 +2118,7 @@@
         reclaim_state.reclaimed_slab = 0;
         current->reclaim_state = &reclaim_state;
   
- -      *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+ +      progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
   
         current->reclaim_state = NULL;
         lockdep_clear_current_reclaim_state();
@@@ -2119,21 -1988,6 +2126,21 @@@
   
         cond_resched();
   
+ +      return progress;
+ +}
+ +
+ +/* The really slow allocator path where we enter direct reclaim */
+ +static inline struct page *
+ +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+ +      struct zonelist *zonelist, enum zone_type high_zoneidx,
+ +      nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ +      int migratetype, unsigned long *did_some_progress)
+ +{
+ +      struct page *page = NULL;
+ +      bool drained = false;
+ +
+ +      *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
+ +                                             nodemask);
         if (unlikely(!(*did_some_progress)))
                 return NULL;
   
@@@ -2395,8 -2249,7 +2402,8 @@@ rebalance
   
         /* Check if we should retry the allocation */
         pages_reclaimed += did_some_progress;
- -      if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+ +      if (should_alloc_retry(gfp_mask, order, did_some_progress,
+ +                                              pages_reclaimed)) {
                 /* Wait for some write requests to complete then retry */
                 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                 goto rebalance;
@@@ -2518,6 -2371,16 +2525,6 @@@ unsigned long get_zeroed_page(gfp_t gfp
   }
   EXPORT_SYMBOL(get_zeroed_page);
   
- -void __pagevec_free(struct pagevec *pvec)
- -{
- -      int i = pagevec_count(pvec);
- -
- -      while (--i >= 0) {
- -              trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
- -              free_hot_cold_page(pvec->pages[i], pvec->cold);
- -      }
- -}
- -
   void __free_pages(struct page *page, unsigned int order)
   {
         if (put_page_testzero(page)) {
@@@ -4482,7 -4345,7 +4489,7 @@@ static void __paginginit free_area_init
         init_waitqueue_head(&pgdat->kswapd_wait);
         pgdat->kswapd_max_order = 0;
         pgdat_page_cgroup_init(pgdat);
- -      
+ +
         for (j = 0; j < MAX_NR_ZONES; j++) {
                 struct zone *zone = pgdat->node_zones + j;
                 unsigned long size, realsize, memmap_pages;
@@@ -5272,19 -5135,8 +5279,19 @@@ static void calculate_totalreserve_page
                         if (max > zone->present_pages)
                                 max = zone->present_pages;
                         reserve_pages += max;
+ +                      /*
+ +                       * Lowmem reserves are not available to
+ +                       * GFP_HIGHUSER page cache allocations and
+ +                       * kswapd tries to balance zones to their high
+ +                       * watermark.  As a result, neither should be
+ +                       * regarded as dirtyable memory, to prevent a
+ +                       * situation where reclaim has to clean pages
+ +                       * in order to balance the zones.
+ +                       */
+ +                      zone->dirty_balance_reserve = max;
                 }
         }
+ +      dirty_balance_reserve = reserve_pages;
         totalreserve_pages = reserve_pages;
   }
   
@@@ -5327,7 -5179,14 +5334,7 @@@ static void setup_per_zone_lowmem_reser
         calculate_totalreserve_pages();
   }
   
- -/**
- - * setup_per_zone_wmarks - called when min_free_kbytes changes
- - * or when memory is hot-{added|removed}
- - *
- - * Ensures that the watermark[min,low,high] values for each zone are set
- - * correctly with respect to min_free_kbytes.
- - */
- -void setup_per_zone_wmarks(void)
+ +static void __setup_per_zone_wmarks(void)
   {
         unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
         unsigned long lowmem_pages = 0;
@@@ -5374,11 -5233,6 +5381,11 @@@
   
                 zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
                 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ +
+ +              zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
+ +              zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
+ +              zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
+ +
                 setup_zone_migrate_reserve(zone);
                 spin_unlock_irqrestore(&zone->lock, flags);
         }
@@@ -5387,20 -5241,6 +5394,20 @@@
         calculate_totalreserve_pages();
   }
   
+ +/**
+ + * setup_per_zone_wmarks - called when min_free_kbytes changes
+ + * or when memory is hot-{added|removed}
+ + *
+ + * Ensures that the watermark[min,low,high] values for each zone are set
+ + * correctly with respect to min_free_kbytes.
+ + */
+ +void setup_per_zone_wmarks(void)
+ +{
+ +      mutex_lock(&zonelists_mutex);
+ +      __setup_per_zone_wmarks();
+ +      mutex_unlock(&zonelists_mutex);
+ +}
+ +
   /*
    * The inactive anon list should be small enough that the VM never has to
    * do too much work, but large enough that each inactive page has a chance
@@@ -5774,16 -5614,14 +5781,16 @@@ static in
   __count_immobile_pages(struct zone *zone, struct page *page, int count)
   {
         unsigned long pfn, iter, found;
+ +      int mt;
+ +
         /*
          * For avoiding noise data, lru_add_drain_all() should be called
          * If ZONE_MOVABLE, the zone never contains immobile pages
          */
         if (zone_idx(zone) == ZONE_MOVABLE)
                 return true;
- -
- -      if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+ +      mt = get_pageblock_migratetype(page);
+ +      if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
                 return true;
   
         pfn = page_to_pfn(page);
@@@ -5893,7 -5731,7 +5900,7 @@@ out
         return ret;
   }
   
- -void unset_migratetype_isolate(struct page *page)
+ +void unset_migratetype_isolate(struct page *page, unsigned migratetype)
   {
         struct zone *zone;
         unsigned long flags;
@@@ -5901,264 -5739,12 +5908,264 @@@
         spin_lock_irqsave(&zone->lock, flags);
         if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
                 goto out;
- -      set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- -      move_freepages_block(zone, page, MIGRATE_MOVABLE);
+ +      set_pageblock_migratetype(page, migratetype);
+ +      move_freepages_block(zone, page, migratetype);
   out:
         spin_unlock_irqrestore(&zone->lock, flags);
   }
   
+ +#ifdef CONFIG_CMA
+ +
+ +/*
+ + * Mask off the low bits of @pfn using the larger of MAX_ORDER_NR_PAGES
+ + * and pageblock_nr_pages as the alignment (this rounds @pfn down,
+ + * provided that alignment is a power of two).
+ + */
+ +static unsigned long pfn_max_align_down(unsigned long pfn)
+ +{
+ +      unsigned long align = max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ +                                  pageblock_nr_pages);
+ +
+ +      return pfn & ~(align - 1);
+ +}
+ +
+ +/*
+ + * ALIGN() @pfn up, using the larger of MAX_ORDER_NR_PAGES and
+ + * pageblock_nr_pages as the alignment.
+ + */
+ +static unsigned long pfn_max_align_up(unsigned long pfn)
+ +{
+ +      unsigned long align = max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ +                                  pageblock_nr_pages);
+ +
+ +      return ALIGN(pfn, align);
+ +}
+ +
+ +/*
+ + * Page allocation callback handed to migrate_pages() by
+ + * __alloc_contig_migrate_range().
+ + *
+ + * Allocates a single page with GFP_USER | __GFP_MOVABLE, adding
+ + * __GFP_HIGHMEM when the page being migrated is itself a highmem page.
+ + * @private and @resultp are not used here.
+ + */
+ +static struct page *
+ +__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
+ +                           int **resultp)
+ +{
+ +      const gfp_t base_mask = GFP_USER | __GFP_MOVABLE;
+ +
+ +      if (PageHighMem(page))
+ +              return alloc_page(base_mask | __GFP_HIGHMEM);
+ +
+ +      return alloc_page(base_mask);
+ +}
+ +
+ +/*
+ + * Isolate the pages found in [start, end) batch by batch and hand each
+ + * batch to migrate_pages(), allocating replacement pages through
+ + * __alloc_contig_migrate_alloc().
+ + *
+ + * [start, end) must belong to a single zone.
+ + *
+ + * The loop stops early with -EINTR when a fatal signal is pending or
+ + * when isolate_migratepages_range() returns 0.  If the current batch is
+ + * still not empty on the fifth consecutive pass, it stops with the last
+ + * negative migrate_pages() result, or -EBUSY if that result was not
+ + * negative.  Whatever is left on the list is passed to
+ + * putback_lru_pages() before returning.
+ + *
+ + * Returns 0 or a negative error code; a positive value left over from
+ + * migrate_pages() is reported as 0.
+ + */
+ +static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
+ +{
+ +      /* This function is based on compact_zone() from compaction.c. */
+ +
+ +      unsigned long pfn = start;
+ +      unsigned int tries = 0;
+ +      int ret = 0;
+ +
+ +      struct compact_control cc = {
+ +              .nr_migratepages = 0,
+ +              .order = -1,
+ +              .zone = page_zone(pfn_to_page(start)),
+ +              .sync = true,
+ +      };
+ +      INIT_LIST_HEAD(&cc.migratepages);
+ +
+ +      migrate_prep_local();
+ +
+ +      while (pfn < end || !list_empty(&cc.migratepages)) {
+ +              if (fatal_signal_pending(current)) {
+ +                      ret = -EINTR;
+ +                      break;
+ +              }
+ +
+ +              if (list_empty(&cc.migratepages)) {
+ +                      cc.nr_migratepages = 0;
+ +                      pfn = isolate_migratepages_range(cc.zone, &cc,
+ +                                                       pfn, end);
+ +                      if (!pfn) {
+ +                              ret = -EINTR;
+ +                              break;
+ +                      }
+ +                      tries = 0; /* new batch: restart the retry count */
+ +              } else if (++tries == 5) {
+ +                      ret = ret < 0 ? ret : -EBUSY; /* keep an earlier error */
+ +                      break;
+ +              }
+ +
+ +              ret = migrate_pages(&cc.migratepages,
+ +                                  __alloc_contig_migrate_alloc,
+ +                                  0, false, MIGRATE_SYNC);
+ +      }
+ +
+ +      putback_lru_pages(&cc.migratepages);
+ +      return ret > 0 ? 0 : ret;
+ +}
+ +
+ +/*
+ + * Update zone's cma pages counter used for watermark level calculation.
+ + *
+ + * Adds @count to zone->min_cma_pages while holding zone->lock with
+ + * interrupts disabled, then, after dropping the lock, calls
+ + * setup_per_zone_wmarks() so the zone watermarks are recomputed.
+ + * @count may be negative: __reclaim_pages() calls this with +count and
+ + * later with -count to restore the previous value.
+ + */
+ +static inline void __update_cma_watermarks(struct zone *zone, int count)
+ +{
+ +      unsigned long flags;
+ +      spin_lock_irqsave(&zone->lock, flags);
+ +      zone->min_cma_pages += count;
+ +      spin_unlock_irqrestore(&zone->lock, flags);
+ +      setup_per_zone_wmarks();
+ +}
+ +
+ +/*
+ + * Trigger a memory pressure bump to reclaim some pages in order to be
+ + * able to allocate 'count' pages in single page units.  Does similar
+ + * work to the __alloc_pages_slowpath() function.
+ + *
+ + * The zone's CMA page counter is raised by @count for the duration of
+ + * the call and lowered again before returning.  While the zone fails
+ + * zone_watermark_ok() against its low watermark, kswapd is woken and
+ + * __perform_reclaim() is run; out_of_memory() is invoked whenever
+ + * reclaim reports no progress.
+ + *
+ + * Always returns @count.
+ + *
+ + * NOTE(review): the zonelist is always taken from node 0, regardless of
+ + * which node @zone belongs to -- confirm this is intended.
+ + * NOTE(review): the reclaim loop has no exit other than the watermark
+ + * check succeeding.
+ + */
+ +static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
+ +{
+ +      enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ +      struct zonelist *zonelist = node_zonelist(0, gfp_mask);
+ +      int did_some_progress = 0;
+ +      int order = 1;
+ +
+ +      /*
+ +       * Increase the level of the watermarks to force kswapd to do its
+ +       * job and stabilise at the new watermark level.
+ +       */
+ +      __update_cma_watermarks(zone, count);
+ +
+ +      /* Obey watermarks as if the page was being allocated */
+ +      while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
+ +              wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
+ +
+ +              did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
+ +                                                    NULL);
+ +              if (!did_some_progress) {
+ +                      /* Exhausted what can be done so it's blamo time */
+ +                      out_of_memory(zonelist, gfp_mask, order, NULL);
+ +              }
+ +      }
+ +
+ +      /* Restore original watermark levels. */
+ +      __update_cma_watermarks(zone, -count);
+ +
+ +      return count;
+ +}
+ +
+ +/**
+ + * alloc_contig_range() -- tries to allocate given range of pages
+ + * @start:    start PFN to allocate
+ + * @end:      one-past-the-last PFN to allocate
+ + * @migratetype:      migratetype of the underlying pageblocks (either
+ + *                    #MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
+ + *                    in range must have the same migratetype and it must
+ + *                    be either of the two.
+ + *
+ + * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
+ + * aligned, however it's the caller's responsibility to guarantee that
+ + * we are the only thread that changes migrate type of pageblocks the
+ + * pages fall in.
+ + *
+ + * The PFN range must belong to a single zone.
+ + *
+ + * Returns zero on success or negative error code.  On success all
+ + * pages whose PFN is in [start, end) are allocated for the caller and
+ + * need to be freed with free_contig_range().
+ + */
+ +int alloc_contig_range(unsigned long start, unsigned long end,
+ +                     unsigned migratetype)
+ +{
+ +      struct zone *zone = page_zone(pfn_to_page(start));
+ +      unsigned long outer_start, outer_end;
+ +      int ret = 0, order;
+ +
+ +      /*
+ +       * What we do here is we mark all pageblocks in range as
+ +       * MIGRATE_ISOLATE.  Because pageblock and max order pages may
+ +       * have different sizes, and due to the way page allocator
+ +       * works, we align the range to biggest of the two pages so
+ +       * that page allocator won't try to merge buddies from
+ +       * different pageblocks and change MIGRATE_ISOLATE to some
+ +       * other migration type.
+ +       *
+ +       * Once the pageblocks are marked as MIGRATE_ISOLATE, we
+ +       * migrate the pages from an unaligned range (ie. pages that
+ +       * we are interested in).  This will put all the pages in
+ +       * range back to page allocator as MIGRATE_ISOLATE.
+ +       *
+ +       * When this is done, we take the pages in range from page
+ +       * allocator removing them from the buddy system.  This way
+ +       * page allocator will never consider using them.
+ +       *
+ +       * This lets us mark the pageblocks back as
+ +       * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
+ +       * aligned range but not in the unaligned, original range are
+ +       * put back to page allocator so that buddy can use them.
+ +       */
+ +
+ +      ret = start_isolate_page_range(pfn_max_align_down(start),
+ +                                     pfn_max_align_up(end), migratetype);
+ +      /*
+ +       * On failure start_isolate_page_range() has already undone its
+ +       * own partial isolation.  Do not fall through to the undo at
+ +       * "done": that would reset every MIGRATE_ISOLATE pageblock in
+ +       * the aligned range, including ones this call never isolated.
+ +       */
+ +      if (ret)
+ +              return ret;
+ +
+ +      ret = __alloc_contig_migrate_range(start, end);
+ +      if (ret)
+ +              goto done;
+ +
+ +      /*
+ +       * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+ +       * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
+ +       * more, all pages in [start, end) are free in page allocator.
+ +       * What we are going to do is to allocate all pages from
+ +       * [start, end) (that is remove them from page allocator).
+ +       *
+ +       * The only problem is that pages at the beginning and at the
+ +       * end of interesting range may be not aligned with pages that
+ +       * page allocator holds, ie. they can be part of higher order
+ +       * pages.  Because of this, we reserve the bigger range and
+ +       * once this is done free the pages we are not interested in.
+ +       *
+ +       * We don't have to hold zone->lock here because the pages are
+ +       * isolated thus they won't get removed from buddy.
+ +       */
+ +
+ +      lru_add_drain_all();
+ +      drain_all_pages();
+ +
+ +      /*
+ +       * Align outer_start down one order at a time until it points
+ +       * at a page for which PageBuddy() is true; give up with -EBUSY
+ +       * once MAX_ORDER is reached.
+ +       */
+ +      order = 0;
+ +      outer_start = start;
+ +      while (!PageBuddy(pfn_to_page(outer_start))) {
+ +              if (++order >= MAX_ORDER) {
+ +                      ret = -EBUSY;
+ +                      goto done;
+ +              }
+ +              outer_start &= ~0UL << order;
+ +      }
+ +
+ +      /* Make sure the range is really isolated. */
+ +      if (test_pages_isolated(outer_start, end)) {
+ +              pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
+ +                     outer_start, end);
+ +              ret = -EBUSY;
+ +              goto done;
+ +      }
+ +
+ +      /*
+ +       * Reclaim enough pages to make sure that contiguous allocation
+ +       * will not starve the system.
+ +       */
+ +      __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
+ +
+ +      /* Grab isolated pages from freelists. */
+ +      outer_end = isolate_freepages_range(outer_start, end);
+ +      if (!outer_end) {
+ +              ret = -EBUSY;
+ +              goto done;
+ +      }
+ +
+ +      /* Free head and tail (if any) */
+ +      if (start != outer_start)
+ +              free_contig_range(outer_start, start - outer_start);
+ +      if (end != outer_end)
+ +              free_contig_range(end, outer_end - end);
+ +
+ +done:
+ +      /* Reached on success too: the pageblocks get @migratetype back. */
+ +      undo_isolate_page_range(pfn_max_align_down(start),
+ +                              pfn_max_align_up(end), migratetype);
+ +      return ret;
+ +}
+ +
+ +/*
+ + * Hand @nr_pages consecutive pages, starting at page frame @pfn, to
+ + * __free_page() one at a time in ascending PFN order.
+ + */
+ +void free_contig_range(unsigned long pfn, unsigned nr_pages)
+ +{
+ +      unsigned i;
+ +
+ +      for (i = 0; i < nr_pages; i++)
+ +              __free_page(pfn_to_page(pfn + i));
+ +}
+ +#endif
+ +
   #ifdef CONFIG_MEMORY_HOTREMOVE
   /*
    * All pages in the range must be isolated before calling this.
@@@ -6198,6 -5784,10 +6205,10 @@@ __offline_isolated_pages(unsigned long 
                 zone->free_area[order].nr_free--;
                 __mod_zone_page_state(zone, NR_FREE_PAGES,
                                       - (1UL << order));
+ #ifdef CONFIG_HIGHMEM
+               if (PageHighMem(page))
+                       totalhigh_pages -= 1 << order;
+ #endif
                 for (i = 0; i < (1 << order); i++)
                         SetPageReserved((page+i));
                 pfn += (1 << order);
author	Grazvydas Ignotas <notasas@gmail.com>
	Sat, 26 Oct 2013 22:47:09 +0000 (01:47 +0300)
committer	Grazvydas Ignotas <notasas@gmail.com>
	Sat, 26 Oct 2013 22:47:09 +0000 (01:47 +0300)
		1	2
Documentation/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm/boot/compressed/head.S	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm/mm/flush.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm/mm/init.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm/mm/nommu.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kernel/sysfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/base/memory.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/debugfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exec.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/fat/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ubifs/dir.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/huge_memory.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/hugetlb.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/page_alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history